LLVM 23.0.0git
LoopInterchange.cpp
Go to the documentation of this file.
1//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This Pass handles loop interchange transform.
10// This pass interchanges loops to provide a more cache-friendly memory access
11// patterns.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/STLExtras.h"
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/ADT/StringMap.h"
21#include "llvm/ADT/StringRef.h"
30#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/Dominators.h"
33#include "llvm/IR/Function.h"
34#include "llvm/IR/IRBuilder.h"
35#include "llvm/IR/InstrTypes.h"
36#include "llvm/IR/Instruction.h"
38#include "llvm/IR/User.h"
39#include "llvm/IR/Value.h"
42#include "llvm/Support/Debug.h"
49#include <cassert>
50#include <utility>
51#include <vector>
52
53using namespace llvm;
54
55#define DEBUG_TYPE "loop-interchange"
56
57STATISTIC(LoopsInterchanged, "Number of loops interchanged");
58
60 "loop-interchange-threshold", cl::init(0), cl::Hidden,
61 cl::desc("Interchange if you gain more than this number"));
62
64 "loop-interchange-max-mem-instr-ratio", cl::init(4), cl::Hidden,
65 cl::desc("Maximum number of load/store instructions squared in relation to "
66 "the total number of instructions. Higher value may lead to more "
67 "interchanges at the cost of compile-time"));
68
69namespace {
70
72
73/// A list of direction vectors. Each entry represents a direction vector
74/// corresponding to one or more dependencies existing in the loop nest. The
75/// length of all direction vectors is equal and is N + 1, where N is the depth
76/// of the loop nest. The first N elements correspond to the dependency
77/// direction of each N loops. The last one indicates whether this entry is
78/// forward dependency ('<') or not ('*'). The term "forward" aligns with what
79/// is defined in LoopAccessAnalysis.
80// TODO: Check if we can use a sparse matrix here.
81using CharMatrix = std::vector<std::vector<char>>;
82
83/// Types of rules used in profitability check.
84enum class RuleTy {
85 PerLoopCacheAnalysis,
86 PerInstrOrderCost,
87 ForVectorization,
88 Ignore
89};
90
91} // end anonymous namespace
92
93// Minimum loop depth supported.
95 "loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden,
96 cl::desc("Minimum depth of loop nest considered for the transform"));
97
98// Maximum loop depth supported.
100 "loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden,
101 cl::desc("Maximum depth of loop nest considered for the transform"));
102
103// We prefer cache cost to vectorization by default.
105 "loop-interchange-profitabilities", cl::MiscFlags::CommaSeparated,
107 cl::desc("List of profitability heuristics to be used. They are applied in "
108 "the given order"),
109 cl::list_init<RuleTy>({RuleTy::PerInstrOrderCost,
110 RuleTy::ForVectorization}),
111 cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache",
112 "Prioritize loop cache cost"),
113 clEnumValN(RuleTy::PerInstrOrderCost, "instorder",
114 "Prioritize the IVs order of each instruction"),
115 clEnumValN(RuleTy::ForVectorization, "vectorize",
116 "Prioritize vectorization"),
117 clEnumValN(RuleTy::Ignore, "ignore",
118 "Ignore profitability, force interchange (does not "
119 "work with other options)")));
120
121// Support for the inner-loop reduction pattern.
123 "loop-interchange-reduction-to-mem", cl::init(false), cl::Hidden,
124 cl::desc("Support for the inner-loop reduction pattern."));
125
126#ifndef NDEBUG
129 for (RuleTy Rule : Rules) {
130 if (!Set.insert(Rule).second)
131 return false;
132 if (Rule == RuleTy::Ignore)
133 return false;
134 }
135 return true;
136}
137
138static void printDepMatrix(CharMatrix &DepMatrix) {
139 for (auto &Row : DepMatrix) {
140 // Drop the last element because it is a flag indicating whether this is
141 // forward dependency or not, which doesn't affect the legality check.
142 for (char D : drop_end(Row))
143 LLVM_DEBUG(dbgs() << D << " ");
144 LLVM_DEBUG(dbgs() << "\n");
145 }
146}
147
148/// Return true if \p Src appears before \p Dst in the same basic block.
149/// Precondition: \p Src and \Dst are distinct instructions within the same
150/// basic block.
151static bool inThisOrder(const Instruction *Src, const Instruction *Dst) {
152 assert(Src->getParent() == Dst->getParent() && Src != Dst &&
153 "Expected Src and Dst to be different instructions in the same BB");
154
155 bool FoundSrc = false;
156 for (const Instruction &I : *(Src->getParent())) {
157 if (&I == Src) {
158 FoundSrc = true;
159 continue;
160 }
161 if (&I == Dst)
162 return FoundSrc;
163 }
164
165 llvm_unreachable("Dst not found");
166}
167#endif
168
169static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
170 Loop *L, DependenceInfo *DI,
171 ScalarEvolution *SE,
174
175 ValueVector MemInstr;
176 unsigned NumInsts = 0;
177
178 // For each block.
179 for (BasicBlock *BB : L->blocks()) {
180 // Scan the BB and collect legal loads and stores.
181 for (Instruction &I : *BB) {
182 NumInsts++;
183 if (auto *Ld = dyn_cast<LoadInst>(&I)) {
184 if (!Ld->isSimple())
185 return false;
186 MemInstr.push_back(&I);
187 } else if (auto *St = dyn_cast<StoreInst>(&I)) {
188 if (!St->isSimple())
189 return false;
190 MemInstr.push_back(&I);
191 }
192 }
193 }
194
195 // To populate the dependence matrix, we perform dependence test for each pair
196 // of memory instructions, which has O(NumMemInstr^2) complexity. This implies
197 // that even if the number of memory instructions is small, the analysis can
198 // still be expensive if the most of the instructions in the loop are memory
199 // instructions. On the other hand, if the number of memory instructions is
200 // not small, but the loop is large (i.e., it contains many non-memory
201 // instructions), the analysis can still be affordable.
202 unsigned NumMemInstr = MemInstr.size();
203 LLVM_DEBUG(dbgs() << "Found " << NumMemInstr
204 << " Loads and Stores to analyze\n");
205 if (MaxMemInstrRatio * NumInsts < NumMemInstr * NumMemInstr) {
206 ORE->emit([&]() {
207 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop",
208 L->getStartLoc(), L->getHeader())
209 << "Number of loads/stores exceeded, the supported maximum can be "
210 "increased with option -loop-interchange-max-mem-instr-ratio.";
211 });
212 return false;
213 }
214 ValueVector::iterator I, IE, J, JE;
215
216 // Manage direction vectors that are already seen. Map each direction vector
217 // to an index of DepMatrix at which it is stored.
219
220 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
221 for (J = I, JE = MemInstr.end(); J != JE; ++J) {
222 std::vector<char> Dep;
225 // Ignore Input dependencies.
226 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
227 continue;
228 // Track Output, Flow, and Anti dependencies.
229 if (auto D = DI->depends(Src, Dst)) {
230 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
231 // If the direction vector is negative, normalize it to
232 // make it non-negative.
233 if (D->normalize(SE))
234 LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
235 LLVM_DEBUG(StringRef DepType =
236 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
237 dbgs() << "Found " << DepType
238 << " dependency between Src and Dst\n"
239 << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
240 unsigned Levels = D->getLevels();
241 char Direction;
242 for (unsigned II = 1; II <= Levels; ++II) {
243 // `DVEntry::LE` is converted to `*`. This is because `LE` means `<`
244 // or `=`, for which we don't have an equivalent representation, so
245 // that the conservative approximation is necessary. The same goes for
246 // `DVEntry::GE`.
247 // TODO: Use of fine-grained expressions allows for more accurate
248 // analysis.
249 unsigned Dir = D->getDirection(II);
250 if (Dir == Dependence::DVEntry::LT)
251 Direction = '<';
252 else if (Dir == Dependence::DVEntry::GT)
253 Direction = '>';
254 else if (Dir == Dependence::DVEntry::EQ)
255 Direction = '=';
256 else
257 Direction = '*';
258 Dep.push_back(Direction);
259 }
260
261 // If the Dependence object doesn't have any information, fill the
262 // dependency vector with '*'.
263 if (D->isConfused()) {
264 assert(Dep.empty() && "Expected empty dependency vector");
265 Dep.assign(Level, '*');
266 }
267
268 while (Dep.size() != Level) {
269 Dep.push_back('I');
270 }
271
272 // If all the elements of any direction vector have only '*', legality
273 // can't be proven. Exit early to save compile time.
274 if (all_of(Dep, equal_to('*'))) {
275 ORE->emit([&]() {
276 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
277 L->getStartLoc(), L->getHeader())
278 << "All loops have dependencies in all directions.";
279 });
280 return false;
281 }
282
283 // Test whether the dependency is forward or not.
284 bool IsKnownForward = true;
285 if (Src->getParent() != Dst->getParent()) {
286 // In general, when Src and Dst are in different BBs, the execution
287 // order of them within a single iteration is not guaranteed. Treat
288 // conservatively as not-forward dependency in this case.
289 IsKnownForward = false;
290 } else {
291 // Src and Dst are in the same BB. If they are the different
292 // instructions, Src should appear before Dst in the BB as they are
293 // stored to MemInstr in that order.
294 assert((Src == Dst || inThisOrder(Src, Dst)) &&
295 "Unexpected instructions");
296
297 // If the Dependence object is reversed (due to normalization), it
298 // represents the dependency from Dst to Src, meaning it is a backward
299 // dependency. Otherwise it should be a forward dependency.
300 bool IsReversed = D->getSrc() != Src;
301 if (IsReversed)
302 IsKnownForward = false;
303 }
304
305 // Initialize the last element. Assume forward dependencies only; it
306 // will be updated later if there is any non-forward dependency.
307 Dep.push_back('<');
308
309 // The last element should express the "summary" among one or more
310 // direction vectors whose first N elements are the same (where N is
311 // the depth of the loop nest). Hence we exclude the last element from
312 // the Seen map.
313 auto [Ite, Inserted] = Seen.try_emplace(
314 StringRef(Dep.data(), Dep.size() - 1), DepMatrix.size());
315
316 // Make sure we only add unique entries to the dependency matrix.
317 if (Inserted)
318 DepMatrix.push_back(Dep);
319
320 // If we cannot prove that this dependency is forward, change the last
321 // element of the corresponding entry. Since a `[... *]` dependency
322 // includes a `[... <]` dependency, we do not need to keep both and
323 // change the existing entry instead.
324 if (!IsKnownForward)
325 DepMatrix[Ite->second].back() = '*';
326 }
327 }
328 }
329
330 return true;
331}
332
333// A loop is moved from index 'from' to an index 'to'. Update the Dependence
334// matrix by exchanging the two columns.
335static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
336 unsigned ToIndx) {
337 for (auto &Row : DepMatrix)
338 std::swap(Row[ToIndx], Row[FromIndx]);
339}
340
341// Check if a direction vector is lexicographically positive. Return true if it
342// is positive, nullopt if it is "zero", otherwise false.
343// [Theorem] A permutation of the loops in a perfect nest is legal if and only
344// if the direction matrix, after the same permutation is applied to its
345// columns, has no ">" direction as the leftmost non-"=" direction in any row.
346static std::optional<bool>
347isLexicographicallyPositive(ArrayRef<char> DV, unsigned Begin, unsigned End) {
348 for (unsigned char Direction : DV.slice(Begin, End - Begin)) {
349 if (Direction == '<')
350 return true;
351 if (Direction == '>' || Direction == '*')
352 return false;
353 }
354 return std::nullopt;
355}
356
357// Checks if it is legal to interchange 2 loops.
358static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
359 unsigned InnerLoopId,
360 unsigned OuterLoopId) {
361 unsigned NumRows = DepMatrix.size();
362 std::vector<char> Cur;
363 // For each row check if it is valid to interchange.
364 for (unsigned Row = 0; Row < NumRows; ++Row) {
365 // Create temporary DepVector check its lexicographical order
366 // before and after swapping OuterLoop vs InnerLoop
367 Cur = DepMatrix[Row];
368
369 // If the surrounding loops already ensure that the direction vector is
370 // lexicographically positive, nothing within the loop will be able to break
371 // the dependence. In such a case we can skip the subsequent check.
372 if (isLexicographicallyPositive(Cur, 0, OuterLoopId) == true)
373 continue;
374
375 // Check if the direction vector is lexicographically positive (or zero)
376 // for both before/after exchanged. Ignore the last element because it
377 // doesn't affect the legality.
378 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
379 return false;
380 std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
381 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
382 return false;
383 }
384 return true;
385}
386
387static void populateWorklist(Loop &L, LoopVector &LoopList) {
388 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
389 << L.getHeader()->getParent()->getName() << " Loop: %"
390 << L.getHeader()->getName() << '\n');
391 assert(LoopList.empty() && "LoopList should initially be empty!");
392 Loop *CurrentLoop = &L;
393 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
394 while (!Vec->empty()) {
395 // The current loop has multiple subloops in it hence it is not tightly
396 // nested.
397 // Discard all loops above it added into Worklist.
398 if (Vec->size() != 1) {
399 LoopList = {};
400 return;
401 }
402
403 LoopList.push_back(CurrentLoop);
404 CurrentLoop = Vec->front();
405 Vec = &CurrentLoop->getSubLoops();
406 }
407 LoopList.push_back(CurrentLoop);
408}
409
412 unsigned LoopNestDepth = LoopList.size();
413 if (LoopNestDepth < MinLoopNestDepth || LoopNestDepth > MaxLoopNestDepth) {
414 LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth
415 << ", the supported range is [" << MinLoopNestDepth
416 << ", " << MaxLoopNestDepth << "].\n");
417 Loop *OuterLoop = LoopList.front();
418 ORE.emit([&]() {
419 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoopNestDepth",
420 OuterLoop->getStartLoc(),
421 OuterLoop->getHeader())
422 << "Unsupported depth of loop nest, the supported range is ["
423 << std::to_string(MinLoopNestDepth) << ", "
424 << std::to_string(MaxLoopNestDepth) << "].\n";
425 });
426 return false;
427 }
428 return true;
429}
430
432 ArrayRef<Loop *> LoopList) {
433 for (Loop *L : LoopList) {
434 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
435 if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
436 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
437 return false;
438 }
439 if (L->getNumBackEdges() != 1) {
440 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
441 return false;
442 }
443 if (!L->getExitingBlock()) {
444 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
445 return false;
446 }
447 }
448 return true;
449}
450
451namespace {
452
453/// LoopInterchangeLegality checks if it is legal to interchange the loop.
454class LoopInterchangeLegality {
455public:
456 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
457 OptimizationRemarkEmitter *ORE, DominatorTree *DT)
458 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), DT(DT), ORE(ORE) {}
459
460 /// Check if the loops can be interchanged.
461 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
462 CharMatrix &DepMatrix);
463
464 /// Check if the loop structure is understood. We do not handle triangular
465 /// loops for now.
466 bool isLoopStructureUnderstood();
467
468 bool currentLimitations();
469
470 const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
471 return OuterInnerReductions;
472 }
473
474 const ArrayRef<PHINode *> getInnerLoopInductions() const {
475 return InnerLoopInductions;
476 }
477
478 ArrayRef<Instruction *> getHasNoWrapReductions() const {
479 return HasNoWrapReductions;
480 }
481
482 ArrayRef<Instruction *> getHasNoInfInsts() const { return HasNoInfInsts; }
483
484 /// Record reductions in the inner loop. Currently supported reductions:
485 /// - initialized from a constant.
486 /// - reduction PHI node has only one user.
487 /// - located in the innermost loop.
488 struct InnerReduction {
489 /// The reduction itself.
490 PHINode *Reduction;
491 Value *Init;
492 Value *Next;
493 /// The Lcssa PHI.
494 PHINode *LcssaPhi;
495 /// Store reduction result into memory object.
496 StoreInst *LcssaStore;
497 /// The memory Location.
498 Value *MemRef;
499 Type *ElemTy;
500 };
501
502 ArrayRef<InnerReduction> getInnerReductions() const {
503 return InnerReductions;
504 }
505
506private:
507 bool tightlyNested(Loop *Outer, Loop *Inner);
508 bool containsUnsafeInstructions(BasicBlock *BB, Instruction *Skip);
509
510 /// Traverse all PHI nodes in the header of each loop in the loop nest
511 /// starting from \p OuterLoop, and perform the following checks:
512 ///
513 /// - Identify induction variables in the child loop of \p OuterLoop.
514 /// - Check for reductions across the inner loop and \p OuterLoop.
515 /// - Detect unsupported PHI nodes.
516 ///
517 /// Return false if any unsupported PHI node is found or if no induction
518 /// variable is found in the child loop of \p OuterLoop. Otherwise return
519 /// true.
520 bool checkInductionsAndReductions(Loop *OuterLoop);
521
522 /// Detect and record the reduction of the inner loop. Add them to
523 /// InnerReductions.
524 ///
525 /// innerloop:
526 /// Re = phi<0.0, Next>
527 /// Next = Re op ...
528 /// OuterLoopLatch:
529 /// Lcssa = phi<Next> ; lcssa phi
530 /// store Lcssa, MemRef ; LcssaStore
531 ///
532 bool isInnerReduction(Loop *L, PHINode *Phi,
533 SmallVectorImpl<Instruction *> &HasNoWrapInsts);
534
535 Loop *OuterLoop;
536 Loop *InnerLoop;
537
538 ScalarEvolution *SE;
539 DominatorTree *DT;
540
541 /// Interface to emit optimization remarks.
542 OptimizationRemarkEmitter *ORE;
543
544 /// Set of reduction PHIs taking part of a reduction across the inner and
545 /// outer loop.
546 SmallPtrSet<PHINode *, 4> OuterInnerReductions;
547
548 /// Set of inner loop induction PHIs
549 SmallVector<PHINode *, 8> InnerLoopInductions;
550
551 /// Hold instructions that have nuw/nsw flags and involved in reductions,
552 /// like integer addition/multiplication. Those flags must be dropped when
553 /// interchanging the loops.
554 SmallVector<Instruction *, 4> HasNoWrapReductions;
555
556 /// Hold instructions that have ninf flags and involved in reductions. Those
557 /// flags must be dropped when interchanging the loops.
558 SmallVector<Instruction *, 4> HasNoInfInsts;
559
560 /// Vector of reductions in the inner loop.
561 SmallVector<InnerReduction, 8> InnerReductions;
562};
563
564/// Manages information utilized by the profitability check for cache. The main
565/// purpose of this class is to delay the computation of CacheCost until it is
566/// actually needed.
567class CacheCostManager {
568 Loop *OutermostLoop;
569 LoopStandardAnalysisResults *AR;
570 DependenceInfo *DI;
571
572 /// CacheCost for \ref OutermostLoop. Once it is computed, it is cached. Note
573 /// that the result can be nullptr.
574 std::optional<std::unique_ptr<CacheCost>> CC;
575
576 /// Maps each loop to an index representing the optimal position within the
577 /// loop-nest, as determined by the cache cost analysis.
578 DenseMap<const Loop *, unsigned> CostMap;
579
580 void computeIfUnitinialized();
581
582public:
583 CacheCostManager(Loop *OutermostLoop, LoopStandardAnalysisResults *AR,
584 DependenceInfo *DI)
585 : OutermostLoop(OutermostLoop), AR(AR), DI(DI) {}
586 CacheCost *getCacheCost();
587 const DenseMap<const Loop *, unsigned> &getCostMap();
588};
589
590/// LoopInterchangeProfitability checks if it is profitable to interchange the
591/// loop.
592class LoopInterchangeProfitability {
593public:
594 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
595 OptimizationRemarkEmitter *ORE)
596 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
597
598 /// Check if the loop interchange is profitable.
599 bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
600 unsigned InnerLoopId, unsigned OuterLoopId,
601 CharMatrix &DepMatrix, CacheCostManager &CCM);
602
603private:
604 int getInstrOrderCost();
605 std::optional<bool> isProfitablePerLoopCacheAnalysis(
606 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC);
607 std::optional<bool> isProfitablePerInstrOrderCost();
608 std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
609 unsigned OuterLoopId,
610 CharMatrix &DepMatrix);
611 Loop *OuterLoop;
612 Loop *InnerLoop;
613
614 /// Scev analysis.
615 ScalarEvolution *SE;
616
617 /// Interface to emit optimization remarks.
618 OptimizationRemarkEmitter *ORE;
619};
620
621/// LoopInterchangeTransform interchanges the loop.
622class LoopInterchangeTransform {
623public:
624 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
625 LoopInfo *LI, DominatorTree *DT,
626 const LoopInterchangeLegality &LIL)
627 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
628
629 /// Interchange OuterLoop and InnerLoop.
630 bool transform(ArrayRef<Instruction *> DropNoWrapInsts,
631 ArrayRef<Instruction *> DropNoInfInsts);
632 void reduction2Memory();
633 void restructureLoops(Loop *NewInner, Loop *NewOuter,
634 BasicBlock *OrigInnerPreHeader,
635 BasicBlock *OrigOuterPreHeader);
636 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
637
638private:
639 bool adjustLoopLinks();
640 bool adjustLoopBranches();
641
642 Loop *OuterLoop;
643 Loop *InnerLoop;
644
645 /// Scev analysis.
646 ScalarEvolution *SE;
647
648 LoopInfo *LI;
649 DominatorTree *DT;
650
651 const LoopInterchangeLegality &LIL;
652};
653
654struct LoopInterchange {
655 ScalarEvolution *SE = nullptr;
656 LoopInfo *LI = nullptr;
657 DependenceInfo *DI = nullptr;
658 DominatorTree *DT = nullptr;
659 LoopStandardAnalysisResults *AR = nullptr;
660
661 /// Interface to emit optimization remarks.
662 OptimizationRemarkEmitter *ORE;
663
664 LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
665 DominatorTree *DT, LoopStandardAnalysisResults *AR,
666 OptimizationRemarkEmitter *ORE)
667 : SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE) {}
668
669 bool run(Loop *L) {
670 if (L->getParentLoop())
671 return false;
672 SmallVector<Loop *, 8> LoopList;
673 populateWorklist(*L, LoopList);
674 return processLoopList(LoopList);
675 }
676
677 bool run(LoopNest &LN) {
678 SmallVector<Loop *, 8> LoopList(LN.getLoops());
679 for (unsigned I = 1; I < LoopList.size(); ++I)
680 if (LoopList[I]->getParentLoop() != LoopList[I - 1])
681 return false;
682 return processLoopList(LoopList);
683 }
684
685 unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
686 // TODO: Add a better heuristic to select the loop to be interchanged based
687 // on the dependence matrix. Currently we select the innermost loop.
688 return LoopList.size() - 1;
689 }
690
691 bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
692 bool Changed = false;
693
694 // Ensure proper loop nest depth.
695 assert(hasSupportedLoopDepth(LoopList, *ORE) &&
696 "Unsupported depth of loop nest.");
697
698 unsigned LoopNestDepth = LoopList.size();
699
700 LLVM_DEBUG({
701 dbgs() << "Processing LoopList of size = " << LoopNestDepth
702 << " containing the following loops:\n";
703 for (auto *L : LoopList) {
704 dbgs() << " - ";
705 L->print(dbgs());
706 }
707 });
708
709 CharMatrix DependencyMatrix;
710 Loop *OuterMostLoop = *(LoopList.begin());
711 if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
712 OuterMostLoop, DI, SE, ORE)) {
713 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
714 return false;
715 }
716
717 LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n";
718 printDepMatrix(DependencyMatrix));
719
720 // Get the Outermost loop exit.
721 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
722 if (!LoopNestExit) {
723 LLVM_DEBUG(dbgs() << "OuterMostLoop '" << OuterMostLoop->getName()
724 << "' needs an unique exit block");
725 return false;
726 }
727
728 unsigned SelecLoopId = selectLoopForInterchange(LoopList);
729 CacheCostManager CCM(LoopList[0], AR, DI);
730 // We try to achieve the globally optimal memory access for the loopnest,
731 // and do interchange based on a bubble-sort fasion. We start from
732 // the innermost loop, move it outwards to the best possible position
733 // and repeat this process.
734 for (unsigned j = SelecLoopId; j > 0; j--) {
735 bool ChangedPerIter = false;
736 for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
737 bool Interchanged =
738 processLoop(LoopList, i, i - 1, DependencyMatrix, CCM);
739 ChangedPerIter |= Interchanged;
740 Changed |= Interchanged;
741 }
742 // Early abort if there was no interchange during an entire round of
743 // moving loops outwards.
744 if (!ChangedPerIter)
745 break;
746 }
747 return Changed;
748 }
749
750 bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
751 unsigned OuterLoopId,
752 std::vector<std::vector<char>> &DependencyMatrix,
753 CacheCostManager &CCM) {
754 Loop *OuterLoop = LoopList[OuterLoopId];
755 Loop *InnerLoop = LoopList[InnerLoopId];
756 LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
757 << " and OuterLoopId = " << OuterLoopId << "\n");
758 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE, DT);
759 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
760 LLVM_DEBUG(dbgs() << "Cannot prove legality, not interchanging loops '"
761 << OuterLoop->getName() << "' and '"
762 << InnerLoop->getName() << "'\n");
763 return false;
764 }
765 LLVM_DEBUG(dbgs() << "Loops '" << OuterLoop->getName() << "' and '"
766 << InnerLoop->getName()
767 << "' are legal to interchange\n");
768 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
769 if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
770 DependencyMatrix, CCM)) {
771 LLVM_DEBUG(dbgs() << "Interchanging loops '" << OuterLoop->getName()
772 << "' and '" << InnerLoop->getName()
773 << "' not profitable.\n");
774 return false;
775 }
776
777 ORE->emit([&]() {
778 return OptimizationRemark(DEBUG_TYPE, "Interchanged",
779 InnerLoop->getStartLoc(),
780 InnerLoop->getHeader())
781 << "Loop interchanged with enclosing loop.";
782 });
783
784 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
785 LIT.transform(LIL.getHasNoWrapReductions(), LIL.getHasNoInfInsts());
786 LLVM_DEBUG(dbgs() << "Loops interchanged: outer loop '"
787 << OuterLoop->getName() << "' and inner loop '"
788 << InnerLoop->getName() << "'\n");
789 LoopsInterchanged++;
790
791 llvm::formLCSSARecursively(*OuterLoop, *DT, LI, SE);
792
793 // Loops interchanged, update LoopList accordingly.
794 std::swap(LoopList[OuterLoopId], LoopList[InnerLoopId]);
795 // Update the DependencyMatrix
796 interChangeDependencies(DependencyMatrix, InnerLoopId, OuterLoopId);
797
798 LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
799 printDepMatrix(DependencyMatrix));
800
801 return true;
802 }
803};
804
805} // end anonymous namespace
806
807bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB,
808 Instruction *Skip) {
809 return any_of(*BB, [Skip](const Instruction &I) {
810 if (&I == Skip)
811 return false;
812 return I.mayHaveSideEffects() || I.mayReadFromMemory();
813 });
814}
815
816bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
817 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
818 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
819 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
820
821 LLVM_DEBUG(dbgs() << "Checking if loops '" << OuterLoop->getName()
822 << "' and '" << InnerLoop->getName()
823 << "' are tightly nested\n");
824
825 // In a perfectly nested loop the outer header branches only into the inner
826 // loop. If it can also reach the outer latch, it conditionally guards the
827 // inner loop (an imperfect nest), so the inner loop runs on only a subset of
828 // the outer iterations. Interchanging such a nest would run the inner loop on
829 // every outer iteration, including the guarded-off ones, which is illegal
830 // when the inner loop relies on the guard to terminate (e.g. an eq/ne exit
831 // whose trip count is degenerate once the guard is false). Reject by allowing
832 // the outer header to branch only into the inner loop.
833 //
834 // TODO: This is conservative. A guarded nest is still safe to interchange
835 // when the inner loop has a computable trip count that is empty exactly when
836 // the guard is false, e.g.:
837 // for (i = 0; i < N; i++)
838 // if (M > 0) // loop-invariant guard
839 // for (j = 0; j < M; j++) // empty when M <= 0
840 // A[j][i] = ...;
841 // Interchanging is legal here because the inner loop runs zero times on the
842 // guarded-off iterations.
843 for (BasicBlock *Succ : successors(OuterLoopHeader))
844 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader())
845 return false;
846
847 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
848
849 // The inner loop reduction pattern requires storing the LCSSA PHI in
850 // the OuterLoop Latch. Therefore, when reduction2Memory is enabled, skip
851 // that store during checks.
852 Instruction *Skip = nullptr;
853 assert(InnerReductions.size() <= 1 &&
854 "So far we only support at most one reduction.");
855 if (InnerReductions.size() == 1)
856 Skip = InnerReductions[0].LcssaStore;
857
858 // We do not have any basic block in between now make sure the outer header
859 // and outer loop latch doesn't contain any unsafe instructions.
860 if (containsUnsafeInstructions(OuterLoopHeader, Skip) ||
861 containsUnsafeInstructions(OuterLoopLatch, Skip))
862 return false;
863
864 // Also make sure the inner loop preheader does not contain any unsafe
865 // instructions. Note that all instructions in the preheader will be moved to
866 // the outer loop header when interchanging.
867 if (InnerLoopPreHeader != OuterLoopHeader &&
868 containsUnsafeInstructions(InnerLoopPreHeader, Skip))
869 return false;
870
871 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
872 // Ensure the inner loop exit block flows to the outer loop latch possibly
873 // through empty blocks.
874 const BasicBlock &SuccInner =
875 LoopNest::skipEmptyBlockUntil(InnerLoopExit, OuterLoopLatch);
876 if (&SuccInner != OuterLoopLatch) {
877 LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
878 << " does not lead to the outer loop latch.\n";);
879 return false;
880 }
881 // The inner loop exit block does flow to the outer loop latch and not some
882 // other BBs, now make sure it contains safe instructions, since it will be
883 // moved into the (new) inner loop after interchange.
884 if (containsUnsafeInstructions(InnerLoopExit, Skip))
885 return false;
886
887 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
888 // We have a perfect loop nest.
889 return true;
890}
891
892bool LoopInterchangeLegality::isLoopStructureUnderstood() {
893 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
894 for (PHINode *InnerInduction : InnerLoopInductions) {
895 unsigned Num = InnerInduction->getNumOperands();
896 for (unsigned i = 0; i < Num; ++i) {
897 Value *Val = InnerInduction->getOperand(i);
898 if (isa<Constant>(Val))
899 continue;
901 if (!I)
902 return false;
903 // TODO: Handle triangular loops.
904 // e.g. for(int i=0;i<N;i++)
905 // for(int j=i;j<N;j++)
906 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
907 if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
908 InnerLoopPreheader &&
909 !OuterLoop->isLoopInvariant(I)) {
910 return false;
911 }
912 }
913 }
914
915 // TODO: Handle triangular loops of another form.
916 // e.g. for(int i=0;i<N;i++)
917 // for(int j=0;j<i;j++)
918 // or,
919 // for(int i=0;i<N;i++)
920 // for(int j=0;j*i<N;j++)
921 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
922 CondBrInst *InnerLoopLatchBI =
923 dyn_cast<CondBrInst>(InnerLoopLatch->getTerminator());
924 if (!InnerLoopLatchBI)
925 return false;
926
927 CmpInst *InnerLoopCmp = dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition());
928 if (!InnerLoopCmp)
929 return false;
930
931 Value *Op0 = InnerLoopCmp->getOperand(0);
932 Value *Op1 = InnerLoopCmp->getOperand(1);
933
934 // LHS and RHS of the inner loop exit condition, e.g.,
935 // in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
936 Value *Left = nullptr;
937 Value *Right = nullptr;
938
939 // Check if V only involves inner loop induction variable.
940 // Return true if V is InnerInduction, or a cast from
941 // InnerInduction, or a binary operator that involves
942 // InnerInduction and a constant.
943 std::function<bool(Value *)> IsPathToInnerIndVar;
944 IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
945 if (llvm::is_contained(InnerLoopInductions, V))
946 return true;
947 if (isa<Constant>(V))
948 return true;
950 if (!I)
951 return false;
952 if (isa<CastInst>(I))
953 return IsPathToInnerIndVar(I->getOperand(0));
955 return IsPathToInnerIndVar(I->getOperand(0)) &&
956 IsPathToInnerIndVar(I->getOperand(1));
957 return false;
958 };
959
960 // In case of multiple inner loop indvars, it is okay if LHS and RHS
961 // are both inner indvar related variables.
962 if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
963 return true;
964
965 // Otherwise we check if the cmp instruction compares an inner indvar
966 // related variable (Left) with a outer loop invariant (Right).
967 if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {
968 Left = Op0;
969 Right = Op1;
970 } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {
971 Left = Op1;
972 Right = Op0;
973 }
974
975 if (Left == nullptr)
976 return false;
977
978 const SCEV *S = SE->getSCEV(Right);
979 if (!SE->isLoopInvariant(S, OuterLoop))
980 return false;
981
982 return true;
983}
984
985// If SV is a LCSSA PHI node with a single incoming value, return the incoming
986// value.
987static Value *followLCSSA(Value *SV) {
989 if (!PHI)
990 return SV;
991
992 if (PHI->getNumIncomingValues() != 1)
993 return SV;
994 return followLCSSA(PHI->getIncomingValue(0));
995}
996
998 SmallVectorImpl<Instruction *> &HasNoWrapInsts,
999 SmallVectorImpl<Instruction *> &HasNoInfInsts) {
1002 // Detect floating point reduction only when it can be reordered.
1003 if (RD.getExactFPMathInst() != nullptr)
1004 return false;
1005
1006 RecurKind RK = RD.getRecurrenceKind();
1007 switch (RK) {
1008 case RecurKind::Or:
1009 case RecurKind::And:
1010 case RecurKind::Xor:
1011 case RecurKind::SMin:
1012 case RecurKind::SMax:
1013 case RecurKind::UMin:
1014 case RecurKind::UMax:
1015 return true;
1016
1017 // Interchanging the loops that contain AnyOf reduction is not always legal.
1018 // Especially, when the result value of the AnyOf is not loop-invariant with
1019 // respect to the outer loop, interchanging may change the semantics. The
1020 // following is an example of such case:
1021 // int A = {{ 1, 0 }, { 0, 1 }};
1022 // int red = 0;
1023 // for (int i = 0; i < 2; i++)
1024 // for (int j = 0; j < 2; j++)
1025 // red = (A[j][i] == 0) ? i + 1 : red;
1026 //
1027 // TODO: We may be able to support interchanging loops with AnyOf reduction
1028 // by checking the operand of the reduction is loop-invariant with respect
1029 // to the outer loop as well.
1030 case RecurKind::AnyOf:
1031 return false;
1032
1033 // Changing the order of floating-point operations may alter the results. If
1034 // a certain instruction has the ninf flag, it means that reordering can
1035 // produce a poison value, which may lead to undefined behavior. To prevent
1036 // this, we must drop the ninf flags if we decide to apply the
1037 // transformation.
1038 case RecurKind::FAdd:
1039 case RecurKind::FMul:
1040 case RecurKind::FMin:
1041 case RecurKind::FMax:
1046 case RecurKind::FMulAdd:
1047 for (Instruction *I : RD.getReductionOpChain(PHI, L))
1048 if (isa<FPMathOperator>(I) && I->hasNoInfs())
1049 HasNoInfInsts.push_back(I);
1050 return true;
1051
1052 // Change the order of integer addition/multiplication may change the
1053 // semantics. Consider the following case:
1054 //
1055 // int A[2][2] = {{ INT_MAX, INT_MAX }, { INT_MIN, INT_MIN }};
1056 // int sum = 0;
1057 // for (int i = 0; i < 2; i++)
1058 // for (int j = 0; j < 2; j++)
1059 // sum += A[j][i];
1060 //
1061 // If the above loops are exchanged, the addition will cause an
1062 // overflow. To prevent this, we must drop the nuw/nsw flags from the
1063 // addition/multiplication instructions when we actually exchanges the
1064 // loops.
1065 case RecurKind::Add:
1066 case RecurKind::Mul: {
1067 unsigned OpCode = RecurrenceDescriptor::getOpcode(RK);
1069
1070 // Bail out when we fail to collect reduction instructions chain.
1071 if (Ops.empty())
1072 return false;
1073
1074 for (Instruction *I : Ops) {
1075 assert(I->getOpcode() == OpCode &&
1076 "Expected the instruction to be the reduction operation");
1077 (void)OpCode;
1078
1079 // If the instruction has nuw/nsw flags, we must drop them when the
1080 // transformation is actually performed.
1081 if (I->hasNoSignedWrap() || I->hasNoUnsignedWrap())
1082 HasNoWrapInsts.push_back(I);
1083 }
1084 return true;
1085 }
1086
1087 default:
1088 return false;
1089 }
1090 } else
1091 return false;
1092}
1093
1094// Check V's users to see if it is involved in a reduction in L.
1095static PHINode *
1097 SmallVectorImpl<Instruction *> &HasNoWrapInsts,
1098 SmallVectorImpl<Instruction *> &HasNoInfInsts) {
1099 // Reduction variables cannot be constants.
1100 if (isa<Constant>(V))
1101 return nullptr;
1102
1103 for (Value *User : V->users()) {
1105 if (PHI->getNumIncomingValues() == 1)
1106 continue;
1107
1108 if (checkReductionKind(L, PHI, HasNoWrapInsts, HasNoInfInsts))
1109 return PHI;
1110 else
1111 return nullptr;
1112 }
1113 }
1114
1115 return nullptr;
1116}
1117
1118bool LoopInterchangeLegality::isInnerReduction(
1119 Loop *L, PHINode *Phi, SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
1120
1121 // Only support reduction2Mem when the loop nest to be interchanged is
1122 // the innermost two loops.
1123 if (!L->isInnermost()) {
1124 LLVM_DEBUG(dbgs() << "Only supported when the loop is the innermost.\n");
1125 ORE->emit([&]() {
1126 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1127 L->getStartLoc(), L->getHeader())
1128 << "Only supported when the loop is the innermost.";
1129 });
1130 return false;
1131 }
1132
1133 if (Phi->getNumIncomingValues() != 2)
1134 return false;
1135
1136 Value *Init = Phi->getIncomingValueForBlock(L->getLoopPreheader());
1137 Value *Next = Phi->getIncomingValueForBlock(L->getLoopLatch());
1138
1139 // So far only supports constant initial value.
1140 if (!isa<Constant>(Init)) {
1141 LLVM_DEBUG(
1142 dbgs()
1143 << "Only supported for the reduction with a constant initial value.\n");
1144 ORE->emit([&]() {
1145 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1146 L->getStartLoc(), L->getHeader())
1147 << "Only supported for the reduction with a constant initial "
1148 "value.";
1149 });
1150 return false;
1151 }
1152
1153 // The reduction result must live in the inner loop.
1154 if (Instruction *I = dyn_cast<Instruction>(Next)) {
1155 BasicBlock *BB = I->getParent();
1156 if (!L->contains(BB))
1157 return false;
1158 }
1159
1160 // The reduction should have only one user.
1161 if (!Phi->hasOneUser())
1162 return false;
1163
1164 // Check the reduction kind.
1165 if (!checkReductionKind(L, Phi, HasNoWrapInsts, HasNoInfInsts))
1166 return false;
1167
1168 // Find lcssa_phi in OuterLoop's Latch
1169 BasicBlock *ExitBlock = L->getExitBlock();
1170 if (!ExitBlock)
1171 return false;
1172
1173 PHINode *Lcssa = NULL;
1174 for (auto *U : Next->users()) {
1175 if (auto *P = dyn_cast<PHINode>(U)) {
1176 if (P == Phi)
1177 continue;
1178
1179 if (Lcssa == NULL && P->getParent() == ExitBlock &&
1180 P->getIncomingValueForBlock(L->getLoopLatch()) == Next)
1181 Lcssa = P;
1182 else
1183 return false;
1184 } else
1185 return false;
1186 }
1187 if (!Lcssa)
1188 return false;
1189
1190 if (!Lcssa->hasOneUser()) {
1191 LLVM_DEBUG(dbgs() << "Only supported when the reduction is used once in "
1192 "the outer loop.\n");
1193 ORE->emit([&]() {
1194 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1195 L->getStartLoc(), L->getHeader())
1196 << "Only supported when the reduction is used once in the outer "
1197 "loop.";
1198 });
1199 return false;
1200 }
1201
1202 StoreInst *LcssaStore =
1204 if (!LcssaStore || LcssaStore->getParent() != ExitBlock)
1205 return false;
1206
1207 Value *MemRef = LcssaStore->getOperand(1);
1208 Type *ElemTy = LcssaStore->getOperand(0)->getType();
1209
1210 // LcssaStore stores the reduction result in BB.
1211 // When the reduction is initialized from a constant value, we need to load
1212 // from the memory object into the target basic block of the inner loop. This
1213 // means the memory reference was used prematurely. So we must ensure that the
1214 // memory reference does not dominate the target basic block.
1215 // TODO: Move the memory reference definition into the loop header.
1216 if (!DT->dominates(dyn_cast<Instruction>(MemRef), L->getHeader())) {
1217 LLVM_DEBUG(dbgs() << "Only supported when memory reference dominate "
1218 "the inner loop.\n");
1219 ORE->emit([&]() {
1220 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1221 L->getStartLoc(), L->getHeader())
1222 << "Only supported when memory reference dominate the inner "
1223 "loop.";
1224 });
1225 return false;
1226 }
1227
1228 // Found a reduction in the inner loop.
1229 InnerReduction SR;
1230 SR.Reduction = Phi;
1231 SR.Init = Init;
1232 SR.Next = Next;
1233 SR.LcssaPhi = Lcssa;
1234 SR.LcssaStore = LcssaStore;
1235 SR.MemRef = MemRef;
1236 SR.ElemTy = ElemTy;
1237
1238 InnerReductions.push_back(SR);
1239 return true;
1240}
1241
1242bool LoopInterchangeLegality::checkInductionsAndReductions(Loop *OuterLoop) {
1243 auto ChildLoop = [](Loop *L) {
1244 assert(L->getSubLoops().size() <= 1 &&
1245 "Expect at most one child loop for now.");
1246 return L->getSubLoops().empty() ? nullptr : L->getSubLoops().front();
1247 };
1248
1249 Loop *InnerLoop = ChildLoop(OuterLoop);
1250 for (Loop *CurLoop = OuterLoop; CurLoop; CurLoop = ChildLoop(CurLoop)) {
1251 for (PHINode &PHI : CurLoop->getHeader()->phis()) {
1252 InductionDescriptor ID;
1253 if (InductionDescriptor::isInductionPHI(&PHI, CurLoop, SE, ID)) {
1254 if (CurLoop == InnerLoop)
1255 InnerLoopInductions.push_back(&PHI);
1256 continue;
1257 }
1258
1259 if (CurLoop == OuterLoop) {
1260 // PHIs in inner loops need to be part of a reduction in the outer loop,
1261 assert(PHI.getNumIncomingValues() == 2 &&
1262 "Phis in loop header should have exactly 2 incoming values");
1263 // Check if we have a PHI node in the outer loop that has a reduction
1264 // result from the inner loop as an incoming value.
1265 Value *V = followLCSSA(
1266 PHI.getIncomingValueForBlock(OuterLoop->getLoopLatch()));
1267 PHINode *InnerRedPhi = findInnerReductionPhi(
1268 InnerLoop, V, HasNoWrapReductions, HasNoInfInsts);
1269 if (!InnerRedPhi ||
1270 !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
1271 LLVM_DEBUG(
1272 dbgs()
1273 << "Failed to recognize PHI as an induction or reduction.\n");
1274 ORE->emit([&]() {
1275 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
1276 OuterLoop->getStartLoc(),
1277 OuterLoop->getHeader())
1278 << "Only outer loops with induction or reduction PHI nodes "
1279 "can be interchanged currently.";
1280 });
1281 return false;
1282 }
1283 OuterInnerReductions.insert(&PHI);
1284 OuterInnerReductions.insert(InnerRedPhi);
1285 } else {
1286 if (OuterInnerReductions.count(&PHI)) {
1287 LLVM_DEBUG(dbgs() << "Found a reduction across the outer loop.\n");
1288 } else if (EnableReduction2Memory &&
1289 isInnerReduction(CurLoop, &PHI, HasNoWrapReductions)) {
1290 LLVM_DEBUG(dbgs() << "Found a reduction in the inner loop: \n"
1291 << PHI << '\n');
1292 } else {
1293 ORE->emit([&]() {
1294 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
1295 CurLoop->getStartLoc(),
1296 CurLoop->getHeader())
1297 << "Only inner loops with induction or reduction PHI nodes "
1298 "can be interchanged currently.";
1299 });
1300 return false;
1301 }
1302 }
1303 }
1304
1305 // For now we only support at most one reduction.
1306 if (InnerReductions.size() > 1) {
1307 LLVM_DEBUG(dbgs() << "Only supports at most one reduction.\n");
1308 ORE->emit([&]() {
1309 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1310 CurLoop->getStartLoc(),
1311 CurLoop->getHeader())
1312 << "Only supports at most one reduction.";
1313 });
1314 return false;
1315 }
1316 }
1317
1318 return !InnerLoopInductions.empty();
1319}
1320
1321// This function indicates the current limitations in the transform as a result
1322// of which we do not proceed.
1323bool LoopInterchangeLegality::currentLimitations() {
1324 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1325
1326 // transform currently expects the loop latches to also be the exiting
1327 // blocks.
1328 if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
1329 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
1330 !isa<CondBrInst>(InnerLoopLatch->getTerminator()) ||
1331 !isa<CondBrInst>(OuterLoop->getLoopLatch()->getTerminator())) {
1332 LLVM_DEBUG(
1333 dbgs() << "Loops where the latch is not the exiting block are not"
1334 << " supported currently.\n");
1335 ORE->emit([&]() {
1336 return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
1337 OuterLoop->getStartLoc(),
1338 OuterLoop->getHeader())
1339 << "Loops where the latch is not the exiting block cannot be"
1340 " interchange currently.";
1341 });
1342 return true;
1343 }
1344
1345 // TODO: Triangular loops are not handled for now.
1346 if (!isLoopStructureUnderstood()) {
1347 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
1348 ORE->emit([&]() {
1349 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
1350 InnerLoop->getStartLoc(),
1351 InnerLoop->getHeader())
1352 << "Inner loop structure not understood currently.";
1353 });
1354 return true;
1355 }
1356
1357 return false;
1358}
1359
1360/// We currently only support LCSSA PHI nodes in the inner loop exit if their
1361/// users are either of the following:
1362///
1363/// - Reduction PHIs
1364/// - PHIs outside the outer loop
1365/// - PHIs belonging to the latch of the outer loop
1366///
1367/// These conditions mean that we are only interested in the final value after
1368/// the inner loop.
1369static bool
1372 PHINode *LcssaReduction) {
1373 BasicBlock *InnerExit = InnerL->getUniqueExitBlock();
1374 for (PHINode &PHI : InnerExit->phis()) {
1375 // The reduction LCSSA PHI will have only one incoming block, which comes
1376 // from the loop latch.
1377 if (PHI.getNumIncomingValues() > 1)
1378 return false;
1379 // The reduction LCSSA PHI's store user is rewritten by reduction2Memory();
1380 // skip its user-check but keep validating the remaining LCSSA PHIs.
1381 if (&PHI == LcssaReduction)
1382 continue;
1383 if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
1384 PHINode *PN = dyn_cast<PHINode>(U);
1385 if (!PN)
1386 return true;
1387 if (Reductions.count(PN))
1388 return false;
1389 BasicBlock *PB = PN->getParent();
1390 if (!OuterL->contains(PB))
1391 return false;
1392 return PB != OuterL->getLoopLatch();
1393 }))
1394 return false;
1395 }
1396 return true;
1397}
1398
1399// We currently support LCSSA PHI nodes in the outer loop exit, if their
1400// incoming values do not come from the outer loop latch or if the
1401// outer loop latch has a single predecessor. In that case, the value will
1402// be available if both the inner and outer loop conditions are true, which
1403// will still be true after interchanging. If we have multiple predecessor,
1404// that may not be the case, e.g. because the outer loop latch may be executed
1405// if the inner loop is not executed.
1406static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1407 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
1408 for (PHINode &PHI : LoopNestExit->phis()) {
1409 for (Value *Incoming : PHI.incoming_values()) {
1410 Instruction *IncomingI = dyn_cast<Instruction>(Incoming);
1411 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
1412 continue;
1413
1414 // The incoming value is defined in the outer loop latch. Currently we
1415 // only support that in case the outer loop latch has a single predecessor.
1416 // This guarantees that the outer loop latch is executed if and only if
1417 // the inner loop is executed (because tightlyNested() guarantees that the
1418 // outer loop header only branches to the inner loop or the outer loop
1419 // latch).
1420 // FIXME: We could weaken this logic and allow multiple predecessors,
1421 // if the values are produced outside the loop latch. We would need
1422 // additional logic to update the PHI nodes in the exit block as
1423 // well.
1424 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
1425 return false;
1426 }
1427 }
1428 return true;
1429}
1430
1431// In case of multi-level nested loops, it may occur that lcssa phis exist in
1432// the latch of InnerLoop, i.e., when defs of the incoming values are further
1433// inside the loopnest. Sometimes those incoming values are not available
1434// after interchange, since the original inner latch will become the new outer
1435// latch which may have predecessor paths that do not include those incoming
1436// values.
1437// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
1438// multi-level loop nests.
1439static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1440 if (InnerLoop->getSubLoops().empty())
1441 return true;
1442 // If the original outer latch has only one predecessor, then values defined
1443 // further inside the looploop, e.g., in the innermost loop, will be available
1444 // at the new outer latch after interchange.
1445 if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)
1446 return true;
1447
1448 // The outer latch has more than one predecessors, i.e., the inner
1449 // exit and the inner header.
1450 // PHI nodes in the inner latch are lcssa phis where the incoming values
1451 // are defined further inside the loopnest. Check if those phis are used
1452 // in the original inner latch. If that is the case then bail out since
1453 // those incoming values may not be available at the new outer latch.
1454 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1455 for (PHINode &PHI : InnerLoopLatch->phis()) {
1456 for (auto *U : PHI.users()) {
1458 if (InnerLoopLatch == UI->getParent())
1459 return false;
1460 }
1461 }
1462 return true;
1463}
1464
1465bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
1466 unsigned OuterLoopId,
1467 CharMatrix &DepMatrix) {
1468 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
1469 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
1470 << " and OuterLoopId = " << OuterLoopId
1471 << " due to dependence\n");
1472 ORE->emit([&]() {
1473 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
1474 InnerLoop->getStartLoc(),
1475 InnerLoop->getHeader())
1476 << "Cannot interchange loops due to dependences.";
1477 });
1478 return false;
1479 }
1480 // Check if outer and inner loop contain legal instructions only.
1481 for (auto *BB : OuterLoop->blocks())
1482 for (Instruction &I : *BB) {
1483 // Loads and stores are checked separately, so we can skip them here.
1485 continue;
1486
1487 // We cannot ignore potential memory reads, e.g., loads inside the called
1488 // function.
1489 if (!I.mayHaveSideEffects() && !I.mayReadFromMemory())
1490 continue;
1491
1492 LLVM_DEBUG(
1493 dbgs()
1494 << "Loops contain instructions that cannot be safely interchanged\n");
1495 ORE->emit([&]() {
1496 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsafeInst",
1497 I.getDebugLoc(), I.getParent())
1498 << "Cannot interchange loops due to instruction that is "
1499 "potentially unsafe to interchange.";
1500 });
1501
1502 return false;
1503 }
1504
1505 if (!checkInductionsAndReductions(OuterLoop)) {
1506 LLVM_DEBUG(dbgs() << "Failed to find inner loop inductions or found "
1507 "unsupported reductions.\n");
1508 return false;
1509 }
1510
1511 if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
1512 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
1513 ORE->emit([&]() {
1514 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI",
1515 InnerLoop->getStartLoc(),
1516 InnerLoop->getHeader())
1517 << "Cannot interchange loops because unsupported PHI nodes found "
1518 "in inner loop latch.";
1519 });
1520 return false;
1521 }
1522
1523 // TODO: The loops could not be interchanged due to current limitations in the
1524 // transform module.
1525 if (currentLimitations()) {
1526 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1527 return false;
1528 }
1529
1530 // Check if the loops are tightly nested.
1531 if (!tightlyNested(OuterLoop, InnerLoop)) {
1532 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1533 ORE->emit([&]() {
1534 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1535 InnerLoop->getStartLoc(),
1536 InnerLoop->getHeader())
1537 << "Cannot interchange loops because they are not tightly "
1538 "nested.";
1539 });
1540 return false;
1541 }
1542
1543 // The LCSSA PHI for the reduction has passed checks before; its user
1544 // is a store instruction.
1545 PHINode *LcssaReduction = nullptr;
1546 assert(InnerReductions.size() <= 1 &&
1547 "So far we only support at most one reduction.");
1548 if (InnerReductions.size() == 1)
1549 LcssaReduction = InnerReductions[0].LcssaPhi;
1550
1551 if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop, OuterInnerReductions,
1552 LcssaReduction)) {
1553 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1554 ORE->emit([&]() {
1555 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1556 InnerLoop->getStartLoc(),
1557 InnerLoop->getHeader())
1558 << "Found unsupported PHI node in loop exit.";
1559 });
1560 return false;
1561 }
1562
1563 if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1564 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1565 ORE->emit([&]() {
1566 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1567 OuterLoop->getStartLoc(),
1568 OuterLoop->getHeader())
1569 << "Found unsupported PHI node in loop exit.";
1570 });
1571 return false;
1572 }
1573
1574 if (any_of(OuterLoop->getLoopLatch()->phis(),
1575 [](PHINode &PHI) { return PHI.getNumIncomingValues() != 1; })) {
1576 LLVM_DEBUG(dbgs() << "Only outer loop latch PHI nodes with one incoming "
1577 "value are supported.\n");
1578 ORE->emit([&]() {
1579 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLatchPHI",
1580 OuterLoop->getStartLoc(),
1581 OuterLoop->getHeader())
1582 << "Only outer loop latch PHI nodes with one incoming value are "
1583 "supported.";
1584 });
1585 return false;
1586 }
1587
1588 // Regarding def-use chains that begin at an LCSSA PHI in the inner loop exit
1589 // and end at any instruction in the outer loop latch, we currently support
1590 // only the case where the chain contains only PHI nodes. Since we already
1591 // call `tightlyNested()`, we know that if there is a def-use chain that we
1592 // don't support (i.e., a chain that contains a non-PHI user), then the
1593 // non-PHI user must be in the outer loop latch.
1594 if (InnerLoop->getExitBlock() != OuterLoop->getLoopLatch())
1595 for (PHINode &PHI : OuterLoop->getLoopLatch()->phis())
1596 if (any_of(PHI.users(), [](const User *U) { return !isa<PHINode>(U); })) {
1597 LLVM_DEBUG(dbgs() << "Outer loop latch PHI has a non-PHI user.\n");
1598 ORE->emit([&]() {
1599 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLatchPHI",
1600 OuterLoop->getStartLoc(),
1601 OuterLoop->getHeader())
1602 << "Cannot interchange loops because an outer loop latch PHI "
1603 "node has a non-PHI user.";
1604 });
1605 return false;
1606 }
1607
1608 return true;
1609}
1610
1611void CacheCostManager::computeIfUnitinialized() {
1612 if (CC.has_value())
1613 return;
1614
1615 LLVM_DEBUG(dbgs() << "Compute CacheCost.\n");
1616 CC = CacheCost::getCacheCost(*OutermostLoop, *AR, *DI);
1617 // Obtain the loop vector returned from loop cache analysis beforehand,
1618 // and put each <Loop, index> pair into a map for constant time query
1619 // later. Indices in loop vector reprsent the optimal order of the
1620 // corresponding loop, e.g., given a loopnest with depth N, index 0
1621 // indicates the loop should be placed as the outermost loop and index N
1622 // indicates the loop should be placed as the innermost loop.
1623 //
1624 // For the old pass manager CacheCost would be null.
1625 if (*CC != nullptr)
1626 for (const auto &[Idx, Cost] : enumerate((*CC)->getLoopCosts()))
1627 CostMap[Cost.first] = Idx;
1628}
1629
1630CacheCost *CacheCostManager::getCacheCost() {
1631 computeIfUnitinialized();
1632 return CC->get();
1633}
1634
1635const DenseMap<const Loop *, unsigned> &CacheCostManager::getCostMap() {
1636 computeIfUnitinialized();
1637 return CostMap;
1638}
1639
1640/// If \S contains an affine addrec for \p L, return the step recurrence of it.
1641/// If \S is loop invariant with respect to \p L, return nullptr. Otherwise,
1642/// return std::nullopt, which indicates we cannot determine the coefficient of
1643/// the addrec for \p L in \S.
1644/// TODO: Handle more complex cases. Maybe using SCEVTraversal is a good way to
1645/// do that.
1646std::optional<const SCEV *> getAddRecCoefficient(ScalarEvolution &SE,
1647 const SCEV *S, const Loop *L) {
1649 if (!AR) {
1650 if (SE.isLoopInvariant(S, L))
1651 return nullptr;
1652 return std::nullopt;
1653 }
1654
1655 if (!AR->isAffine()) {
1656 LLVM_DEBUG(dbgs() << "Unexpected non-affine addrec\n");
1657 return std::nullopt;
1658 }
1659
1660 std::optional<const SCEV *> Coeff =
1661 getAddRecCoefficient(SE, AR->getStart(), L);
1662 if (!Coeff.has_value())
1663 return std::nullopt;
1664
1665 if (AR->getLoop() == L) {
1666 assert(!*Coeff && "Found more than one addrec for the same loop");
1667 Coeff = AR->getStepRecurrence(SE);
1668 }
1669 return Coeff;
1670}
1671
1672int LoopInterchangeProfitability::getInstrOrderCost() {
1673 SmallPtrSet<const SCEV *, 4> GoodBasePtrs, BadBasePtrs;
1674 for (BasicBlock *BB : InnerLoop->blocks()) {
1675 for (Instruction &Ins : *BB) {
1676 if (!isa<LoadInst, StoreInst>(&Ins))
1677 continue;
1678 const SCEV *Access = SE->getSCEV(getLoadStorePointerOperand(&Ins));
1679 const SCEV *BasePtr = SE->getPointerBase(Access);
1680 std::optional<const SCEV *> OuterCoeff =
1681 getAddRecCoefficient(*SE, Access, OuterLoop);
1682 std::optional<const SCEV *> InnerCoeff =
1683 getAddRecCoefficient(*SE, Access, InnerLoop);
1684
1685 if (!OuterCoeff.has_value() || !*OuterCoeff || !InnerCoeff.has_value() ||
1686 !*InnerCoeff)
1687 continue;
1688
1689 // This heuristic assumes that a smaller step recurrence implies that the
1690 // induction variable corresponding to the loop is used in the inner
1691 // dimension of the array. Placing such a loop in the inner position would
1692 // be beneficial in terms of locality. If the array access is of the form
1693 // like `A[3*i + 2*j]`, this heuristic may lead to an unprofitable
1694 // interchange, but we expect such cases to be rare.
1695 const SCEV *OuterStep = SE->getAbsExpr(*OuterCoeff, /*IsNSW=*/false);
1696 const SCEV *InnerStep = SE->getAbsExpr(*InnerCoeff, /*IsNSW=*/false);
1697 // If we find the inner induction after an outer induction e.g.
1698 //
1699 // for(int i=0;i<N;i++)
1700 // for(int j=0;j<N;j++)
1701 // A[i][j] = A[i-1][j-1]+k;
1702 //
1703 //
1704 // then it is a good order. If we find the outer induction after an inner
1705 // induction e.g.
1706 //
1707 // for(int i=0;i<N;i++)
1708 // for(int j=0;j<N;j++)
1709 // A[j][i] = A[j-1][i-1]+k;
1710 //
1711 // then it is a bad order.
1712 //
1713 // To avoid counting the same base pointers multiple times, we deduplicate
1714 // them by using a set of base pointers.
1715 if (SE->isKnownPredicate(ICmpInst::ICMP_SLT, InnerStep, OuterStep))
1716 GoodBasePtrs.insert(BasePtr);
1717 else if (SE->isKnownPredicate(ICmpInst::ICMP_SLT, OuterStep, InnerStep))
1718 BadBasePtrs.insert(BasePtr);
1719 }
1720 }
1721
1722 int GoodOrder = GoodBasePtrs.size();
1723 int BadOrder = BadBasePtrs.size();
1724 return GoodOrder - BadOrder;
1725}
1726
1727std::optional<bool>
1728LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1729 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC) {
1730 // This is the new cost model returned from loop cache analysis.
1731 // A smaller index means the loop should be placed an outer loop, and vice
1732 // versa.
1733 auto InnerLoopIt = CostMap.find(InnerLoop);
1734 if (InnerLoopIt == CostMap.end())
1735 return std::nullopt;
1736 auto OuterLoopIt = CostMap.find(OuterLoop);
1737 if (OuterLoopIt == CostMap.end())
1738 return std::nullopt;
1739
1740 if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))
1741 return std::nullopt;
1742 unsigned InnerIndex = InnerLoopIt->second;
1743 unsigned OuterIndex = OuterLoopIt->second;
1744 LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1745 << ", OuterIndex = " << OuterIndex << "\n");
1746 assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1747 "numbers to each loop");
1748 return std::optional<bool>(InnerIndex < OuterIndex);
1749}
1750
1751std::optional<bool>
1752LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
1753 // Legacy cost model: this is rough cost estimation algorithm. It counts the
1754 // good and bad order of induction variables in the instruction and allows
1755 // reordering if number of bad orders is more than good.
1756 int Cost = getInstrOrderCost();
1757 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1759 return std::optional<bool>(true);
1760
1761 return std::nullopt;
1762}
1763
1764/// Return true if we can vectorize the loop specified by \p LoopId.
1765static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
1766 for (const auto &Dep : DepMatrix) {
1767 char Dir = Dep[LoopId];
1768 char DepType = Dep.back();
1769 assert((DepType == '<' || DepType == '*') &&
1770 "Unexpected element in dependency vector");
1771
1772 // There are no loop-carried dependencies.
1773 if (Dir == '=' || Dir == 'I')
1774 continue;
1775
1776 // DepType being '<' means that this direction vector represents a forward
1777 // dependency. In principle, a loop with '<' direction can be vectorized in
1778 // this case.
1779 if (Dir == '<' && DepType == '<')
1780 continue;
1781
1782 // We cannot prove that the loop is vectorizable.
1783 return false;
1784 }
1785 return true;
1786}
1787
1788std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1789 unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1790 // If the outer loop cannot be vectorized, it is not profitable to move this
1791 // to inner position.
1792 if (!canVectorize(DepMatrix, OuterLoopId))
1793 return false;
1794
1795 // If the inner loop cannot be vectorized but the outer loop can be, then it
1796 // is profitable to interchange to enable inner loop parallelism.
1797 if (!canVectorize(DepMatrix, InnerLoopId))
1798 return true;
1799
1800 // If both the inner and the outer loop can be vectorized, it is necessary to
1801 // check the cost of each vectorized loop for profitability decision. At this
1802 // time we do not have a cost model to estimate them, so return nullopt.
1803 // TODO: Estimate the cost of vectorized loop when both the outer and the
1804 // inner loop can be vectorized.
1805 return std::nullopt;
1806}
1807
1808bool LoopInterchangeProfitability::isProfitable(
1809 const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1810 unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM) {
1811 // Do not consider loops with a backedge that isn't taken, e.g. an
1812 // unconditional branch true/false, as candidates for interchange.
1813 // TODO: when interchange is forced, we should probably also allow
1814 // interchange for these loops, and thus this logic should be moved just
1815 // below the cost-model ignore check below. But this check is done first
1816 // to avoid the issue in #163954.
1817 const SCEV *InnerBTC = SE->getBackedgeTakenCount(InnerLoop);
1818 const SCEV *OuterBTC = SE->getBackedgeTakenCount(OuterLoop);
1819 if (InnerBTC && InnerBTC->isZero()) {
1820 LLVM_DEBUG(dbgs() << "Inner loop back-edge isn't taken, rejecting "
1821 "single iteration loop\n");
1822 return false;
1823 }
1824 if (OuterBTC && OuterBTC->isZero()) {
1825 LLVM_DEBUG(dbgs() << "Outer loop back-edge isn't taken, rejecting "
1826 "single iteration loop\n");
1827 return false;
1828 }
1829
1830 // Return true if interchange is forced and the cost-model ignored.
1831 if (Profitabilities.size() == 1 && Profitabilities[0] == RuleTy::Ignore)
1832 return true;
1834 "Duplicate rules and option 'ignore' are not allowed");
1835
1836 // isProfitable() is structured to avoid endless loop interchange. If the
1837 // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
1838 // decide the profitability then, profitability check will stop and return the
1839 // analysis result. If it failed to determine it (e.g., cache analysis failed
1840 // to analyze the loopnest due to delinearization issues) then go ahead the
1841 // second highest priority rule (isProfitablePerInstrOrderCost by default).
1842 // Likewise, if it failed to analysis the profitability then only, the last
1843 // rule (isProfitableForVectorization by default) will decide.
1844 std::optional<bool> shouldInterchange;
1845 for (RuleTy RT : Profitabilities) {
1846 switch (RT) {
1847 case RuleTy::PerLoopCacheAnalysis: {
1848 CacheCost *CC = CCM.getCacheCost();
1849 const DenseMap<const Loop *, unsigned> &CostMap = CCM.getCostMap();
1850 shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
1851 break;
1852 }
1853 case RuleTy::PerInstrOrderCost:
1854 shouldInterchange = isProfitablePerInstrOrderCost();
1855 break;
1856 case RuleTy::ForVectorization:
1857 shouldInterchange =
1858 isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1859 break;
1860 case RuleTy::Ignore:
1861 llvm_unreachable("Option 'ignore' is not supported with other options");
1862 break;
1863 }
1864
1865 // If this rule could determine the profitability, don't call subsequent
1866 // rules.
1867 if (shouldInterchange.has_value())
1868 break;
1869 }
1870
1871 if (!shouldInterchange.has_value()) {
1872 ORE->emit([&]() {
1873 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1874 InnerLoop->getStartLoc(),
1875 InnerLoop->getHeader())
1876 << "Insufficient information to calculate the cost of loop for "
1877 "interchange.";
1878 });
1879 return false;
1880 } else if (!shouldInterchange.value()) {
1881 ORE->emit([&]() {
1882 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1883 InnerLoop->getStartLoc(),
1884 InnerLoop->getHeader())
1885 << "Interchanging loops is not considered to improve cache "
1886 "locality nor vectorization.";
1887 });
1888 return false;
1889 }
1890 return true;
1891}
1892
1893void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1894 Loop *InnerLoop) {
1895 for (Loop *L : *OuterLoop)
1896 if (L == InnerLoop) {
1897 OuterLoop->removeChildLoop(L);
1898 return;
1899 }
1900 llvm_unreachable("Couldn't find loop");
1901}
1902
1903/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1904/// new inner and outer loop after interchanging: NewInner is the original
1905/// outer loop and NewOuter is the original inner loop.
1906///
1907/// Before interchanging, we have the following structure
1908/// Outer preheader
1909// Outer header
1910// Inner preheader
1911// Inner header
1912// Inner body
1913// Inner latch
1914// outer bbs
1915// Outer latch
1916//
1917// After interchanging:
1918// Inner preheader
1919// Inner header
1920// Outer preheader
1921// Outer header
1922// Inner body
1923// outer bbs
1924// Outer latch
1925// Inner latch
1926void LoopInterchangeTransform::restructureLoops(
1927 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1928 BasicBlock *OrigOuterPreHeader) {
1929 Loop *OuterLoopParent = OuterLoop->getParentLoop();
1930 // The original inner loop preheader moves from the new inner loop to
1931 // the parent loop, if there is one.
1932 NewInner->removeBlockFromLoop(OrigInnerPreHeader);
1933 LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
1934
1935 // Switch the loop levels.
1936 if (OuterLoopParent) {
1937 // Remove the loop from its parent loop.
1938 removeChildLoop(OuterLoopParent, NewInner);
1939 removeChildLoop(NewInner, NewOuter);
1940 OuterLoopParent->addChildLoop(NewOuter);
1941 } else {
1942 removeChildLoop(NewInner, NewOuter);
1943 LI->changeTopLevelLoop(NewInner, NewOuter);
1944 }
1945 while (!NewOuter->isInnermost())
1946 NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
1947 NewOuter->addChildLoop(NewInner);
1948
1949 // BBs from the original inner loop.
1950 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
1951
1952 // Add BBs from the original outer loop to the original inner loop (excluding
1953 // BBs already in inner loop)
1954 for (BasicBlock *BB : NewInner->blocks())
1955 if (LI->getLoopFor(BB) == NewInner)
1956 NewOuter->addBlockEntry(BB);
1957
1958 // Now remove inner loop header and latch from the new inner loop and move
1959 // other BBs (the loop body) to the new inner loop.
1960 BasicBlock *OuterHeader = NewOuter->getHeader();
1961 BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1962 for (BasicBlock *BB : OrigInnerBBs) {
1963 // Nothing will change for BBs in child loops.
1964 if (LI->getLoopFor(BB) != NewOuter)
1965 continue;
1966 // Remove the new outer loop header and latch from the new inner loop.
1967 if (BB == OuterHeader || BB == OuterLatch)
1968 NewInner->removeBlockFromLoop(BB);
1969 else
1970 LI->changeLoopFor(BB, NewInner);
1971 }
1972
1973 // The preheader of the original outer loop becomes part of the new
1974 // outer loop.
1975 NewOuter->addBlockEntry(OrigOuterPreHeader);
1976 LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
1977
1978 // Tell SE that we move the loops around.
1979 SE->forgetLoop(NewOuter);
1980}
1981
1982/// User can write, or optimizers can generate the reduction for inner loop.
1983/// To make the interchange valid, apply Reduction2Mem by moving the
1984/// initializer and store instructions into the inner loop. So far we only
1985/// handle cases where the reduction variable is initialized to a constant.
1986/// For example, below code:
1987///
1988/// loop:
1989/// re = phi<0.0, next>
1990/// next = re op ...
1991/// endloop
1992/// reduc_sum = phi<next> // lcssa phi
1993/// MEM_REF[idx] = reduc_sum // LcssaStore
1994///
1995/// is transformed into:
1996///
1997/// loop:
1998/// tmp = MEM_REF[idx];
1999/// new_var = !first_iteration ? tmp : 0.0;
2000/// next = new_var op ...
2001/// MEM_REF[idx] = next; // after moving
2002/// endloop
2003///
2004/// In this way the initial const is used in the first iteration of loop.
2005void LoopInterchangeTransform::reduction2Memory() {
2007 LIL.getInnerReductions();
2008
2009 assert(InnerReductions.size() == 1 &&
2010 "So far we only support at most one reduction.");
2011
2012 LoopInterchangeLegality::InnerReduction SR = InnerReductions[0];
2013 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2014 IRBuilder<> Builder(&*(InnerLoopHeader->getFirstNonPHIIt()));
2015
2016 // Check if it's the first iteration.
2017 LLVMContext &Context = InnerLoopHeader->getContext();
2018 PHINode *FirstIter =
2019 Builder.CreatePHI(Type::getInt1Ty(Context), 2, "first.iter");
2020 FirstIter->addIncoming(ConstantInt::get(Type::getInt1Ty(Context), 1),
2021 InnerLoop->getLoopPreheader());
2022 FirstIter->addIncoming(ConstantInt::get(Type::getInt1Ty(Context), 0),
2023 InnerLoop->getLoopLatch());
2024 assert(FirstIter->isComplete() && "The FirstIter PHI node is not complete.");
2025
2026 // When the reduction is initialized from a constant value, we need to add
2027 // a stmt loading from the memory object to target basic block in inner
2028 // loop.
2029 Instruction *LoadMem = Builder.CreateLoad(SR.ElemTy, SR.MemRef);
2030
2031 // Init new_var to MEM_REF or CONST depending on if it is the first iteration.
2032 Value *NewVar = Builder.CreateSelect(FirstIter, SR.Init, LoadMem, "new.var");
2033
2034 // Replace all uses of the reduction variable with a new variable.
2035 SR.Reduction->replaceAllUsesWith(NewVar);
2036
2037 // Move store instruction into inner loop, just after reduction next's
2038 // definition.
2039 SR.LcssaStore->setOperand(0, SR.Next);
2040 SR.LcssaStore->moveAfter(dyn_cast<Instruction>(SR.Next));
2041}
2042
2043bool LoopInterchangeTransform::transform(
2044 ArrayRef<Instruction *> DropNoWrapInsts,
2045 ArrayRef<Instruction *> DropNoInfInsts) {
2046 bool Transformed = false;
2047
2049 LIL.getInnerReductions();
2050 if (InnerReductions.size() == 1)
2051 reduction2Memory();
2052
2053 LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
2054 auto &InductionPHIs = LIL.getInnerLoopInductions();
2055 if (InductionPHIs.empty()) {
2056 LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
2057 return false;
2058 }
2059
2060 SmallVector<Instruction *, 8> InnerIndexVarList;
2061 for (PHINode *CurInductionPHI : InductionPHIs) {
2062 Instruction *IncomingValue = dyn_cast<Instruction>(
2063 CurInductionPHI->getIncomingValueForBlock(InnerLoop->getLoopLatch()));
2064 assert(IncomingValue &&
2065 "Incoming value from loop latch isn't an instruction");
2066 if (is_contained(InductionPHIs, IncomingValue))
2067 continue;
2068 InnerIndexVarList.push_back(IncomingValue);
2069 }
2070
2071 // Create a new latch block for the inner loop. We split at the
2072 // current latch's terminator and then move the condition and all
2073 // operands that are not either loop-invariant or the induction PHI into the
2074 // new latch block.
2075 BasicBlock *NewLatch =
2076 SplitBlock(InnerLoop->getLoopLatch(),
2077 InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
2078
2079 SmallSetVector<Instruction *, 4> WorkList;
2080 unsigned i = 0;
2081 auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
2082 for (; i < WorkList.size(); i++) {
2083 // Duplicate instruction and move it to the new latch. Update uses that
2084 // have been moved.
2085 Instruction *NewI = WorkList[i]->clone();
2086 NewI->insertBefore(NewLatch->getFirstNonPHIIt());
2087 assert(!NewI->mayHaveSideEffects() &&
2088 "Moving instructions with side-effects may change behavior of "
2089 "the loop nest!");
2090 for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
2091 Instruction *UserI = cast<Instruction>(U.getUser());
2092 if (!InnerLoop->contains(UserI->getParent()) ||
2093 UserI->getParent() == NewLatch ||
2094 llvm::is_contained(InductionPHIs, UserI))
2095 U.set(NewI);
2096 }
2097 // Add operands of moved instruction to the worklist, except if they are
2098 // outside the inner loop or are the induction PHI.
2099 for (Value *Op : WorkList[i]->operands()) {
2101 if (!OpI || this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
2102 llvm::is_contained(InductionPHIs, OpI))
2103 continue;
2104 WorkList.insert(OpI);
2105 }
2106 }
2107 };
2108
2109 // FIXME: Should we interchange when we have a constant condition?
2112 ->getCondition());
2113 if (CondI)
2114 WorkList.insert(CondI);
2115 MoveInstructions();
2116 for (Instruction *InnerIndexVar : InnerIndexVarList)
2117 WorkList.insert(cast<Instruction>(InnerIndexVar));
2118 MoveInstructions();
2119
2120 // Ensure the inner loop phi nodes have a separate basic block.
2121 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2122 if (&*InnerLoopHeader->getFirstNonPHIIt() !=
2123 InnerLoopHeader->getTerminator()) {
2124 SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHIIt(), DT, LI);
2125 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
2126 }
2127
2128 // Instructions in the original inner loop preheader may depend on values
2129 // defined in the outer loop header. Move them there, because the original
2130 // inner loop preheader will become the entry into the interchanged loop nest.
2131 // Currently we move all instructions and rely on LICM to move invariant
2132 // instructions outside the loop nest.
2133 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2134 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2135
2136 if (InnerLoopPreHeader != OuterLoopHeader) {
2137 // Eliminate PHIs in the inner-loop preheader.
2138 for (PHINode &P : make_early_inc_range(InnerLoopPreHeader->phis())) {
2139 assert(P.getNumIncomingValues() == 1 &&
2140 "Expected single-incoming PHIs in inner loop preheader");
2141 P.replaceAllUsesWith(P.getIncomingValue(0));
2142 P.eraseFromParent();
2143 }
2144 for (Instruction &I :
2145 make_early_inc_range(make_range(InnerLoopPreHeader->begin(),
2146 std::prev(InnerLoopPreHeader->end()))))
2147 I.moveBeforePreserving(OuterLoopHeader->getTerminator()->getIterator());
2148 }
2149
2150 Transformed |= adjustLoopLinks();
2151 if (!Transformed) {
2152 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
2153 return false;
2154 }
2155
2156 // Finally, drop the nsw/nuw/ninf flags from the instructions for reduction
2157 // calculations.
2158 for (Instruction *Reduction : DropNoWrapInsts) {
2159 Reduction->setHasNoSignedWrap(false);
2160 Reduction->setHasNoUnsignedWrap(false);
2161 }
2162 for (Instruction *I : DropNoInfInsts)
2163 I->setHasNoInfs(false);
2164
2165 return true;
2166}
2167
2168/// \brief Move all instructions except the terminator from FromBB right before
2169/// InsertBefore
2170static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
2171 BasicBlock *ToBB = InsertBefore->getParent();
2172
2173 ToBB->splice(InsertBefore->getIterator(), FromBB, FromBB->begin(),
2174 FromBB->getTerminator()->getIterator());
2175}
2176
2177/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
2178static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
2179 // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
2180 // from BB1 afterwards.
2181 auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
2182 SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
2183 for (Instruction *I : TempInstrs)
2184 I->removeFromParent();
2185
2186 // Move instructions from BB2 to BB1.
2187 moveBBContents(BB2, BB1->getTerminator());
2188
2189 // Move instructions from TempInstrs to BB2.
2190 for (Instruction *I : TempInstrs)
2191 I->insertBefore(BB2->getTerminator()->getIterator());
2192}
2193
2194// Update BI to jump to NewBB instead of OldBB. Records updates to the
2195// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
2196// \p OldBB is exactly once in BI's successor list.
2197static void updateSuccessor(Instruction *Term, BasicBlock *OldBB,
2198 BasicBlock *NewBB,
2199 std::vector<DominatorTree::UpdateType> &DTUpdates,
2200 bool MustUpdateOnce = true) {
2201 assert((!MustUpdateOnce || llvm::count(successors(Term), OldBB) == 1) &&
2202 "BI must jump to OldBB exactly once.");
2203 bool Changed = false;
2204 for (Use &Op : Term->operands())
2205 if (Op == OldBB) {
2206 Op.set(NewBB);
2207 Changed = true;
2208 }
2209
2210 if (Changed) {
2211 DTUpdates.push_back(
2212 {DominatorTree::UpdateKind::Insert, Term->getParent(), NewBB});
2213 DTUpdates.push_back(
2214 {DominatorTree::UpdateKind::Delete, Term->getParent(), OldBB});
2215 }
2216 assert(Changed && "Expected a successor to be updated");
2217}
2218
2219// Move Lcssa PHIs to the right place.
2220static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
2221 BasicBlock *InnerLatch, BasicBlock *OuterHeader,
2222 BasicBlock *OuterLatch, BasicBlock *OuterExit,
2223 Loop *InnerLoop, LoopInfo *LI) {
2224
2225 // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
2226 // defined either in the header or latch. Those blocks will become header and
2227 // latch of the new outer loop, and the only possible users can PHI nodes
2228 // in the exit block of the loop nest or the outer loop header (reduction
2229 // PHIs, in that case, the incoming value must be defined in the inner loop
2230 // header). We can just substitute the user with the incoming value and remove
2231 // the PHI.
2232 for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
2233 assert(P.getNumIncomingValues() == 1 &&
2234 "Only loops with a single exit are supported!");
2235
2236 Value *IncomingValue = P.getIncomingValueForBlock(InnerLatch);
2237 auto *IncI = dyn_cast<Instruction>(IncomingValue);
2238 if (!IncI) {
2239 // If the incoming value is not an instruction, it must be loop invariant.
2240 // In that case, we can just replace the PHI with the incoming value and
2241 // remove the PHI.
2242 assert(InnerLoop->isLoopInvariant(IncomingValue) &&
2243 "Expected non-instruction incoming value to be loop invariant");
2244 P.replaceAllUsesWith(IncomingValue);
2245 P.eraseFromParent();
2246 continue;
2247 }
2248
2249 // In case of multi-level nested loops, follow LCSSA to find the incoming
2250 // value defined from the innermost loop.
2251 auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI));
2252 // Skip phis with incoming values from the inner loop body, excluding the
2253 // header and latch.
2254 if (IncIInnerMost->getParent() != InnerLatch &&
2255 IncIInnerMost->getParent() != InnerHeader)
2256 continue;
2257
2258 assert(all_of(P.users(),
2259 [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
2260 return (cast<PHINode>(U)->getParent() == OuterHeader &&
2261 IncI->getParent() == InnerHeader) ||
2262 cast<PHINode>(U)->getParent() == OuterExit;
2263 }) &&
2264 "Can only replace phis iff the uses are in the loop nest exit or "
2265 "the incoming value is defined in the inner header (it will "
2266 "dominate all loop blocks after interchanging)");
2267 P.replaceAllUsesWith(IncI);
2268 P.eraseFromParent();
2269 }
2270
2271 SmallVector<PHINode *, 8> LcssaInnerExit(
2272 llvm::make_pointer_range(InnerExit->phis()));
2273
2274 SmallVector<PHINode *, 8> LcssaInnerLatch(
2275 llvm::make_pointer_range(InnerLatch->phis()));
2276
2277 // Lcssa PHIs for values used outside the inner loop are in InnerExit.
2278 // If a PHI node has users outside of InnerExit, it has a use outside the
2279 // interchanged loop and we have to preserve it. We move these to
2280 // InnerLatch, which will become the new exit block for the innermost
2281 // loop after interchanging.
2282 for (PHINode *P : LcssaInnerExit)
2283 P->moveBefore(InnerLatch->getFirstNonPHIIt());
2284
2285 // If the inner loop latch contains LCSSA PHIs, those come from a child loop
2286 // and we have to move them to the new inner latch.
2287 for (PHINode *P : LcssaInnerLatch)
2288 P->moveBefore(InnerExit->getFirstNonPHIIt());
2289
2290 // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
2291 // incoming values defined in the outer loop, we have to add a new PHI
2292 // in the inner loop latch, which became the exit block of the outer loop,
2293 // after interchanging.
2294 if (OuterExit) {
2295 for (PHINode &P : OuterExit->phis()) {
2296 if (P.getNumIncomingValues() != 1)
2297 continue;
2298 // Skip Phis with incoming values defined in the inner loop. Those should
2299 // already have been updated.
2300 auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
2301 if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)
2302 continue;
2303
2304 PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
2305 NewPhi->setIncomingValue(0, P.getIncomingValue(0));
2306 NewPhi->setIncomingBlock(0, OuterLatch);
2307 // We might have incoming edges from other BBs, i.e., the original outer
2308 // header.
2309 for (auto *Pred : predecessors(InnerLatch)) {
2310 if (Pred == OuterLatch)
2311 continue;
2312 NewPhi->addIncoming(P.getIncomingValue(0), Pred);
2313 }
2314 NewPhi->insertBefore(InnerLatch->getFirstNonPHIIt());
2315 P.setIncomingValue(0, NewPhi);
2316 }
2317 }
2318
2319 // Now adjust the incoming blocks for the LCSSA PHIs.
2320 // For PHIs moved from Inner's exit block, we need to replace Inner's latch
2321 // with the new latch.
2322 InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
2323}
2324
2325/// This deals with a corner case when a LCSSA phi node appears in a non-exit
2326/// block: the outer loop latch block does not need to be exit block of the
2327/// inner loop. Consider a loop that was in LCSSA form, but then some
2328/// transformation like loop-unswitch comes along and creates an empty block,
2329/// where BB5 in this example is the outer loop latch block:
2330///
2331/// BB4:
2332/// br label %BB5
2333/// BB5:
2334/// %old.cond.lcssa = phi i16 [ %cond, %BB4 ]
2335/// br outer.header
2336///
2337/// Interchange then brings it in LCSSA form again resulting in this chain of
2338/// single-input phi nodes:
2339///
2340/// BB4:
2341/// %new.cond.lcssa = phi i16 [ %cond, %BB3 ]
2342/// br label %BB5
2343/// BB5:
2344/// %old.cond.lcssa = phi i16 [ %new.cond.lcssa, %BB4 ]
2345///
2346/// The problem is that interchange can reoder blocks BB4 and BB5 placing the
2347/// use before the def if we don't check this. The solution is to simplify
2348/// lcssa phi nodes (remove) if they appear in non-exit blocks.
2349///
2350static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop) {
2351 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
2352 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2353
2354 // Do not modify lcssa phis where they actually belong, i.e. in exit blocks.
2355 if (OuterLoopLatch == InnerLoopExit)
2356 return;
2357
2358 // Collect and remove phis in non-exit blocks if they have 1 input.
2360 llvm::make_pointer_range(OuterLoopLatch->phis()));
2361 for (PHINode *Phi : Phis) {
2362 assert(Phi->getNumIncomingValues() == 1 && "Single input phi expected");
2363 LLVM_DEBUG(dbgs() << "Removing 1-input phi in non-exit block: " << *Phi
2364 << "\n");
2365 Phi->replaceAllUsesWith(Phi->getIncomingValue(0));
2366 Phi->eraseFromParent();
2367 }
2368}
2369
2370bool LoopInterchangeTransform::adjustLoopBranches() {
2371 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
2372 std::vector<DominatorTree::UpdateType> DTUpdates;
2373
2374 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2375 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2376
2377 assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
2378 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
2379 InnerLoopPreHeader && "Guaranteed by loop-simplify form");
2380
2381 simplifyLCSSAPhis(OuterLoop, InnerLoop);
2382
2383 // Ensure that both preheaders do not contain PHI nodes and have single
2384 // predecessors. This allows us to move them easily. We use
2385 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
2386 // preheaders do not satisfy those conditions.
2387 if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
2388 !OuterLoopPreHeader->getUniquePredecessor())
2389 OuterLoopPreHeader =
2390 InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
2391 if (InnerLoopPreHeader == OuterLoop->getHeader())
2392 InnerLoopPreHeader =
2393 InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
2394
2395 // Adjust the loop preheader
2396 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2397 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2398 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
2399 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2400 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
2401 BasicBlock *InnerLoopLatchPredecessor =
2402 InnerLoopLatch->getUniquePredecessor();
2403 BasicBlock *InnerLoopLatchSuccessor;
2404 BasicBlock *OuterLoopLatchSuccessor;
2405
2406 CondBrInst *OuterLoopLatchBI =
2407 dyn_cast<CondBrInst>(OuterLoopLatch->getTerminator());
2408 CondBrInst *InnerLoopLatchBI =
2409 dyn_cast<CondBrInst>(InnerLoopLatch->getTerminator());
2410 Instruction *OuterLoopHeaderBI = OuterLoopHeader->getTerminator();
2411 Instruction *InnerLoopHeaderBI = InnerLoopHeader->getTerminator();
2412
2413 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
2414 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
2415 !InnerLoopHeaderBI)
2416 return false;
2417
2418 Instruction *InnerLoopLatchPredecessorBI =
2419 InnerLoopLatchPredecessor->getTerminator();
2420 Instruction *OuterLoopPredecessorBI = OuterLoopPredecessor->getTerminator();
2421
2422 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
2423 return false;
2424 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
2425 if (!InnerLoopHeaderSuccessor)
2426 return false;
2427
2428 // Adjust Loop Preheader and headers.
2429 // The branches in the outer loop predecessor and the outer loop header can
2430 // be unconditional branches or conditional branches with duplicates. Consider
2431 // this when updating the successors.
2432 updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
2433 InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
2434 // The outer loop header might or might not branch to the outer latch.
2435 // We are guaranteed to branch to the inner loop preheader.
2436 if (llvm::is_contained(successors(OuterLoopHeaderBI), OuterLoopLatch)) {
2437 // In this case the outerLoopHeader should branch to the InnerLoopLatch.
2438 updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch,
2439 DTUpdates,
2440 /*MustUpdateOnce=*/false);
2441 }
2442 updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
2443 InnerLoopHeaderSuccessor, DTUpdates,
2444 /*MustUpdateOnce=*/false);
2445
2446 // Adjust reduction PHI's now that the incoming block has changed.
2447 InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
2448 OuterLoopHeader);
2449
2450 updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
2451 OuterLoopPreHeader, DTUpdates);
2452
2453 // -------------Adjust loop latches-----------
2454 if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
2455 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
2456 else
2457 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
2458
2459 updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
2460 InnerLoopLatchSuccessor, DTUpdates);
2461
2462 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
2463 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
2464 else
2465 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
2466
2467 updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
2468 OuterLoopLatchSuccessor, DTUpdates);
2469 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
2470 DTUpdates);
2471
2472 DT->applyUpdates(DTUpdates);
2473 restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
2474 OuterLoopPreHeader);
2475
2476 moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
2477 OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),
2478 InnerLoop, LI);
2479 // For PHIs in the exit block of the outer loop, outer's latch has been
2480 // replaced by Inners'.
2481 OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
2482
2483 auto &OuterInnerReductions = LIL.getOuterInnerReductions();
2484 // Now update the reduction PHIs in the inner and outer loop headers.
2485 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
2486 for (PHINode &PHI : InnerLoopHeader->phis())
2487 if (OuterInnerReductions.contains(&PHI))
2488 InnerLoopPHIs.push_back(&PHI);
2489
2490 for (PHINode &PHI : OuterLoopHeader->phis())
2491 if (OuterInnerReductions.contains(&PHI))
2492 OuterLoopPHIs.push_back(&PHI);
2493
2494 // Now move the remaining reduction PHIs from outer to inner loop header and
2495 // vice versa. The PHI nodes must be part of a reduction across the inner and
2496 // outer loop and all the remains to do is and updating the incoming blocks.
2497 for (PHINode *PHI : OuterLoopPHIs) {
2498 LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
2499 PHI->moveBefore(InnerLoopHeader->getFirstNonPHIIt());
2500 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2501 }
2502 for (PHINode *PHI : InnerLoopPHIs) {
2503 LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
2504 PHI->moveBefore(OuterLoopHeader->getFirstNonPHIIt());
2505 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2506 }
2507
2508 // Update the incoming blocks for moved PHI nodes.
2509 OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
2510 OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
2511 InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
2512 InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
2513
2514 // Values defined in the outer loop header could be used in the inner loop
2515 // latch. In that case, we need to create LCSSA phis for them, because after
2516 // interchanging they will be defined in the new inner loop and used in the
2517 // new outer loop.
2518 SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
2519 for (Instruction &I :
2520 make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end())))
2521 MayNeedLCSSAPhis.push_back(&I);
2522 formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE);
2523
2524 return true;
2525}
2526
2527bool LoopInterchangeTransform::adjustLoopLinks() {
2528 // Adjust all branches in the inner and outer loop.
2529 bool Changed = adjustLoopBranches();
2530 if (Changed) {
2531 // We have interchanged the preheaders so we need to interchange the data in
2532 // the preheaders as well. This is because the content of the inner
2533 // preheader was previously executed inside the outer loop.
2534 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2535 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2536 swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
2537 }
2538 return Changed;
2539}
2540
2544 LPMUpdater &U) {
2545 Function &F = *LN.getParent();
2546 SmallVector<Loop *, 8> LoopList(LN.getLoops());
2547
2549
2550 // Ensure minimum depth of the loop nest to do the interchange.
2551 if (!hasSupportedLoopDepth(LoopList, ORE))
2552 return PreservedAnalyses::all();
2553 // Ensure computable loop nest.
2554 if (!isComputableLoopNest(&AR.SE, LoopList)) {
2555 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
2556 return PreservedAnalyses::all();
2557 }
2558
2559 ORE.emit([&]() {
2560 return OptimizationRemarkAnalysis(DEBUG_TYPE, "Dependence",
2563 << "Computed dependence info, invoking the transform.";
2564 });
2565
2566 DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
2567 if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN))
2568 return PreservedAnalyses::all();
2569 U.markLoopNestChanged(true);
2571}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file defines the StringMap class.
Rewrite undef for PHI
ReachingDefInfo InstSet InstSet & Ignore
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
DXIL Resource Access
#define DEBUG_TYPE
static void moveBBContents(BasicBlock &SourceBB, BasicBlock &TargetBB)
Move the contents of SourceBB to before the last instruction of TargetBB.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file defines the interface for the loop cache analysis.
SmallVector< Loop *, 4 > LoopVector
Definition LoopFuse.cpp:362
Loop::LoopBounds::Direction Direction
Definition LoopInfo.cpp:253
static cl::list< RuleTy > Profitabilities("loop-interchange-profitabilities", cl::MiscFlags::CommaSeparated, cl::Hidden, cl::desc("List of profitability heuristics to be used. They are applied in " "the given order"), cl::list_init< RuleTy >({RuleTy::PerInstrOrderCost, RuleTy::ForVectorization}), cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache", "Prioritize loop cache cost"), clEnumValN(RuleTy::PerInstrOrderCost, "instorder", "Prioritize the IVs order of each instruction"), clEnumValN(RuleTy::ForVectorization, "vectorize", "Prioritize vectorization"), clEnumValN(RuleTy::Ignore, "ignore", "Ignore profitability, force interchange (does not " "work with other options)")))
static cl::opt< int > LoopInterchangeCostThreshold("loop-interchange-threshold", cl::init(0), cl::Hidden, cl::desc("Interchange if you gain more than this number"))
static cl::opt< unsigned int > MinLoopNestDepth("loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden, cl::desc("Minimum depth of loop nest considered for the transform"))
static void updateSuccessor(Instruction *Term, BasicBlock *OldBB, BasicBlock *NewBB, std::vector< DominatorTree::UpdateType > &DTUpdates, bool MustUpdateOnce=true)
static cl::opt< bool > EnableReduction2Memory("loop-interchange-reduction-to-mem", cl::init(false), cl::Hidden, cl::desc("Support for the inner-loop reduction pattern."))
static bool isComputableLoopNest(ScalarEvolution *SE, ArrayRef< Loop * > LoopList)
std::optional< const SCEV * > getAddRecCoefficient(ScalarEvolution &SE, const SCEV *S, const Loop *L)
If \S contains an affine addrec for L, return the step recurrence of it.
static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop)
This deals with a corner case when a LCSSA phi node appears in a non-exit block: the outer loop latch...
static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, unsigned ToIndx)
static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, BasicBlock *InnerLatch, BasicBlock *OuterHeader, BasicBlock *OuterLatch, BasicBlock *OuterExit, Loop *InnerLoop, LoopInfo *LI)
static void printDepMatrix(CharMatrix &DepMatrix)
static cl::opt< unsigned int > MaxMemInstrRatio("loop-interchange-max-mem-instr-ratio", cl::init(4), cl::Hidden, cl::desc("Maximum number of load/store instructions squared in relation to " "the total number of instructions. Higher value may lead to more " "interchanges at the cost of compile-time"))
static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2)
Swap instructions between BB1 and BB2 but keep terminators intact.
static PHINode * findInnerReductionPhi(Loop *L, Value *V, SmallVectorImpl< Instruction * > &HasNoWrapInsts, SmallVectorImpl< Instruction * > &HasNoInfInsts)
static bool areInnerLoopExitPHIsSupported(Loop *OuterL, Loop *InnerL, SmallPtrSetImpl< PHINode * > &Reductions, PHINode *LcssaReduction)
We currently only support LCSSA PHI nodes in the inner loop exit if their users are either of the fol...
static cl::opt< unsigned int > MaxLoopNestDepth("loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden, cl::desc("Maximum depth of loop nest considered for the transform"))
static bool hasSupportedLoopDepth(ArrayRef< Loop * > LoopList, OptimizationRemarkEmitter &ORE)
static bool inThisOrder(const Instruction *Src, const Instruction *Dst)
Return true if Src appears before Dst in the same basic block.
static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId)
Return true if we can vectorize the loop specified by LoopId.
static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, unsigned InnerLoopId, unsigned OuterLoopId)
#define DEBUG_TYPE
static Value * followLCSSA(Value *SV)
static void populateWorklist(Loop &L, LoopVector &LoopList)
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L, DependenceInfo *DI, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
static std::optional< bool > isLexicographicallyPositive(ArrayRef< char > DV, unsigned Begin, unsigned End)
static bool checkReductionKind(Loop *L, PHINode *PHI, SmallVectorImpl< Instruction * > &HasNoWrapInsts, SmallVectorImpl< Instruction * > &HasNoInfInsts)
static bool noDuplicateRulesAndIgnore(ArrayRef< RuleTy > Rules)
This file defines the interface for the loop nest analysis.
This header provides classes for managing a pipeline of passes over loops in LLVM IR.
loop Loop Strength Reduction
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
uint64_t IntrinsicInst * II
#define P(N)
This file contains some templates that are useful if you are working with the STL at all.
SmallVector< Value *, 8 > ValueVector
This file defines the SmallSet class.
This file defines the SmallVector class.
static bool isProfitable(const StableFunctionMap::StableFunctionEntries &SFS)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:119
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:185
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:461
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:530
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI void replacePhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block to refer to basic block New instead of basic block Old.
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:659
static LLVM_ABI std::unique_ptr< CacheCost > getCacheCost(Loop &Root, LoopStandardAnalysisResults &AR, DependenceInfo &DI, std::optional< unsigned > TRT=std::nullopt)
Create a CacheCost for the loop nest rooted by Root.
CacheCostTy getLoopCost(const Loop &L) const
Return the estimated cost of loop L if the given loop is part of the loop nest associated with this o...
Value * getCondition() const
BasicBlock * getSuccessor(unsigned i) const
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:225
iterator end()
Definition DenseMap.h:143
DependenceInfo - This class is the main dependence-analysis driver.
LLVM_ABI std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool UnderRuntimeAssumptions=false)
depends - Tests for a dependence between the Src and Dst instructions.
void applyUpdates(ArrayRef< UpdateType > Updates)
Inform the dominator tree about a sequence of CFG edge insertions and deletions and perform a batch u...
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static LLVM_ABI bool isInductionPHI(PHINode *Phi, const Loop *L, ScalarEvolution *SE, InductionDescriptor &D, ArrayRef< const SCEVPredicate * > NoWrapPreds={}, const SCEV *Expr=nullptr, SmallVectorImpl< Instruction * > *CastsToIgnore=nullptr)
Returns true if Phi is an induction in the loop L.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI void insertBefore(InstListType::iterator InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified position.
LLVM_ABI bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void removeBlockFromLoop(BlockT *BB)
This removes the specified basic block from the current loop, updating the Blocks as appropriate.
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
BlockT * getHeader() const
iterator_range< block_iterator > blocks() const
void addChildLoop(LoopT *NewChild)
Add the specified loop to be a child of this loop.
void addBlockEntry(BlockT *BB)
This adds a basic block directly to the basic block list.
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
iterator begin() const
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
LoopT * removeChildLoop(iterator I)
This removes the specified child from being a subloop of this loop.
void changeTopLevelLoop(LoopT *OldLoop, LoopT *NewLoop)
Replace the specified loop in the top-level loops list with the indicated loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
void changeLoopFor(const BlockT *BB, LoopT *L)
Change the top-level loop that contains BB to the specified loop.
This class represents a loop nest and can be used to query its properties.
static const BasicBlock & skipEmptyBlockUntil(const BasicBlock *From, const BasicBlock *End, bool CheckUniquePred=false)
Recursivelly traverse all empty 'single successor' basic blocks of From (if there are any).
ArrayRef< Loop * > getLoops() const
Get the loops in the nest.
Function * getParent() const
Return the function to which the loop-nest belongs.
Loop & getOutermostLoop() const
Return the outermost loop in the loop nest.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition LoopInfo.cpp:659
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition LoopInfo.cpp:67
StringRef getName() const
Definition LoopInfo.h:407
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
bool isComplete() const
If the PHI node is complete which means all of its parent's predecessors have incoming value in this ...
op_range incoming_values()
void setIncomingBlock(unsigned i, BasicBlock *BB)
void setIncomingValue(unsigned i, Value *V)
static unsigned getIncomingValueNumForOperand(unsigned i)
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Instruction * getExactFPMathInst() const
Returns 1st non-reassociative FP instruction in the PHI node's use-chain.
static LLVM_ABI bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, DemandedBits *DB=nullptr, AssumptionCache *AC=nullptr, DominatorTree *DT=nullptr, ScalarEvolution *SE=nullptr)
Returns true if Phi is a reduction in TheLoop.
LLVM_ABI SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
SCEVUse getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
The main scalar evolution driver.
LLVM_ABI const SCEV * getAbsExpr(const SCEV *Op, bool IsNSW)
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:136
std::pair< iterator, bool > try_emplace(StringRef Key, ArgsTy &&...Args)
Emplace a new element for the specified key into the map if the key isn't already in the map.
Definition StringMap.h:387
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
constexpr size_t size() const
Get the string size.
Definition StringRef.h:144
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:162
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:552
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:184
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
list_initializer< Ty > list_init(ArrayRef< Ty > Vals)
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI BasicBlock * InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
InsertPreheaderForLoop - Once we discover that a loop doesn't have a preheader, this method is called...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto successors(const MachineBasicBlock *BB)
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
LLVM_ABI bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition LCSSA.cpp:449
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
auto map_range(ContainerTy &&C, FuncTy F)
Return a range that applies F to the elements of C.
Definition STLExtras.h:365
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2025
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FMinimumNum
FP min with llvm.minimumnum semantics.
@ Or
Bitwise or logical OR of integers.
@ FMinimum
FP min with llvm.minimum semantics.
@ Mul
Product of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMaximum
FP max with llvm.maximum semantics.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ FMaximumNum
FP max with llvm.maximumnum semantics.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2011
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition LCSSA.cpp:308
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
auto predecessors(const MachineBasicBlock *BB)
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:368
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
LLVM_ABI PreservedAnalyses run(LoopNest &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...