LLVM 23.0.0git
LoopInterchange.cpp
Go to the documentation of this file.
1//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This Pass handles loop interchange transform.
10// This pass interchanges loops to provide a more cache-friendly memory access
11// patterns.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/STLExtras.h"
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/ADT/StringMap.h"
21#include "llvm/ADT/StringRef.h"
30#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/Dominators.h"
33#include "llvm/IR/Function.h"
34#include "llvm/IR/IRBuilder.h"
35#include "llvm/IR/InstrTypes.h"
36#include "llvm/IR/Instruction.h"
38#include "llvm/IR/User.h"
39#include "llvm/IR/Value.h"
42#include "llvm/Support/Debug.h"
49#include <cassert>
50#include <utility>
51#include <vector>
52
53using namespace llvm;
54
55#define DEBUG_TYPE "loop-interchange"
56
57STATISTIC(LoopsInterchanged, "Number of loops interchanged");
58
60 "loop-interchange-threshold", cl::init(0), cl::Hidden,
61 cl::desc("Interchange if you gain more than this number"));
62
64 "loop-interchange-max-mem-instr-ratio", cl::init(4), cl::Hidden,
65 cl::desc("Maximum number of load/store instructions squared in relation to "
66 "the total number of instructions. Higher value may lead to more "
67 "interchanges at the cost of compile-time"));
68
69namespace {
70
72
73/// A list of direction vectors. Each entry represents a direction vector
74/// corresponding to one or more dependencies existing in the loop nest. The
75/// length of all direction vectors is equal and is N + 1, where N is the depth
76/// of the loop nest. The first N elements correspond to the dependency
77/// direction of each N loops. The last one indicates whether this entry is
78/// forward dependency ('<') or not ('*'). The term "forward" aligns with what
79/// is defined in LoopAccessAnalysis.
80// TODO: Check if we can use a sparse matrix here.
81using CharMatrix = std::vector<std::vector<char>>;
82
83/// Types of rules used in profitability check.
84enum class RuleTy {
85 PerLoopCacheAnalysis,
86 PerInstrOrderCost,
87 ForVectorization,
88 Ignore
89};
90
91} // end anonymous namespace
92
93// Minimum loop depth supported.
95 "loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden,
96 cl::desc("Minimum depth of loop nest considered for the transform"));
97
98// Maximum loop depth supported.
100 "loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden,
101 cl::desc("Maximum depth of loop nest considered for the transform"));
102
103// We prefer cache cost to vectorization by default.
105 "loop-interchange-profitabilities", cl::MiscFlags::CommaSeparated,
107 cl::desc("List of profitability heuristics to be used. They are applied in "
108 "the given order"),
109 cl::list_init<RuleTy>({RuleTy::PerInstrOrderCost,
110 RuleTy::ForVectorization}),
111 cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache",
112 "Prioritize loop cache cost"),
113 clEnumValN(RuleTy::PerInstrOrderCost, "instorder",
114 "Prioritize the IVs order of each instruction"),
115 clEnumValN(RuleTy::ForVectorization, "vectorize",
116 "Prioritize vectorization"),
117 clEnumValN(RuleTy::Ignore, "ignore",
118 "Ignore profitability, force interchange (does not "
119 "work with other options)")));
120
121// Support for the inner-loop reduction pattern.
123 "loop-interchange-reduction-to-mem", cl::init(false), cl::Hidden,
124 cl::desc("Support for the inner-loop reduction pattern."));
125
126#ifndef NDEBUG
129 for (RuleTy Rule : Rules) {
130 if (!Set.insert(Rule).second)
131 return false;
132 if (Rule == RuleTy::Ignore)
133 return false;
134 }
135 return true;
136}
137
138static void printDepMatrix(CharMatrix &DepMatrix) {
139 for (auto &Row : DepMatrix) {
140 // Drop the last element because it is a flag indicating whether this is
141 // forward dependency or not, which doesn't affect the legality check.
142 for (char D : drop_end(Row))
143 LLVM_DEBUG(dbgs() << D << " ");
144 LLVM_DEBUG(dbgs() << "\n");
145 }
146}
147
148/// Return true if \p Src appears before \p Dst in the same basic block.
149/// Precondition: \p Src and \Dst are distinct instructions within the same
150/// basic block.
151static bool inThisOrder(const Instruction *Src, const Instruction *Dst) {
152 assert(Src->getParent() == Dst->getParent() && Src != Dst &&
153 "Expected Src and Dst to be different instructions in the same BB");
154
155 bool FoundSrc = false;
156 for (const Instruction &I : *(Src->getParent())) {
157 if (&I == Src) {
158 FoundSrc = true;
159 continue;
160 }
161 if (&I == Dst)
162 return FoundSrc;
163 }
164
165 llvm_unreachable("Dst not found");
166}
167#endif
168
169static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
170 Loop *L, DependenceInfo *DI,
171 ScalarEvolution *SE,
174
175 ValueVector MemInstr;
176 unsigned NumInsts = 0;
177
178 // For each block.
179 for (BasicBlock *BB : L->blocks()) {
180 // Scan the BB and collect legal loads and stores.
181 for (Instruction &I : *BB) {
182 NumInsts++;
183 if (auto *Ld = dyn_cast<LoadInst>(&I)) {
184 if (!Ld->isSimple())
185 return false;
186 MemInstr.push_back(&I);
187 } else if (auto *St = dyn_cast<StoreInst>(&I)) {
188 if (!St->isSimple())
189 return false;
190 MemInstr.push_back(&I);
191 }
192 }
193 }
194
195 // To populate the dependence matrix, we perform dependence test for each pair
196 // of memory instructions, which has O(NumMemInstr^2) complexity. This implies
197 // that even if the number of memory instructions is small, the analysis can
198 // still be expensive if the most of the instructions in the loop are memory
199 // instructions. On the other hand, if the number of memory instructions is
200 // not small, but the loop is large (i.e., it contains many non-memory
201 // instructions), the analysis can still be affordable.
202 unsigned NumMemInstr = MemInstr.size();
203 LLVM_DEBUG(dbgs() << "Found " << NumMemInstr
204 << " Loads and Stores to analyze\n");
205 if (MaxMemInstrRatio * NumInsts < NumMemInstr * NumMemInstr) {
206 ORE->emit([&]() {
207 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop",
208 L->getStartLoc(), L->getHeader())
209 << "Number of loads/stores exceeded, the supported maximum can be "
210 "increased with option -loop-interchange-max-mem-instr-ratio.";
211 });
212 return false;
213 }
214 ValueVector::iterator I, IE, J, JE;
215
216 // Manage direction vectors that are already seen. Map each direction vector
217 // to an index of DepMatrix at which it is stored.
219
220 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
221 for (J = I, JE = MemInstr.end(); J != JE; ++J) {
222 std::vector<char> Dep;
225 // Ignore Input dependencies.
226 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
227 continue;
228 // Track Output, Flow, and Anti dependencies.
229 if (auto D = DI->depends(Src, Dst)) {
230 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
231 // If the direction vector is negative, normalize it to
232 // make it non-negative.
233 if (D->normalize(SE))
234 LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
235 LLVM_DEBUG(StringRef DepType =
236 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
237 dbgs() << "Found " << DepType
238 << " dependency between Src and Dst\n"
239 << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
240 unsigned Levels = D->getLevels();
241 char Direction;
242 for (unsigned II = 1; II <= Levels; ++II) {
243 // `DVEntry::LE` is converted to `*`. This is because `LE` means `<`
244 // or `=`, for which we don't have an equivalent representation, so
245 // that the conservative approximation is necessary. The same goes for
246 // `DVEntry::GE`.
247 // TODO: Use of fine-grained expressions allows for more accurate
248 // analysis.
249 unsigned Dir = D->getDirection(II);
250 if (Dir == Dependence::DVEntry::LT)
251 Direction = '<';
252 else if (Dir == Dependence::DVEntry::GT)
253 Direction = '>';
254 else if (Dir == Dependence::DVEntry::EQ)
255 Direction = '=';
256 else
257 Direction = '*';
258 Dep.push_back(Direction);
259 }
260
261 // If the Dependence object doesn't have any information, fill the
262 // dependency vector with '*'.
263 if (D->isConfused()) {
264 assert(Dep.empty() && "Expected empty dependency vector");
265 Dep.assign(Level, '*');
266 }
267
268 while (Dep.size() != Level) {
269 Dep.push_back('I');
270 }
271
272 // If all the elements of any direction vector have only '*', legality
273 // can't be proven. Exit early to save compile time.
274 if (all_of(Dep, equal_to('*'))) {
275 ORE->emit([&]() {
276 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
277 L->getStartLoc(), L->getHeader())
278 << "All loops have dependencies in all directions.";
279 });
280 return false;
281 }
282
283 // Test whether the dependency is forward or not.
284 bool IsKnownForward = true;
285 if (Src->getParent() != Dst->getParent()) {
286 // In general, when Src and Dst are in different BBs, the execution
287 // order of them within a single iteration is not guaranteed. Treat
288 // conservatively as not-forward dependency in this case.
289 IsKnownForward = false;
290 } else {
291 // Src and Dst are in the same BB. If they are the different
292 // instructions, Src should appear before Dst in the BB as they are
293 // stored to MemInstr in that order.
294 assert((Src == Dst || inThisOrder(Src, Dst)) &&
295 "Unexpected instructions");
296
297 // If the Dependence object is reversed (due to normalization), it
298 // represents the dependency from Dst to Src, meaning it is a backward
299 // dependency. Otherwise it should be a forward dependency.
300 bool IsReversed = D->getSrc() != Src;
301 if (IsReversed)
302 IsKnownForward = false;
303 }
304
305 // Initialize the last element. Assume forward dependencies only; it
306 // will be updated later if there is any non-forward dependency.
307 Dep.push_back('<');
308
309 // The last element should express the "summary" among one or more
310 // direction vectors whose first N elements are the same (where N is
311 // the depth of the loop nest). Hence we exclude the last element from
312 // the Seen map.
313 auto [Ite, Inserted] = Seen.try_emplace(
314 StringRef(Dep.data(), Dep.size() - 1), DepMatrix.size());
315
316 // Make sure we only add unique entries to the dependency matrix.
317 if (Inserted)
318 DepMatrix.push_back(Dep);
319
320 // If we cannot prove that this dependency is forward, change the last
321 // element of the corresponding entry. Since a `[... *]` dependency
322 // includes a `[... <]` dependency, we do not need to keep both and
323 // change the existing entry instead.
324 if (!IsKnownForward)
325 DepMatrix[Ite->second].back() = '*';
326 }
327 }
328 }
329
330 return true;
331}
332
333// A loop is moved from index 'from' to an index 'to'. Update the Dependence
334// matrix by exchanging the two columns.
335static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
336 unsigned ToIndx) {
337 for (auto &Row : DepMatrix)
338 std::swap(Row[ToIndx], Row[FromIndx]);
339}
340
341// Check if a direction vector is lexicographically positive. Return true if it
342// is positive, nullopt if it is "zero", otherwise false.
343// [Theorem] A permutation of the loops in a perfect nest is legal if and only
344// if the direction matrix, after the same permutation is applied to its
345// columns, has no ">" direction as the leftmost non-"=" direction in any row.
346static std::optional<bool>
347isLexicographicallyPositive(ArrayRef<char> DV, unsigned Begin, unsigned End) {
348 for (unsigned char Direction : DV.slice(Begin, End - Begin)) {
349 if (Direction == '<')
350 return true;
351 if (Direction == '>' || Direction == '*')
352 return false;
353 }
354 return std::nullopt;
355}
356
357// Checks if it is legal to interchange 2 loops.
358static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
359 unsigned InnerLoopId,
360 unsigned OuterLoopId) {
361 unsigned NumRows = DepMatrix.size();
362 std::vector<char> Cur;
363 // For each row check if it is valid to interchange.
364 for (unsigned Row = 0; Row < NumRows; ++Row) {
365 // Create temporary DepVector check its lexicographical order
366 // before and after swapping OuterLoop vs InnerLoop
367 Cur = DepMatrix[Row];
368
369 // If the surrounding loops already ensure that the direction vector is
370 // lexicographically positive, nothing within the loop will be able to break
371 // the dependence. In such a case we can skip the subsequent check.
372 if (isLexicographicallyPositive(Cur, 0, OuterLoopId) == true)
373 continue;
374
375 // Check if the direction vector is lexicographically positive (or zero)
376 // for both before/after exchanged. Ignore the last element because it
377 // doesn't affect the legality.
378 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
379 return false;
380 std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
381 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
382 return false;
383 }
384 return true;
385}
386
387static void populateWorklist(Loop &L, LoopVector &LoopList) {
388 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
389 << L.getHeader()->getParent()->getName() << " Loop: %"
390 << L.getHeader()->getName() << '\n');
391 assert(LoopList.empty() && "LoopList should initially be empty!");
392 Loop *CurrentLoop = &L;
393 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
394 while (!Vec->empty()) {
395 // The current loop has multiple subloops in it hence it is not tightly
396 // nested.
397 // Discard all loops above it added into Worklist.
398 if (Vec->size() != 1) {
399 LoopList = {};
400 return;
401 }
402
403 LoopList.push_back(CurrentLoop);
404 CurrentLoop = Vec->front();
405 Vec = &CurrentLoop->getSubLoops();
406 }
407 LoopList.push_back(CurrentLoop);
408}
409
412 unsigned LoopNestDepth = LoopList.size();
413 if (LoopNestDepth < MinLoopNestDepth || LoopNestDepth > MaxLoopNestDepth) {
414 LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth
415 << ", the supported range is [" << MinLoopNestDepth
416 << ", " << MaxLoopNestDepth << "].\n");
417 Loop *OuterLoop = LoopList.front();
418 ORE.emit([&]() {
419 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoopNestDepth",
420 OuterLoop->getStartLoc(),
421 OuterLoop->getHeader())
422 << "Unsupported depth of loop nest, the supported range is ["
423 << std::to_string(MinLoopNestDepth) << ", "
424 << std::to_string(MaxLoopNestDepth) << "].\n";
425 });
426 return false;
427 }
428 return true;
429}
430
432 ArrayRef<Loop *> LoopList) {
433 for (Loop *L : LoopList) {
434 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
435 if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
436 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
437 return false;
438 }
439 if (L->getNumBackEdges() != 1) {
440 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
441 return false;
442 }
443 if (!L->getExitingBlock()) {
444 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
445 return false;
446 }
447 }
448 return true;
449}
450
451namespace {
452
453/// LoopInterchangeLegality checks if it is legal to interchange the loop.
454class LoopInterchangeLegality {
455public:
456 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
457 OptimizationRemarkEmitter *ORE, DominatorTree *DT)
458 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), DT(DT), ORE(ORE) {}
459
460 /// Check if the loops can be interchanged.
461 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
462 CharMatrix &DepMatrix);
463
464 /// Check if the loop structure is understood. We do not handle triangular
465 /// loops for now.
466 bool isLoopStructureUnderstood();
467
468 bool currentLimitations();
469
470 const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
471 return OuterInnerReductions;
472 }
473
474 const ArrayRef<PHINode *> getInnerLoopInductions() const {
475 return InnerLoopInductions;
476 }
477
478 ArrayRef<Instruction *> getHasNoWrapReductions() const {
479 return HasNoWrapReductions;
480 }
481
482 ArrayRef<Instruction *> getHasNoInfInsts() const { return HasNoInfInsts; }
483
484 /// Record reductions in the inner loop. Currently supported reductions:
485 /// - initialized from a constant.
486 /// - reduction PHI node has only one user.
487 /// - located in the innermost loop.
488 struct InnerReduction {
489 /// The reduction itself.
490 PHINode *Reduction;
491 Value *Init;
492 Value *Next;
493 /// The Lcssa PHI.
494 PHINode *LcssaPhi;
495 /// Store reduction result into memory object.
496 StoreInst *LcssaStore;
497 /// The memory Location.
498 Value *MemRef;
499 Type *ElemTy;
500 };
501
502 ArrayRef<InnerReduction> getInnerReductions() const {
503 return InnerReductions;
504 }
505
506private:
507 bool tightlyNested(Loop *Outer, Loop *Inner);
508 bool containsUnsafeInstructions(BasicBlock *BB, Instruction *Skip);
509
510 /// Traverse all PHI nodes in the header of each loop in the loop nest
511 /// starting from \p OuterLoop, and perform the following checks:
512 ///
513 /// - Identify induction variables in the child loop of \p OuterLoop.
514 /// - Check for reductions across the inner loop and \p OuterLoop.
515 /// - Detect unsupported PHI nodes.
516 ///
517 /// Return false if any unsupported PHI node is found or if no induction
518 /// variable is found in the child loop of \p OuterLoop. Otherwise return
519 /// true.
520 bool checkInductionsAndReductions(Loop *OuterLoop);
521
522 /// Detect and record the reduction of the inner loop. Add them to
523 /// InnerReductions.
524 ///
525 /// innerloop:
526 /// Re = phi<0.0, Next>
527 /// Next = Re op ...
528 /// OuterLoopLatch:
529 /// Lcssa = phi<Next> ; lcssa phi
530 /// store Lcssa, MemRef ; LcssaStore
531 ///
532 bool isInnerReduction(Loop *L, PHINode *Phi,
533 SmallVectorImpl<Instruction *> &HasNoWrapInsts);
534
535 Loop *OuterLoop;
536 Loop *InnerLoop;
537
538 ScalarEvolution *SE;
539 DominatorTree *DT;
540
541 /// Interface to emit optimization remarks.
542 OptimizationRemarkEmitter *ORE;
543
544 /// Set of reduction PHIs taking part of a reduction across the inner and
545 /// outer loop.
546 SmallPtrSet<PHINode *, 4> OuterInnerReductions;
547
548 /// Set of inner loop induction PHIs
549 SmallVector<PHINode *, 8> InnerLoopInductions;
550
551 /// Hold instructions that have nuw/nsw flags and involved in reductions,
552 /// like integer addition/multiplication. Those flags must be dropped when
553 /// interchanging the loops.
554 SmallVector<Instruction *, 4> HasNoWrapReductions;
555
556 /// Hold instructions that have ninf flags and involved in reductions. Those
557 /// flags must be dropped when interchanging the loops.
558 SmallVector<Instruction *, 4> HasNoInfInsts;
559
560 /// Vector of reductions in the inner loop.
561 SmallVector<InnerReduction, 8> InnerReductions;
562};
563
564/// Manages information utilized by the profitability check for cache. The main
565/// purpose of this class is to delay the computation of CacheCost until it is
566/// actually needed.
567class CacheCostManager {
568 Loop *OutermostLoop;
569 LoopStandardAnalysisResults *AR;
570 DependenceInfo *DI;
571
572 /// CacheCost for \ref OutermostLoop. Once it is computed, it is cached. Note
573 /// that the result can be nullptr.
574 std::optional<std::unique_ptr<CacheCost>> CC;
575
576 /// Maps each loop to an index representing the optimal position within the
577 /// loop-nest, as determined by the cache cost analysis.
578 DenseMap<const Loop *, unsigned> CostMap;
579
580 void computeIfUnitinialized();
581
582public:
583 CacheCostManager(Loop *OutermostLoop, LoopStandardAnalysisResults *AR,
584 DependenceInfo *DI)
585 : OutermostLoop(OutermostLoop), AR(AR), DI(DI) {}
586 CacheCost *getCacheCost();
587 const DenseMap<const Loop *, unsigned> &getCostMap();
588};
589
590/// LoopInterchangeProfitability checks if it is profitable to interchange the
591/// loop.
592class LoopInterchangeProfitability {
593public:
594 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
595 OptimizationRemarkEmitter *ORE)
596 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
597
598 /// Check if the loop interchange is profitable.
599 bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
600 unsigned InnerLoopId, unsigned OuterLoopId,
601 CharMatrix &DepMatrix, CacheCostManager &CCM);
602
603private:
604 int getInstrOrderCost();
605 std::optional<bool> isProfitablePerLoopCacheAnalysis(
606 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC);
607 std::optional<bool> isProfitablePerInstrOrderCost();
608 std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
609 unsigned OuterLoopId,
610 CharMatrix &DepMatrix);
611 Loop *OuterLoop;
612 Loop *InnerLoop;
613
614 /// Scev analysis.
615 ScalarEvolution *SE;
616
617 /// Interface to emit optimization remarks.
618 OptimizationRemarkEmitter *ORE;
619};
620
621/// LoopInterchangeTransform interchanges the loop.
622class LoopInterchangeTransform {
623public:
624 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
625 LoopInfo *LI, DominatorTree *DT,
626 const LoopInterchangeLegality &LIL)
627 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
628
629 /// Interchange OuterLoop and InnerLoop.
630 bool transform(ArrayRef<Instruction *> DropNoWrapInsts,
631 ArrayRef<Instruction *> DropNoInfInsts);
632 void reduction2Memory();
633 void restructureLoops(Loop *NewInner, Loop *NewOuter,
634 BasicBlock *OrigInnerPreHeader,
635 BasicBlock *OrigOuterPreHeader);
636 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
637
638private:
639 bool adjustLoopLinks();
640 bool adjustLoopBranches();
641
642 Loop *OuterLoop;
643 Loop *InnerLoop;
644
645 /// Scev analysis.
646 ScalarEvolution *SE;
647
648 LoopInfo *LI;
649 DominatorTree *DT;
650
651 const LoopInterchangeLegality &LIL;
652};
653
654struct LoopInterchange {
655 ScalarEvolution *SE = nullptr;
656 LoopInfo *LI = nullptr;
657 DependenceInfo *DI = nullptr;
658 DominatorTree *DT = nullptr;
659 LoopStandardAnalysisResults *AR = nullptr;
660
661 /// Interface to emit optimization remarks.
662 OptimizationRemarkEmitter *ORE;
663
664 LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
665 DominatorTree *DT, LoopStandardAnalysisResults *AR,
666 OptimizationRemarkEmitter *ORE)
667 : SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE) {}
668
669 bool run(Loop *L) {
670 if (L->getParentLoop())
671 return false;
672 SmallVector<Loop *, 8> LoopList;
673 populateWorklist(*L, LoopList);
674 return processLoopList(LoopList);
675 }
676
677 bool run(LoopNest &LN) {
678 SmallVector<Loop *, 8> LoopList(LN.getLoops());
679 for (unsigned I = 1; I < LoopList.size(); ++I)
680 if (LoopList[I]->getParentLoop() != LoopList[I - 1])
681 return false;
682 return processLoopList(LoopList);
683 }
684
685 unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
686 // TODO: Add a better heuristic to select the loop to be interchanged based
687 // on the dependence matrix. Currently we select the innermost loop.
688 return LoopList.size() - 1;
689 }
690
691 bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
692 bool Changed = false;
693
694 // Ensure proper loop nest depth.
695 assert(hasSupportedLoopDepth(LoopList, *ORE) &&
696 "Unsupported depth of loop nest.");
697
698 unsigned LoopNestDepth = LoopList.size();
699
700 LLVM_DEBUG({
701 dbgs() << "Processing LoopList of size = " << LoopNestDepth
702 << " containing the following loops:\n";
703 for (auto *L : LoopList) {
704 dbgs() << " - ";
705 L->print(dbgs());
706 }
707 });
708
709 CharMatrix DependencyMatrix;
710 Loop *OuterMostLoop = *(LoopList.begin());
711 if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
712 OuterMostLoop, DI, SE, ORE)) {
713 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
714 return false;
715 }
716
717 LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n";
718 printDepMatrix(DependencyMatrix));
719
720 // Get the Outermost loop exit.
721 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
722 if (!LoopNestExit) {
723 LLVM_DEBUG(dbgs() << "OuterMostLoop '" << OuterMostLoop->getName()
724 << "' needs an unique exit block");
725 return false;
726 }
727
728 unsigned SelecLoopId = selectLoopForInterchange(LoopList);
729 CacheCostManager CCM(LoopList[0], AR, DI);
730 // We try to achieve the globally optimal memory access for the loopnest,
731 // and do interchange based on a bubble-sort fasion. We start from
732 // the innermost loop, move it outwards to the best possible position
733 // and repeat this process.
734 for (unsigned j = SelecLoopId; j > 0; j--) {
735 bool ChangedPerIter = false;
736 for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
737 bool Interchanged =
738 processLoop(LoopList, i, i - 1, DependencyMatrix, CCM);
739 ChangedPerIter |= Interchanged;
740 Changed |= Interchanged;
741 }
742 // Early abort if there was no interchange during an entire round of
743 // moving loops outwards.
744 if (!ChangedPerIter)
745 break;
746 }
747 return Changed;
748 }
749
750 bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
751 unsigned OuterLoopId,
752 std::vector<std::vector<char>> &DependencyMatrix,
753 CacheCostManager &CCM) {
754 Loop *OuterLoop = LoopList[OuterLoopId];
755 Loop *InnerLoop = LoopList[InnerLoopId];
756 LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
757 << " and OuterLoopId = " << OuterLoopId << "\n");
758 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE, DT);
759 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
760 LLVM_DEBUG(dbgs() << "Cannot prove legality, not interchanging loops '"
761 << OuterLoop->getName() << "' and '"
762 << InnerLoop->getName() << "'\n");
763 return false;
764 }
765 LLVM_DEBUG(dbgs() << "Loops '" << OuterLoop->getName() << "' and '"
766 << InnerLoop->getName()
767 << "' are legal to interchange\n");
768 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
769 if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
770 DependencyMatrix, CCM)) {
771 LLVM_DEBUG(dbgs() << "Interchanging loops '" << OuterLoop->getName()
772 << "' and '" << InnerLoop->getName()
773 << "' not profitable.\n");
774 return false;
775 }
776
777 ORE->emit([&]() {
778 return OptimizationRemark(DEBUG_TYPE, "Interchanged",
779 InnerLoop->getStartLoc(),
780 InnerLoop->getHeader())
781 << "Loop interchanged with enclosing loop.";
782 });
783
784 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
785 LIT.transform(LIL.getHasNoWrapReductions(), LIL.getHasNoInfInsts());
786 LLVM_DEBUG(dbgs() << "Loops interchanged: outer loop '"
787 << OuterLoop->getName() << "' and inner loop '"
788 << InnerLoop->getName() << "'\n");
789 LoopsInterchanged++;
790
791 llvm::formLCSSARecursively(*OuterLoop, *DT, LI, SE);
792
793 // Loops interchanged, update LoopList accordingly.
794 std::swap(LoopList[OuterLoopId], LoopList[InnerLoopId]);
795 // Update the DependencyMatrix
796 interChangeDependencies(DependencyMatrix, InnerLoopId, OuterLoopId);
797
798 LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
799 printDepMatrix(DependencyMatrix));
800
801 return true;
802 }
803};
804
805} // end anonymous namespace
806
807bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB,
808 Instruction *Skip) {
809 return any_of(*BB, [Skip](const Instruction &I) {
810 if (&I == Skip)
811 return false;
812 return I.mayHaveSideEffects() || I.mayReadFromMemory();
813 });
814}
815
816bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
817 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
818 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
819 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
820
821 LLVM_DEBUG(dbgs() << "Checking if loops '" << OuterLoop->getName()
822 << "' and '" << InnerLoop->getName()
823 << "' are tightly nested\n");
824
825 // In a perfectly nested loop the outer header branches only into the inner
826 // loop. If it can also reach the outer latch, it conditionally guards the
827 // inner loop (an imperfect nest), so the inner loop runs on only a subset of
828 // the outer iterations. Interchanging such a nest would run the inner loop on
829 // every outer iteration, including the guarded-off ones, which is illegal
830 // when the inner loop relies on the guard to terminate (e.g. an eq/ne exit
831 // whose trip count is degenerate once the guard is false). Reject by allowing
832 // the outer header to branch only into the inner loop.
833 //
834 // TODO: This is conservative. A guarded nest is still safe to interchange
835 // when the inner loop has a computable trip count that is empty exactly when
836 // the guard is false, e.g.:
837 // for (i = 0; i < N; i++)
838 // if (M > 0) // loop-invariant guard
839 // for (j = 0; j < M; j++) // empty when M <= 0
840 // A[j][i] = ...;
841 // Interchanging is legal here because the inner loop runs zero times on the
842 // guarded-off iterations.
843 for (BasicBlock *Succ : successors(OuterLoopHeader))
844 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader())
845 return false;
846
847 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
848
849 // The inner loop reduction pattern requires storing the LCSSA PHI in
850 // the OuterLoop Latch. Therefore, when reduction2Memory is enabled, skip
851 // that store during checks.
852 Instruction *Skip = nullptr;
853 assert(InnerReductions.size() <= 1 &&
854 "So far we only support at most one reduction.");
855 if (InnerReductions.size() == 1)
856 Skip = InnerReductions[0].LcssaStore;
857
858 // We do not have any basic block in between now make sure the outer header
859 // and outer loop latch doesn't contain any unsafe instructions.
860 if (containsUnsafeInstructions(OuterLoopHeader, Skip) ||
861 containsUnsafeInstructions(OuterLoopLatch, Skip))
862 return false;
863
864 // Also make sure the inner loop preheader does not contain any unsafe
865 // instructions. Note that all instructions in the preheader will be moved to
866 // the outer loop header when interchanging.
867 if (InnerLoopPreHeader != OuterLoopHeader &&
868 containsUnsafeInstructions(InnerLoopPreHeader, Skip))
869 return false;
870
871 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
872 // Ensure the inner loop exit block flows to the outer loop latch possibly
873 // through empty blocks.
874 const BasicBlock &SuccInner =
875 LoopNest::skipEmptyBlockUntil(InnerLoopExit, OuterLoopLatch);
876 if (&SuccInner != OuterLoopLatch) {
877 LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
878 << " does not lead to the outer loop latch.\n";);
879 return false;
880 }
881 // The inner loop exit block does flow to the outer loop latch and not some
882 // other BBs, now make sure it contains safe instructions, since it will be
883 // moved into the (new) inner loop after interchange.
884 if (containsUnsafeInstructions(InnerLoopExit, Skip))
885 return false;
886
887 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
888 // We have a perfect loop nest.
889 return true;
890}
891
892bool LoopInterchangeLegality::isLoopStructureUnderstood() {
893 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
894 for (PHINode *InnerInduction : InnerLoopInductions) {
895 unsigned Num = InnerInduction->getNumOperands();
896 for (unsigned i = 0; i < Num; ++i) {
897 Value *Val = InnerInduction->getOperand(i);
898 if (isa<Constant>(Val))
899 continue;
901 if (!I)
902 return false;
903 // TODO: Handle triangular loops.
904 // e.g. for(int i=0;i<N;i++)
905 // for(int j=i;j<N;j++)
906 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
907 if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
908 InnerLoopPreheader &&
909 !OuterLoop->isLoopInvariant(I)) {
910 return false;
911 }
912 }
913 }
914
915 // TODO: Handle triangular loops of another form.
916 // e.g. for(int i=0;i<N;i++)
917 // for(int j=0;j<i;j++)
918 // or,
919 // for(int i=0;i<N;i++)
920 // for(int j=0;j*i<N;j++)
921 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
922 CondBrInst *InnerLoopLatchBI =
923 dyn_cast<CondBrInst>(InnerLoopLatch->getTerminator());
924 if (!InnerLoopLatchBI)
925 return false;
926
927 CmpInst *InnerLoopCmp = dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition());
928 if (!InnerLoopCmp)
929 return false;
930
931 Value *Op0 = InnerLoopCmp->getOperand(0);
932 Value *Op1 = InnerLoopCmp->getOperand(1);
933
934 // LHS and RHS of the inner loop exit condition, e.g.,
935 // in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
936 Value *Left = nullptr;
937 Value *Right = nullptr;
938
939 // Check if V only involves inner loop induction variable.
940 // Return true if V is InnerInduction, or a cast from
941 // InnerInduction, or a binary operator that involves
942 // InnerInduction and a constant.
943 std::function<bool(Value *)> IsPathToInnerIndVar;
944 IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
945 if (llvm::is_contained(InnerLoopInductions, V))
946 return true;
947 if (isa<Constant>(V))
948 return true;
950 if (!I)
951 return false;
952 if (isa<CastInst>(I))
953 return IsPathToInnerIndVar(I->getOperand(0));
955 return IsPathToInnerIndVar(I->getOperand(0)) &&
956 IsPathToInnerIndVar(I->getOperand(1));
957 return false;
958 };
959
960 // In case of multiple inner loop indvars, it is okay if LHS and RHS
961 // are both inner indvar related variables.
962 if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
963 return true;
964
965 // Otherwise we check if the cmp instruction compares an inner indvar
966 // related variable (Left) with a outer loop invariant (Right).
967 if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {
968 Left = Op0;
969 Right = Op1;
970 } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {
971 Left = Op1;
972 Right = Op0;
973 }
974
975 if (Left == nullptr)
976 return false;
977
978 const SCEV *S = SE->getSCEV(Right);
979 if (!SE->isLoopInvariant(S, OuterLoop))
980 return false;
981
982 return true;
983}
984
985// If SV is a LCSSA PHI node with a single incoming value, return the incoming
986// value.
987static Value *followLCSSA(Value *SV) {
989 if (!PHI)
990 return SV;
991
992 if (PHI->getNumIncomingValues() != 1)
993 return SV;
994 return followLCSSA(PHI->getIncomingValue(0));
995}
996
998 SmallVectorImpl<Instruction *> &HasNoWrapInsts,
999 SmallVectorImpl<Instruction *> &HasNoInfInsts) {
1002 // Detect floating point reduction only when it can be reordered.
1003 if (RD.getExactFPMathInst() != nullptr)
1004 return false;
1005
1006 RecurKind RK = RD.getRecurrenceKind();
1007 switch (RK) {
1008 case RecurKind::Or:
1009 case RecurKind::And:
1010 case RecurKind::Xor:
1011 case RecurKind::SMin:
1012 case RecurKind::SMax:
1013 case RecurKind::UMin:
1014 case RecurKind::UMax:
1015 return true;
1016
1017 // Interchanging the loops that contain AnyOf reduction is not always legal.
1018 // Especially, when the result value of the AnyOf is not loop-invariant with
1019 // respect to the outer loop, interchanging may change the semantics. The
1020 // following is an example of such case:
1021 // int A = {{ 1, 0 }, { 0, 1 }};
1022 // int red = 0;
1023 // for (int i = 0; i < 2; i++)
1024 // for (int j = 0; j < 2; j++)
1025 // red = (A[j][i] == 0) ? i + 1 : red;
1026 //
1027 // TODO: We may be able to support interchanging loops with AnyOf reduction
1028 // by checking the operand of the reduction is loop-invariant with respect
1029 // to the outer loop as well.
1030 case RecurKind::AnyOf:
1031 return false;
1032
1033 // Changing the order of floating-point operations may alter the results. If
1034 // a certain instruction has the ninf flag, it means that reordering can
1035 // produce a poison value, which may lead to undefined behavior. To prevent
1036 // this, we must drop the ninf flags if we decide to apply the
1037 // transformation.
1038 case RecurKind::FAdd:
1039 case RecurKind::FMul:
1040 case RecurKind::FMin:
1041 case RecurKind::FMax:
1046 case RecurKind::FMulAdd:
1047 for (Instruction *I : RD.getReductionOpChain(PHI, L))
1048 if (isa<FPMathOperator>(I) && I->hasNoInfs())
1049 HasNoInfInsts.push_back(I);
1050 return true;
1051
1052 // Change the order of integer addition/multiplication may change the
1053 // semantics. Consider the following case:
1054 //
1055 // int A[2][2] = {{ INT_MAX, INT_MAX }, { INT_MIN, INT_MIN }};
1056 // int sum = 0;
1057 // for (int i = 0; i < 2; i++)
1058 // for (int j = 0; j < 2; j++)
1059 // sum += A[j][i];
1060 //
1061 // If the above loops are exchanged, the addition will cause an
1062 // overflow. To prevent this, we must drop the nuw/nsw flags from the
1063 // addition/multiplication instructions when we actually exchanges the
1064 // loops.
1065 case RecurKind::Add:
1066 case RecurKind::Mul: {
1067 unsigned OpCode = RecurrenceDescriptor::getOpcode(RK);
1069
1070 // Bail out when we fail to collect reduction instructions chain.
1071 if (Ops.empty())
1072 return false;
1073
1074 for (Instruction *I : Ops) {
1075 assert(I->getOpcode() == OpCode &&
1076 "Expected the instruction to be the reduction operation");
1077 (void)OpCode;
1078
1079 // If the instruction has nuw/nsw flags, we must drop them when the
1080 // transformation is actually performed.
1081 if (I->hasNoSignedWrap() || I->hasNoUnsignedWrap())
1082 HasNoWrapInsts.push_back(I);
1083 }
1084 return true;
1085 }
1086
1087 default:
1088 return false;
1089 }
1090 } else
1091 return false;
1092}
1093
1094// Check V's users to see if it is involved in a reduction in L.
1095static PHINode *
1097 SmallVectorImpl<Instruction *> &HasNoWrapInsts,
1098 SmallVectorImpl<Instruction *> &HasNoInfInsts) {
1099 // Reduction variables cannot be constants.
1100 if (isa<Constant>(V))
1101 return nullptr;
1102
1103 for (Value *User : V->users()) {
1105 if (PHI->getNumIncomingValues() == 1)
1106 continue;
1107
1108 if (checkReductionKind(L, PHI, HasNoWrapInsts, HasNoInfInsts))
1109 return PHI;
1110 else
1111 return nullptr;
1112 }
1113 }
1114
1115 return nullptr;
1116}
1117
1118bool LoopInterchangeLegality::isInnerReduction(
1119 Loop *L, PHINode *Phi, SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
1120
1121 // Only support reduction2Mem when the loop nest to be interchanged is
1122 // the innermost two loops.
1123 if (!L->isInnermost()) {
1124 LLVM_DEBUG(dbgs() << "Only supported when the loop is the innermost.\n");
1125 ORE->emit([&]() {
1126 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1127 L->getStartLoc(), L->getHeader())
1128 << "Only supported when the loop is the innermost.";
1129 });
1130 return false;
1131 }
1132
1133 if (Phi->getNumIncomingValues() != 2)
1134 return false;
1135
1136 Value *Init = Phi->getIncomingValueForBlock(L->getLoopPreheader());
1137 Value *Next = Phi->getIncomingValueForBlock(L->getLoopLatch());
1138
1139 // So far only supports constant initial value.
1140 if (!isa<Constant>(Init)) {
1141 LLVM_DEBUG(
1142 dbgs()
1143 << "Only supported for the reduction with a constant initial value.\n");
1144 ORE->emit([&]() {
1145 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1146 L->getStartLoc(), L->getHeader())
1147 << "Only supported for the reduction with a constant initial "
1148 "value.";
1149 });
1150 return false;
1151 }
1152
1153 // The reduction result must live in the inner loop.
1154 if (Instruction *I = dyn_cast<Instruction>(Next)) {
1155 BasicBlock *BB = I->getParent();
1156 if (!L->contains(BB))
1157 return false;
1158 }
1159
1160 // The reduction should have only one user.
1161 if (!Phi->hasOneUser())
1162 return false;
1163
1164 // Check the reduction kind.
1165 if (!checkReductionKind(L, Phi, HasNoWrapInsts, HasNoInfInsts))
1166 return false;
1167
1168 // Find lcssa_phi in OuterLoop's Latch
1169 BasicBlock *ExitBlock = L->getExitBlock();
1170 if (!ExitBlock)
1171 return false;
1172
1173 PHINode *Lcssa = NULL;
1174 for (auto *U : Next->users()) {
1175 if (auto *P = dyn_cast<PHINode>(U)) {
1176 if (P == Phi)
1177 continue;
1178
1179 if (Lcssa == NULL && P->getParent() == ExitBlock &&
1180 P->getIncomingValueForBlock(L->getLoopLatch()) == Next)
1181 Lcssa = P;
1182 else
1183 return false;
1184 } else
1185 return false;
1186 }
1187 if (!Lcssa)
1188 return false;
1189
1190 if (!Lcssa->hasOneUser()) {
1191 LLVM_DEBUG(dbgs() << "Only supported when the reduction is used once in "
1192 "the outer loop.\n");
1193 ORE->emit([&]() {
1194 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1195 L->getStartLoc(), L->getHeader())
1196 << "Only supported when the reduction is used once in the outer "
1197 "loop.";
1198 });
1199 return false;
1200 }
1201
1202 StoreInst *LcssaStore =
1204 if (!LcssaStore || LcssaStore->getParent() != ExitBlock)
1205 return false;
1206
1207 Value *MemRef = LcssaStore->getOperand(1);
1208 Type *ElemTy = LcssaStore->getOperand(0)->getType();
1209
1210 // LcssaStore stores the reduction result in BB.
1211 // When the reduction is initialized from a constant value, we need to load
1212 // from the memory object into the target basic block of the inner loop. This
1213 // means the memory reference was used prematurely. So we must ensure that the
1214 // memory reference does not dominate the target basic block.
1215 // TODO: Move the memory reference definition into the loop header.
1216 if (!DT->dominates(dyn_cast<Instruction>(MemRef), L->getHeader())) {
1217 LLVM_DEBUG(dbgs() << "Only supported when memory reference dominate "
1218 "the inner loop.\n");
1219 ORE->emit([&]() {
1220 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1221 L->getStartLoc(), L->getHeader())
1222 << "Only supported when memory reference dominate the inner "
1223 "loop.";
1224 });
1225 return false;
1226 }
1227
1228 // Found a reduction in the inner loop.
1229 InnerReduction SR;
1230 SR.Reduction = Phi;
1231 SR.Init = Init;
1232 SR.Next = Next;
1233 SR.LcssaPhi = Lcssa;
1234 SR.LcssaStore = LcssaStore;
1235 SR.MemRef = MemRef;
1236 SR.ElemTy = ElemTy;
1237
1238 InnerReductions.push_back(SR);
1239 return true;
1240}
1241
1242bool LoopInterchangeLegality::checkInductionsAndReductions(Loop *OuterLoop) {
1243 auto ChildLoop = [](Loop *L) {
1244 assert(L->getSubLoops().size() <= 1 &&
1245 "Expect at most one child loop for now.");
1246 return L->getSubLoops().empty() ? nullptr : L->getSubLoops().front();
1247 };
1248
1249 Loop *InnerLoop = ChildLoop(OuterLoop);
1250 for (Loop *CurLoop = OuterLoop; CurLoop; CurLoop = ChildLoop(CurLoop)) {
1251 for (PHINode &PHI : CurLoop->getHeader()->phis()) {
1252 InductionDescriptor ID;
1253 if (InductionDescriptor::isInductionPHI(&PHI, CurLoop, SE, ID)) {
1254 if (CurLoop == InnerLoop) {
1255 const SCEV *Step = ID.getStep();
1256 if (!SE->isLoopInvariant(Step, OuterLoop))
1257 return false;
1258 InnerLoopInductions.push_back(&PHI);
1259 }
1260 continue;
1261 }
1262
1263 if (CurLoop == OuterLoop) {
1264 // PHIs in inner loops need to be part of a reduction in the outer loop,
1265 assert(PHI.getNumIncomingValues() == 2 &&
1266 "Phis in loop header should have exactly 2 incoming values");
1267 // Check if we have a PHI node in the outer loop that has a reduction
1268 // result from the inner loop as an incoming value.
1269 Value *V = followLCSSA(
1270 PHI.getIncomingValueForBlock(OuterLoop->getLoopLatch()));
1271 PHINode *InnerRedPhi = findInnerReductionPhi(
1272 InnerLoop, V, HasNoWrapReductions, HasNoInfInsts);
1273
1274 // Reject if PHI has users other than InnerRedPhi. The typical case is
1275 // as follows:
1276 //
1277 // o.header:
1278 // %red.o = phi [ 0, ... ], [ %red.next, %o.latch ]
1279 // br label %i.header
1280 //
1281 // i.header:
1282 // %red.i = phi [ %red.o, %o.header ], [ %red.next, %i.latch ]
1283 // br label %i.body
1284 //
1285 // i.body:
1286 // store %red.o to %mem
1287 // ...
1288 //
1289 if (!InnerRedPhi ||
1290 !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI) ||
1291 !all_of(PHI.users(),
1292 [InnerRedPhi](User *U) { return U == InnerRedPhi; })) {
1293 LLVM_DEBUG(
1294 dbgs()
1295 << "Failed to recognize PHI as an induction or reduction.\n");
1296 ORE->emit([&]() {
1297 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
1298 OuterLoop->getStartLoc(),
1299 OuterLoop->getHeader())
1300 << "Only outer loops with induction or reduction PHI nodes "
1301 "can be interchanged currently.";
1302 });
1303 return false;
1304 }
1305
1306 OuterInnerReductions.insert(&PHI);
1307 OuterInnerReductions.insert(InnerRedPhi);
1308 } else {
1309 if (OuterInnerReductions.count(&PHI)) {
1310 LLVM_DEBUG(dbgs() << "Found a reduction across the outer loop.\n");
1311 } else if (EnableReduction2Memory &&
1312 isInnerReduction(CurLoop, &PHI, HasNoWrapReductions)) {
1313 LLVM_DEBUG(dbgs() << "Found a reduction in the inner loop: \n"
1314 << PHI << '\n');
1315 } else {
1316 ORE->emit([&]() {
1317 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
1318 CurLoop->getStartLoc(),
1319 CurLoop->getHeader())
1320 << "Only inner loops with induction or reduction PHI nodes "
1321 "can be interchanged currently.";
1322 });
1323 return false;
1324 }
1325 }
1326 }
1327
1328 // For now we only support at most one reduction.
1329 if (InnerReductions.size() > 1) {
1330 LLVM_DEBUG(dbgs() << "Only supports at most one reduction.\n");
1331 ORE->emit([&]() {
1332 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1333 CurLoop->getStartLoc(),
1334 CurLoop->getHeader())
1335 << "Only supports at most one reduction.";
1336 });
1337 return false;
1338 }
1339 }
1340
1341 return !InnerLoopInductions.empty();
1342}
1343
1344// This function indicates the current limitations in the transform as a result
1345// of which we do not proceed.
1346bool LoopInterchangeLegality::currentLimitations() {
1347 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1348
1349 // transform currently expects the loop latches to also be the exiting
1350 // blocks.
1351 if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
1352 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
1353 !isa<CondBrInst>(InnerLoopLatch->getTerminator()) ||
1354 !isa<CondBrInst>(OuterLoop->getLoopLatch()->getTerminator())) {
1355 LLVM_DEBUG(
1356 dbgs() << "Loops where the latch is not the exiting block are not"
1357 << " supported currently.\n");
1358 ORE->emit([&]() {
1359 return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
1360 OuterLoop->getStartLoc(),
1361 OuterLoop->getHeader())
1362 << "Loops where the latch is not the exiting block cannot be"
1363 " interchange currently.";
1364 });
1365 return true;
1366 }
1367
1368 // TODO: Triangular loops are not handled for now.
1369 if (!isLoopStructureUnderstood()) {
1370 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
1371 ORE->emit([&]() {
1372 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
1373 InnerLoop->getStartLoc(),
1374 InnerLoop->getHeader())
1375 << "Inner loop structure not understood currently.";
1376 });
1377 return true;
1378 }
1379
1380 return false;
1381}
1382
1383/// We currently only support LCSSA PHI nodes in the inner loop exit if their
1384/// users are either of the following:
1385///
1386/// - Reduction PHIs
1387/// - PHIs outside the outer loop
1388/// - PHIs belonging to the latch of the outer loop
1389///
1390/// These conditions mean that we are only interested in the final value after
1391/// the inner loop.
1392static bool
1395 PHINode *LcssaReduction) {
1396 BasicBlock *InnerExit = InnerL->getUniqueExitBlock();
1397 for (PHINode &PHI : InnerExit->phis()) {
1398 // The reduction LCSSA PHI will have only one incoming block, which comes
1399 // from the loop latch.
1400 if (PHI.getNumIncomingValues() > 1)
1401 return false;
1402 // The reduction LCSSA PHI's store user is rewritten by reduction2Memory();
1403 // skip its user-check but keep validating the remaining LCSSA PHIs.
1404 if (&PHI == LcssaReduction)
1405 continue;
1406 if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
1407 PHINode *PN = dyn_cast<PHINode>(U);
1408 if (!PN)
1409 return true;
1410 if (Reductions.count(PN))
1411 return false;
1412 BasicBlock *PB = PN->getParent();
1413 if (!OuterL->contains(PB))
1414 return false;
1415 return PB != OuterL->getLoopLatch();
1416 }))
1417 return false;
1418 }
1419 return true;
1420}
1421
1422// We currently support LCSSA PHI nodes in the outer loop exit, if their
1423// incoming values do not come from the outer loop latch or if the
1424// outer loop latch has a single predecessor. In that case, the value will
1425// be available if both the inner and outer loop conditions are true, which
1426// will still be true after interchanging. If we have multiple predecessor,
1427// that may not be the case, e.g. because the outer loop latch may be executed
1428// if the inner loop is not executed.
1429static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1430 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
1431 for (PHINode &PHI : LoopNestExit->phis()) {
1432 for (Value *Incoming : PHI.incoming_values()) {
1433 Instruction *IncomingI = dyn_cast<Instruction>(Incoming);
1434 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
1435 continue;
1436
1437 // The incoming value is defined in the outer loop latch. Currently we
1438 // only support that in case the outer loop latch has a single predecessor.
1439 // This guarantees that the outer loop latch is executed if and only if
1440 // the inner loop is executed (because tightlyNested() guarantees that the
1441 // outer loop header only branches to the inner loop or the outer loop
1442 // latch).
1443 // FIXME: We could weaken this logic and allow multiple predecessors,
1444 // if the values are produced outside the loop latch. We would need
1445 // additional logic to update the PHI nodes in the exit block as
1446 // well.
1447 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
1448 return false;
1449 }
1450 }
1451 return true;
1452}
1453
1454// In case of multi-level nested loops, it may occur that lcssa phis exist in
1455// the latch of InnerLoop, i.e., when defs of the incoming values are further
1456// inside the loopnest. Sometimes those incoming values are not available
1457// after interchange, since the original inner latch will become the new outer
1458// latch which may have predecessor paths that do not include those incoming
1459// values.
1460// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
1461// multi-level loop nests.
1462static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1463 if (InnerLoop->getSubLoops().empty())
1464 return true;
1465 // If the original outer latch has only one predecessor, then values defined
1466 // further inside the looploop, e.g., in the innermost loop, will be available
1467 // at the new outer latch after interchange.
1468 if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)
1469 return true;
1470
1471 // The outer latch has more than one predecessors, i.e., the inner
1472 // exit and the inner header.
1473 // PHI nodes in the inner latch are lcssa phis where the incoming values
1474 // are defined further inside the loopnest. Check if those phis are used
1475 // in the original inner latch. If that is the case then bail out since
1476 // those incoming values may not be available at the new outer latch.
1477 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1478 for (PHINode &PHI : InnerLoopLatch->phis()) {
1479 for (auto *U : PHI.users()) {
1481 if (InnerLoopLatch == UI->getParent())
1482 return false;
1483 }
1484 }
1485 return true;
1486}
1487
1488bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
1489 unsigned OuterLoopId,
1490 CharMatrix &DepMatrix) {
1491 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
1492 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
1493 << " and OuterLoopId = " << OuterLoopId
1494 << " due to dependence\n");
1495 ORE->emit([&]() {
1496 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
1497 InnerLoop->getStartLoc(),
1498 InnerLoop->getHeader())
1499 << "Cannot interchange loops due to dependences.";
1500 });
1501 return false;
1502 }
1503 // Check if outer and inner loop contain legal instructions only.
1504 for (auto *BB : OuterLoop->blocks())
1505 for (Instruction &I : *BB) {
1506 // Loads and stores are checked separately, so we can skip them here.
1508 continue;
1509
1510 // We cannot ignore potential memory reads, e.g., loads inside the called
1511 // function.
1512 if (!I.mayHaveSideEffects() && !I.mayReadFromMemory())
1513 continue;
1514
1515 LLVM_DEBUG(
1516 dbgs()
1517 << "Loops contain instructions that cannot be safely interchanged\n");
1518 ORE->emit([&]() {
1519 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsafeInst",
1520 I.getDebugLoc(), I.getParent())
1521 << "Cannot interchange loops due to instruction that is "
1522 "potentially unsafe to interchange.";
1523 });
1524
1525 return false;
1526 }
1527
1528 if (!checkInductionsAndReductions(OuterLoop)) {
1529 LLVM_DEBUG(dbgs() << "Failed to find inner loop inductions or found "
1530 "unsupported reductions.\n");
1531 return false;
1532 }
1533
1534 if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
1535 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
1536 ORE->emit([&]() {
1537 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI",
1538 InnerLoop->getStartLoc(),
1539 InnerLoop->getHeader())
1540 << "Cannot interchange loops because unsupported PHI nodes found "
1541 "in inner loop latch.";
1542 });
1543 return false;
1544 }
1545
1546 // TODO: The loops could not be interchanged due to current limitations in the
1547 // transform module.
1548 if (currentLimitations()) {
1549 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1550 return false;
1551 }
1552
1553 // Check if the loops are tightly nested.
1554 if (!tightlyNested(OuterLoop, InnerLoop)) {
1555 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1556 ORE->emit([&]() {
1557 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1558 InnerLoop->getStartLoc(),
1559 InnerLoop->getHeader())
1560 << "Cannot interchange loops because they are not tightly "
1561 "nested.";
1562 });
1563 return false;
1564 }
1565
1566 // The LCSSA PHI for the reduction has passed checks before; its user
1567 // is a store instruction.
1568 PHINode *LcssaReduction = nullptr;
1569 assert(InnerReductions.size() <= 1 &&
1570 "So far we only support at most one reduction.");
1571 if (InnerReductions.size() == 1)
1572 LcssaReduction = InnerReductions[0].LcssaPhi;
1573
1574 if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop, OuterInnerReductions,
1575 LcssaReduction)) {
1576 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1577 ORE->emit([&]() {
1578 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1579 InnerLoop->getStartLoc(),
1580 InnerLoop->getHeader())
1581 << "Found unsupported PHI node in loop exit.";
1582 });
1583 return false;
1584 }
1585
1586 if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1587 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1588 ORE->emit([&]() {
1589 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1590 OuterLoop->getStartLoc(),
1591 OuterLoop->getHeader())
1592 << "Found unsupported PHI node in loop exit.";
1593 });
1594 return false;
1595 }
1596
1597 if (any_of(OuterLoop->getLoopLatch()->phis(),
1598 [](PHINode &PHI) { return PHI.getNumIncomingValues() != 1; })) {
1599 LLVM_DEBUG(dbgs() << "Only outer loop latch PHI nodes with one incoming "
1600 "value are supported.\n");
1601 ORE->emit([&]() {
1602 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLatchPHI",
1603 OuterLoop->getStartLoc(),
1604 OuterLoop->getHeader())
1605 << "Only outer loop latch PHI nodes with one incoming value are "
1606 "supported.";
1607 });
1608 return false;
1609 }
1610
1611 // Regarding def-use chains that begin at an LCSSA PHI in the inner loop exit
1612 // and end at any instruction in the outer loop latch, we currently support
1613 // only the case where the chain contains only PHI nodes. Since we already
1614 // call `tightlyNested()`, we know that if there is a def-use chain that we
1615 // don't support (i.e., a chain that contains a non-PHI user), then the
1616 // non-PHI user must be in the outer loop latch.
1617 if (InnerLoop->getExitBlock() != OuterLoop->getLoopLatch())
1618 for (PHINode &PHI : OuterLoop->getLoopLatch()->phis())
1619 if (any_of(PHI.users(), [](const User *U) { return !isa<PHINode>(U); })) {
1620 LLVM_DEBUG(dbgs() << "Outer loop latch PHI has a non-PHI user.\n");
1621 ORE->emit([&]() {
1622 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLatchPHI",
1623 OuterLoop->getStartLoc(),
1624 OuterLoop->getHeader())
1625 << "Cannot interchange loops because an outer loop latch PHI "
1626 "node has a non-PHI user.";
1627 });
1628 return false;
1629 }
1630
1631 return true;
1632}
1633
1634void CacheCostManager::computeIfUnitinialized() {
1635 if (CC.has_value())
1636 return;
1637
1638 LLVM_DEBUG(dbgs() << "Compute CacheCost.\n");
1639 CC = CacheCost::getCacheCost(*OutermostLoop, *AR, *DI);
1640 // Obtain the loop vector returned from loop cache analysis beforehand,
1641 // and put each <Loop, index> pair into a map for constant time query
1642 // later. Indices in loop vector reprsent the optimal order of the
1643 // corresponding loop, e.g., given a loopnest with depth N, index 0
1644 // indicates the loop should be placed as the outermost loop and index N
1645 // indicates the loop should be placed as the innermost loop.
1646 //
1647 // For the old pass manager CacheCost would be null.
1648 if (*CC != nullptr)
1649 for (const auto &[Idx, Cost] : enumerate((*CC)->getLoopCosts()))
1650 CostMap[Cost.first] = Idx;
1651}
1652
1653CacheCost *CacheCostManager::getCacheCost() {
1654 computeIfUnitinialized();
1655 return CC->get();
1656}
1657
1658const DenseMap<const Loop *, unsigned> &CacheCostManager::getCostMap() {
1659 computeIfUnitinialized();
1660 return CostMap;
1661}
1662
1663/// If \S contains an affine addrec for \p L, return the step recurrence of it.
1664/// If \S is loop invariant with respect to \p L, return nullptr. Otherwise,
1665/// return std::nullopt, which indicates we cannot determine the coefficient of
1666/// the addrec for \p L in \S.
1667/// TODO: Handle more complex cases. Maybe using SCEVTraversal is a good way to
1668/// do that.
1669static std::optional<const SCEV *>
1672 if (!AR) {
1673 if (SE.isLoopInvariant(S, L))
1674 return nullptr;
1675 return std::nullopt;
1676 }
1677
1678 if (!AR->isAffine()) {
1679 LLVM_DEBUG(dbgs() << "Unexpected non-affine addrec\n");
1680 return std::nullopt;
1681 }
1682
1683 std::optional<const SCEV *> Coeff =
1684 getAddRecCoefficient(SE, AR->getStart(), L);
1685 if (!Coeff.has_value())
1686 return std::nullopt;
1687
1688 if (AR->getLoop() == L) {
1689 assert(!*Coeff && "Found more than one addrec for the same loop");
1690 Coeff = AR->getStepRecurrence(SE);
1691 }
1692 return Coeff;
1693}
1694
1695int LoopInterchangeProfitability::getInstrOrderCost() {
1696 SmallPtrSet<const SCEV *, 4> GoodBasePtrs, BadBasePtrs;
1697 for (BasicBlock *BB : InnerLoop->blocks()) {
1698 for (Instruction &Ins : *BB) {
1699 if (!isa<LoadInst, StoreInst>(&Ins))
1700 continue;
1701 const SCEV *Access = SE->getSCEV(getLoadStorePointerOperand(&Ins));
1702 const SCEV *BasePtr = SE->getPointerBase(Access);
1703 std::optional<const SCEV *> OuterCoeff =
1704 getAddRecCoefficient(*SE, Access, OuterLoop);
1705 std::optional<const SCEV *> InnerCoeff =
1706 getAddRecCoefficient(*SE, Access, InnerLoop);
1707
1708 if (!OuterCoeff.has_value() || !*OuterCoeff || !InnerCoeff.has_value() ||
1709 !*InnerCoeff)
1710 continue;
1711
1712 // This heuristic assumes that a smaller step recurrence implies that the
1713 // induction variable corresponding to the loop is used in the inner
1714 // dimension of the array. Placing such a loop in the inner position would
1715 // be beneficial in terms of locality. If the array access is of the form
1716 // like `A[3*i + 2*j]`, this heuristic may lead to an unprofitable
1717 // interchange, but we expect such cases to be rare.
1718 const SCEV *OuterStep = SE->getAbsExpr(*OuterCoeff, /*IsNSW=*/false);
1719 const SCEV *InnerStep = SE->getAbsExpr(*InnerCoeff, /*IsNSW=*/false);
1720 // If we find the inner induction after an outer induction e.g.
1721 //
1722 // for(int i=0;i<N;i++)
1723 // for(int j=0;j<N;j++)
1724 // A[i][j] = A[i-1][j-1]+k;
1725 //
1726 //
1727 // then it is a good order. If we find the outer induction after an inner
1728 // induction e.g.
1729 //
1730 // for(int i=0;i<N;i++)
1731 // for(int j=0;j<N;j++)
1732 // A[j][i] = A[j-1][i-1]+k;
1733 //
1734 // then it is a bad order.
1735 //
1736 // To avoid counting the same base pointers multiple times, we deduplicate
1737 // them by using a set of base pointers.
1738 if (SE->isKnownPredicate(ICmpInst::ICMP_SLT, InnerStep, OuterStep))
1739 GoodBasePtrs.insert(BasePtr);
1740 else if (SE->isKnownPredicate(ICmpInst::ICMP_SLT, OuterStep, InnerStep))
1741 BadBasePtrs.insert(BasePtr);
1742 }
1743 }
1744
1745 int GoodOrder = GoodBasePtrs.size();
1746 int BadOrder = BadBasePtrs.size();
1747 return GoodOrder - BadOrder;
1748}
1749
1750std::optional<bool>
1751LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1752 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC) {
1753 // This is the new cost model returned from loop cache analysis.
1754 // A smaller index means the loop should be placed an outer loop, and vice
1755 // versa.
1756 auto InnerLoopIt = CostMap.find(InnerLoop);
1757 if (InnerLoopIt == CostMap.end())
1758 return std::nullopt;
1759 auto OuterLoopIt = CostMap.find(OuterLoop);
1760 if (OuterLoopIt == CostMap.end())
1761 return std::nullopt;
1762
1763 if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))
1764 return std::nullopt;
1765 unsigned InnerIndex = InnerLoopIt->second;
1766 unsigned OuterIndex = OuterLoopIt->second;
1767 LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1768 << ", OuterIndex = " << OuterIndex << "\n");
1769 assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1770 "numbers to each loop");
1771 return std::optional<bool>(InnerIndex < OuterIndex);
1772}
1773
1774std::optional<bool>
1775LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
1776 // Legacy cost model: this is rough cost estimation algorithm. It counts the
1777 // good and bad order of induction variables in the instruction and allows
1778 // reordering if number of bad orders is more than good.
1779 int Cost = getInstrOrderCost();
1780 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1782 return std::optional<bool>(true);
1783
1784 return std::nullopt;
1785}
1786
1787/// Return true if we can vectorize the loop specified by \p LoopId.
1788static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
1789 for (const auto &Dep : DepMatrix) {
1790 char Dir = Dep[LoopId];
1791 char DepType = Dep.back();
1792 assert((DepType == '<' || DepType == '*') &&
1793 "Unexpected element in dependency vector");
1794
1795 // There are no loop-carried dependencies.
1796 if (Dir == '=' || Dir == 'I')
1797 continue;
1798
1799 // DepType being '<' means that this direction vector represents a forward
1800 // dependency. In principle, a loop with '<' direction can be vectorized in
1801 // this case.
1802 if (Dir == '<' && DepType == '<')
1803 continue;
1804
1805 // We cannot prove that the loop is vectorizable.
1806 return false;
1807 }
1808 return true;
1809}
1810
1811std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1812 unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1813 // If the outer loop cannot be vectorized, it is not profitable to move this
1814 // to inner position.
1815 if (!canVectorize(DepMatrix, OuterLoopId))
1816 return false;
1817
1818 // If the inner loop cannot be vectorized but the outer loop can be, then it
1819 // is profitable to interchange to enable inner loop parallelism.
1820 if (!canVectorize(DepMatrix, InnerLoopId))
1821 return true;
1822
1823 // If both the inner and the outer loop can be vectorized, it is necessary to
1824 // check the cost of each vectorized loop for profitability decision. At this
1825 // time we do not have a cost model to estimate them, so return nullopt.
1826 // TODO: Estimate the cost of vectorized loop when both the outer and the
1827 // inner loop can be vectorized.
1828 return std::nullopt;
1829}
1830
1831bool LoopInterchangeProfitability::isProfitable(
1832 const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1833 unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM) {
1834 // Do not consider loops with a backedge that isn't taken, e.g. an
1835 // unconditional branch true/false, as candidates for interchange.
1836 // TODO: when interchange is forced, we should probably also allow
1837 // interchange for these loops, and thus this logic should be moved just
1838 // below the cost-model ignore check below. But this check is done first
1839 // to avoid the issue in #163954.
1840 const SCEV *InnerBTC = SE->getBackedgeTakenCount(InnerLoop);
1841 const SCEV *OuterBTC = SE->getBackedgeTakenCount(OuterLoop);
1842 if (InnerBTC && InnerBTC->isZero()) {
1843 LLVM_DEBUG(dbgs() << "Inner loop back-edge isn't taken, rejecting "
1844 "single iteration loop\n");
1845 return false;
1846 }
1847 if (OuterBTC && OuterBTC->isZero()) {
1848 LLVM_DEBUG(dbgs() << "Outer loop back-edge isn't taken, rejecting "
1849 "single iteration loop\n");
1850 return false;
1851 }
1852
1853 // Return true if interchange is forced and the cost-model ignored.
1854 if (Profitabilities.size() == 1 && Profitabilities[0] == RuleTy::Ignore)
1855 return true;
1857 "Duplicate rules and option 'ignore' are not allowed");
1858
1859 // isProfitable() is structured to avoid endless loop interchange. If the
1860 // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
1861 // decide the profitability then, profitability check will stop and return the
1862 // analysis result. If it failed to determine it (e.g., cache analysis failed
1863 // to analyze the loopnest due to delinearization issues) then go ahead the
1864 // second highest priority rule (isProfitablePerInstrOrderCost by default).
1865 // Likewise, if it failed to analysis the profitability then only, the last
1866 // rule (isProfitableForVectorization by default) will decide.
1867 std::optional<bool> shouldInterchange;
1868 for (RuleTy RT : Profitabilities) {
1869 switch (RT) {
1870 case RuleTy::PerLoopCacheAnalysis: {
1871 CacheCost *CC = CCM.getCacheCost();
1872 const DenseMap<const Loop *, unsigned> &CostMap = CCM.getCostMap();
1873 shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
1874 break;
1875 }
1876 case RuleTy::PerInstrOrderCost:
1877 shouldInterchange = isProfitablePerInstrOrderCost();
1878 break;
1879 case RuleTy::ForVectorization:
1880 shouldInterchange =
1881 isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1882 break;
1883 case RuleTy::Ignore:
1884 llvm_unreachable("Option 'ignore' is not supported with other options");
1885 break;
1886 }
1887
1888 // If this rule could determine the profitability, don't call subsequent
1889 // rules.
1890 if (shouldInterchange.has_value())
1891 break;
1892 }
1893
1894 if (!shouldInterchange.has_value()) {
1895 ORE->emit([&]() {
1896 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1897 InnerLoop->getStartLoc(),
1898 InnerLoop->getHeader())
1899 << "Insufficient information to calculate the cost of loop for "
1900 "interchange.";
1901 });
1902 return false;
1903 } else if (!shouldInterchange.value()) {
1904 ORE->emit([&]() {
1905 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1906 InnerLoop->getStartLoc(),
1907 InnerLoop->getHeader())
1908 << "Interchanging loops is not considered to improve cache "
1909 "locality nor vectorization.";
1910 });
1911 return false;
1912 }
1913 return true;
1914}
1915
1916void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1917 Loop *InnerLoop) {
1918 for (Loop *L : *OuterLoop)
1919 if (L == InnerLoop) {
1920 OuterLoop->removeChildLoop(L);
1921 return;
1922 }
1923 llvm_unreachable("Couldn't find loop");
1924}
1925
1926/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1927/// new inner and outer loop after interchanging: NewInner is the original
1928/// outer loop and NewOuter is the original inner loop.
1929///
1930/// Before interchanging, we have the following structure
1931/// Outer preheader
1932// Outer header
1933// Inner preheader
1934// Inner header
1935// Inner body
1936// Inner latch
1937// outer bbs
1938// Outer latch
1939//
1940// After interchanging:
1941// Inner preheader
1942// Inner header
1943// Outer preheader
1944// Outer header
1945// Inner body
1946// outer bbs
1947// Outer latch
1948// Inner latch
1949void LoopInterchangeTransform::restructureLoops(
1950 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1951 BasicBlock *OrigOuterPreHeader) {
1952 Loop *OuterLoopParent = OuterLoop->getParentLoop();
1953 // The original inner loop preheader moves from the new inner loop to
1954 // the parent loop, if there is one.
1955 NewInner->removeBlockFromLoop(OrigInnerPreHeader);
1956 LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
1957
1958 // Switch the loop levels.
1959 if (OuterLoopParent) {
1960 // Remove the loop from its parent loop.
1961 removeChildLoop(OuterLoopParent, NewInner);
1962 removeChildLoop(NewInner, NewOuter);
1963 OuterLoopParent->addChildLoop(NewOuter);
1964 } else {
1965 removeChildLoop(NewInner, NewOuter);
1966 LI->changeTopLevelLoop(NewInner, NewOuter);
1967 }
1968 while (!NewOuter->isInnermost())
1969 NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
1970 NewOuter->addChildLoop(NewInner);
1971
1972 // BBs from the original inner loop.
1973 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
1974
1975 // Add BBs from the original outer loop to the original inner loop (excluding
1976 // BBs already in inner loop)
1977 for (BasicBlock *BB : NewInner->blocks())
1978 if (LI->getLoopFor(BB) == NewInner)
1979 NewOuter->addBlockEntry(BB);
1980
1981 // Now remove inner loop header and latch from the new inner loop and move
1982 // other BBs (the loop body) to the new inner loop.
1983 BasicBlock *OuterHeader = NewOuter->getHeader();
1984 BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1985 for (BasicBlock *BB : OrigInnerBBs) {
1986 // Nothing will change for BBs in child loops.
1987 if (LI->getLoopFor(BB) != NewOuter)
1988 continue;
1989 // Remove the new outer loop header and latch from the new inner loop.
1990 if (BB == OuterHeader || BB == OuterLatch)
1991 NewInner->removeBlockFromLoop(BB);
1992 else
1993 LI->changeLoopFor(BB, NewInner);
1994 }
1995
1996 // The preheader of the original outer loop becomes part of the new
1997 // outer loop.
1998 NewOuter->addBlockEntry(OrigOuterPreHeader);
1999 LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
2000
2001 // Tell SE that we move the loops around.
2002 SE->forgetLoop(NewOuter);
2003}
2004
2005/// User can write, or optimizers can generate the reduction for inner loop.
2006/// To make the interchange valid, apply Reduction2Mem by moving the
2007/// initializer and store instructions into the inner loop. So far we only
2008/// handle cases where the reduction variable is initialized to a constant.
2009/// For example, below code:
2010///
2011/// loop:
2012/// re = phi<0.0, next>
2013/// next = re op ...
2014/// endloop
2015/// reduc_sum = phi<next> // lcssa phi
2016/// MEM_REF[idx] = reduc_sum // LcssaStore
2017///
2018/// is transformed into:
2019///
2020/// loop:
2021/// tmp = MEM_REF[idx];
2022/// new_var = !first_iteration ? tmp : 0.0;
2023/// next = new_var op ...
2024/// MEM_REF[idx] = next; // after moving
2025/// endloop
2026///
2027/// In this way the initial const is used in the first iteration of loop.
2028void LoopInterchangeTransform::reduction2Memory() {
2030 LIL.getInnerReductions();
2031
2032 assert(InnerReductions.size() == 1 &&
2033 "So far we only support at most one reduction.");
2034
2035 LoopInterchangeLegality::InnerReduction SR = InnerReductions[0];
2036 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2037 IRBuilder<> Builder(&*(InnerLoopHeader->getFirstNonPHIIt()));
2038
2039 // Check if it's the first iteration.
2040 LLVMContext &Context = InnerLoopHeader->getContext();
2041 PHINode *FirstIter =
2042 Builder.CreatePHI(Type::getInt1Ty(Context), 2, "first.iter");
2043 FirstIter->addIncoming(ConstantInt::get(Type::getInt1Ty(Context), 1),
2044 InnerLoop->getLoopPreheader());
2045 FirstIter->addIncoming(ConstantInt::get(Type::getInt1Ty(Context), 0),
2046 InnerLoop->getLoopLatch());
2047 assert(FirstIter->isComplete() && "The FirstIter PHI node is not complete.");
2048
2049 // When the reduction is initialized from a constant value, we need to add
2050 // a stmt loading from the memory object to target basic block in inner
2051 // loop.
2052 Instruction *LoadMem = Builder.CreateLoad(SR.ElemTy, SR.MemRef);
2053
2054 // Init new_var to MEM_REF or CONST depending on if it is the first iteration.
2055 Value *NewVar = Builder.CreateSelect(FirstIter, SR.Init, LoadMem, "new.var");
2056
2057 // Replace all uses of the reduction variable with a new variable.
2058 SR.Reduction->replaceAllUsesWith(NewVar);
2059
2060 // Move store instruction into inner loop, just after reduction next's
2061 // definition.
2062 SR.LcssaStore->setOperand(0, SR.Next);
2063 SR.LcssaStore->moveAfter(dyn_cast<Instruction>(SR.Next));
2064}
2065
2066bool LoopInterchangeTransform::transform(
2067 ArrayRef<Instruction *> DropNoWrapInsts,
2068 ArrayRef<Instruction *> DropNoInfInsts) {
2069 bool Transformed = false;
2070
2072 LIL.getInnerReductions();
2073 if (InnerReductions.size() == 1)
2074 reduction2Memory();
2075
2076 LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
2077 auto &InductionPHIs = LIL.getInnerLoopInductions();
2078 if (InductionPHIs.empty()) {
2079 LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
2080 return false;
2081 }
2082
2083 SmallVector<Instruction *, 8> InnerIndexVarList;
2084 for (PHINode *CurInductionPHI : InductionPHIs) {
2085 Instruction *IncomingValue = dyn_cast<Instruction>(
2086 CurInductionPHI->getIncomingValueForBlock(InnerLoop->getLoopLatch()));
2087 assert(IncomingValue &&
2088 "Incoming value from loop latch isn't an instruction");
2089 if (is_contained(InductionPHIs, IncomingValue))
2090 continue;
2091 InnerIndexVarList.push_back(IncomingValue);
2092 }
2093
2094 // Create a new latch block for the inner loop. We split at the
2095 // current latch's terminator and then move the condition and all
2096 // operands that are not either loop-invariant or the induction PHI into the
2097 // new latch block.
2098 BasicBlock *NewLatch =
2099 SplitBlock(InnerLoop->getLoopLatch(),
2100 InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
2101
2102 SmallSetVector<Instruction *, 4> WorkList;
2103 unsigned i = 0;
2104 auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
2105 for (; i < WorkList.size(); i++) {
2106 // Duplicate instruction and move it to the new latch. Update uses that
2107 // have been moved.
2108 Instruction *NewI = WorkList[i]->clone();
2109 NewI->insertBefore(NewLatch->getFirstNonPHIIt());
2110 assert(!NewI->mayHaveSideEffects() &&
2111 "Moving instructions with side-effects may change behavior of "
2112 "the loop nest!");
2113 for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
2114 Instruction *UserI = cast<Instruction>(U.getUser());
2115 if (!InnerLoop->contains(UserI->getParent()) ||
2116 UserI->getParent() == NewLatch ||
2117 llvm::is_contained(InductionPHIs, UserI))
2118 U.set(NewI);
2119 }
2120 // Add operands of moved instruction to the worklist, except if they are
2121 // outside the inner loop or are the induction PHI.
2122 for (Value *Op : WorkList[i]->operands()) {
2124 if (!OpI || this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
2125 llvm::is_contained(InductionPHIs, OpI))
2126 continue;
2127 WorkList.insert(OpI);
2128 }
2129 }
2130 };
2131
2132 // FIXME: Should we interchange when we have a constant condition?
2135 ->getCondition());
2136 if (CondI)
2137 WorkList.insert(CondI);
2138 MoveInstructions();
2139 for (Instruction *InnerIndexVar : InnerIndexVarList)
2140 WorkList.insert(cast<Instruction>(InnerIndexVar));
2141 MoveInstructions();
2142
2143 // Ensure the inner loop phi nodes have a separate basic block.
2144 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2145 if (&*InnerLoopHeader->getFirstNonPHIIt() !=
2146 InnerLoopHeader->getTerminator()) {
2147 SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHIIt(), DT, LI);
2148 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
2149 }
2150
2151 // Instructions in the original inner loop preheader may depend on values
2152 // defined in the outer loop header. Move them there, because the original
2153 // inner loop preheader will become the entry into the interchanged loop nest.
2154 // Currently we move all instructions and rely on LICM to move invariant
2155 // instructions outside the loop nest.
2156 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2157 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2158
2159 if (InnerLoopPreHeader != OuterLoopHeader) {
2160 // Eliminate PHIs in the inner-loop preheader.
2161 for (PHINode &P : make_early_inc_range(InnerLoopPreHeader->phis())) {
2162 assert(P.getNumIncomingValues() == 1 &&
2163 "Expected single-incoming PHIs in inner loop preheader");
2164 P.replaceAllUsesWith(P.getIncomingValue(0));
2165 P.eraseFromParent();
2166 }
2167 for (Instruction &I :
2168 make_early_inc_range(make_range(InnerLoopPreHeader->begin(),
2169 std::prev(InnerLoopPreHeader->end()))))
2170 I.moveBeforePreserving(OuterLoopHeader->getTerminator()->getIterator());
2171 }
2172
2173 Transformed |= adjustLoopLinks();
2174 if (!Transformed) {
2175 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
2176 return false;
2177 }
2178
2179 // Finally, drop the nsw/nuw/ninf flags from the instructions for reduction
2180 // calculations.
2181 for (Instruction *Reduction : DropNoWrapInsts) {
2182 Reduction->setHasNoSignedWrap(false);
2183 Reduction->setHasNoUnsignedWrap(false);
2184 }
2185 for (Instruction *I : DropNoInfInsts)
2186 I->setHasNoInfs(false);
2187
2188 return true;
2189}
2190
2191/// \brief Move all instructions except the terminator from FromBB right before
2192/// InsertBefore
2193static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
2194 BasicBlock *ToBB = InsertBefore->getParent();
2195
2196 ToBB->splice(InsertBefore->getIterator(), FromBB, FromBB->begin(),
2197 FromBB->getTerminator()->getIterator());
2198}
2199
2200/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
2201static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
2202 // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
2203 // from BB1 afterwards.
2204 auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
2205 SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
2206 for (Instruction *I : TempInstrs)
2207 I->removeFromParent();
2208
2209 // Move instructions from BB2 to BB1.
2210 moveBBContents(BB2, BB1->getTerminator());
2211
2212 // Move instructions from TempInstrs to BB2.
2213 for (Instruction *I : TempInstrs)
2214 I->insertBefore(BB2->getTerminator()->getIterator());
2215}
2216
2217// Update BI to jump to NewBB instead of OldBB. Records updates to the
2218// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
2219// \p OldBB is exactly once in BI's successor list.
2220static void updateSuccessor(Instruction *Term, BasicBlock *OldBB,
2221 BasicBlock *NewBB,
2222 std::vector<DominatorTree::UpdateType> &DTUpdates,
2223 bool MustUpdateOnce = true) {
2224 assert((!MustUpdateOnce || llvm::count(successors(Term), OldBB) == 1) &&
2225 "BI must jump to OldBB exactly once.");
2226 bool Changed = false;
2227 for (Use &Op : Term->operands())
2228 if (Op == OldBB) {
2229 Op.set(NewBB);
2230 Changed = true;
2231 }
2232
2233 if (Changed) {
2234 DTUpdates.push_back(
2235 {DominatorTree::UpdateKind::Insert, Term->getParent(), NewBB});
2236 DTUpdates.push_back(
2237 {DominatorTree::UpdateKind::Delete, Term->getParent(), OldBB});
2238 }
2239 assert(Changed && "Expected a successor to be updated");
2240}
2241
2242// Move Lcssa PHIs to the right place.
2243static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
2244 BasicBlock *InnerLatch, BasicBlock *OuterHeader,
2245 BasicBlock *OuterLatch, BasicBlock *OuterExit,
2246 Loop *InnerLoop, LoopInfo *LI) {
2247
2248 // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
2249 // defined either in the header or latch. Those blocks will become header and
2250 // latch of the new outer loop, and the only possible users can PHI nodes
2251 // in the exit block of the loop nest or the outer loop header (reduction
2252 // PHIs, in that case, the incoming value must be defined in the inner loop
2253 // header). We can just substitute the user with the incoming value and remove
2254 // the PHI.
2255 for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
2256 assert(P.getNumIncomingValues() == 1 &&
2257 "Only loops with a single exit are supported!");
2258
2259 Value *IncomingValue = P.getIncomingValueForBlock(InnerLatch);
2260 auto *IncI = dyn_cast<Instruction>(IncomingValue);
2261 if (!IncI) {
2262 // If the incoming value is not an instruction, it must be loop invariant.
2263 // In that case, we can just replace the PHI with the incoming value and
2264 // remove the PHI.
2265 assert(InnerLoop->isLoopInvariant(IncomingValue) &&
2266 "Expected non-instruction incoming value to be loop invariant");
2267 P.replaceAllUsesWith(IncomingValue);
2268 P.eraseFromParent();
2269 continue;
2270 }
2271
2272 // In case of multi-level nested loops, follow LCSSA to find the incoming
2273 // value defined from the innermost loop.
2274 auto *IncIInnerMost = dyn_cast<Instruction>(followLCSSA(IncI));
2275 // Skip phis when:
2276 // - they are not an instruction, e.g. incoming values are constants.
2277 // - Incomming values from the inner loop body, excluding the header and
2278 // latch.
2279 if (!IncIInnerMost || (IncIInnerMost->getParent() != InnerLatch &&
2280 IncIInnerMost->getParent() != InnerHeader))
2281 continue;
2282
2283 assert(all_of(P.users(),
2284 [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
2285 return (cast<PHINode>(U)->getParent() == OuterHeader &&
2286 IncI->getParent() == InnerHeader) ||
2287 cast<PHINode>(U)->getParent() == OuterExit;
2288 }) &&
2289 "Can only replace phis iff the uses are in the loop nest exit or "
2290 "the incoming value is defined in the inner header (it will "
2291 "dominate all loop blocks after interchanging)");
2292 P.replaceAllUsesWith(IncI);
2293 P.eraseFromParent();
2294 }
2295
2296 SmallVector<PHINode *, 8> LcssaInnerExit(
2297 llvm::make_pointer_range(InnerExit->phis()));
2298
2299 SmallVector<PHINode *, 8> LcssaInnerLatch(
2300 llvm::make_pointer_range(InnerLatch->phis()));
2301
2302 // Lcssa PHIs for values used outside the inner loop are in InnerExit.
2303 // If a PHI node has users outside of InnerExit, it has a use outside the
2304 // interchanged loop and we have to preserve it. We move these to
2305 // InnerLatch, which will become the new exit block for the innermost
2306 // loop after interchanging.
2307 for (PHINode *P : LcssaInnerExit)
2308 P->moveBefore(InnerLatch->getFirstNonPHIIt());
2309
2310 // If the inner loop latch contains LCSSA PHIs, those come from a child loop
2311 // and we have to move them to the new inner latch.
2312 for (PHINode *P : LcssaInnerLatch)
2313 P->moveBefore(InnerExit->getFirstNonPHIIt());
2314
2315 // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
2316 // incoming values defined in the outer loop, we have to add a new PHI
2317 // in the inner loop latch, which became the exit block of the outer loop,
2318 // after interchanging.
2319 if (OuterExit) {
2320 for (PHINode &P : OuterExit->phis()) {
2321 if (P.getNumIncomingValues() != 1)
2322 continue;
2323 // Skip Phis with incoming values defined in the inner loop. Those should
2324 // already have been updated.
2325 auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
2326 if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)
2327 continue;
2328
2329 PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
2330 NewPhi->setIncomingValue(0, P.getIncomingValue(0));
2331 NewPhi->setIncomingBlock(0, OuterLatch);
2332 // We might have incoming edges from other BBs, i.e., the original outer
2333 // header.
2334 for (auto *Pred : predecessors(InnerLatch)) {
2335 if (Pred == OuterLatch)
2336 continue;
2337 NewPhi->addIncoming(P.getIncomingValue(0), Pred);
2338 }
2339 NewPhi->insertBefore(InnerLatch->getFirstNonPHIIt());
2340 P.setIncomingValue(0, NewPhi);
2341 }
2342 }
2343
2344 // Now adjust the incoming blocks for the LCSSA PHIs.
2345 // For PHIs moved from Inner's exit block, we need to replace Inner's latch
2346 // with the new latch.
2347 InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
2348}
2349
2350/// This deals with a corner case when a LCSSA phi node appears in a non-exit
2351/// block: the outer loop latch block does not need to be exit block of the
2352/// inner loop. Consider a loop that was in LCSSA form, but then some
2353/// transformation like loop-unswitch comes along and creates an empty block,
2354/// where BB5 in this example is the outer loop latch block:
2355///
2356/// BB4:
2357/// br label %BB5
2358/// BB5:
2359/// %old.cond.lcssa = phi i16 [ %cond, %BB4 ]
2360/// br outer.header
2361///
2362/// Interchange then brings it in LCSSA form again resulting in this chain of
2363/// single-input phi nodes:
2364///
2365/// BB4:
2366/// %new.cond.lcssa = phi i16 [ %cond, %BB3 ]
2367/// br label %BB5
2368/// BB5:
2369/// %old.cond.lcssa = phi i16 [ %new.cond.lcssa, %BB4 ]
2370///
2371/// The problem is that interchange can reoder blocks BB4 and BB5 placing the
2372/// use before the def if we don't check this. The solution is to simplify
2373/// lcssa phi nodes (remove) if they appear in non-exit blocks.
2374///
2375static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop) {
2376 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
2377 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2378
2379 // Do not modify lcssa phis where they actually belong, i.e. in exit blocks.
2380 if (OuterLoopLatch == InnerLoopExit)
2381 return;
2382
2383 // Collect and remove phis in non-exit blocks if they have 1 input.
2385 llvm::make_pointer_range(OuterLoopLatch->phis()));
2386 for (PHINode *Phi : Phis) {
2387 assert(Phi->getNumIncomingValues() == 1 && "Single input phi expected");
2388 LLVM_DEBUG(dbgs() << "Removing 1-input phi in non-exit block: " << *Phi
2389 << "\n");
2390 Phi->replaceAllUsesWith(Phi->getIncomingValue(0));
2391 Phi->eraseFromParent();
2392 }
2393}
2394
2395bool LoopInterchangeTransform::adjustLoopBranches() {
2396 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
2397 std::vector<DominatorTree::UpdateType> DTUpdates;
2398
2399 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2400 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2401
2402 assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
2403 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
2404 InnerLoopPreHeader && "Guaranteed by loop-simplify form");
2405
2406 simplifyLCSSAPhis(OuterLoop, InnerLoop);
2407
2408 // Ensure that both preheaders do not contain PHI nodes and have single
2409 // predecessors. This allows us to move them easily. We use
2410 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
2411 // preheaders do not satisfy those conditions.
2412 if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
2413 !OuterLoopPreHeader->getUniquePredecessor())
2414 OuterLoopPreHeader =
2415 InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
2416 if (InnerLoopPreHeader == OuterLoop->getHeader())
2417 InnerLoopPreHeader =
2418 InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
2419
2420 // Adjust the loop preheader
2421 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2422 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2423 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
2424 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2425 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
2426 BasicBlock *InnerLoopLatchPredecessor =
2427 InnerLoopLatch->getUniquePredecessor();
2428 BasicBlock *InnerLoopLatchSuccessor;
2429 BasicBlock *OuterLoopLatchSuccessor;
2430
2431 CondBrInst *OuterLoopLatchBI =
2432 dyn_cast<CondBrInst>(OuterLoopLatch->getTerminator());
2433 CondBrInst *InnerLoopLatchBI =
2434 dyn_cast<CondBrInst>(InnerLoopLatch->getTerminator());
2435 Instruction *OuterLoopHeaderBI = OuterLoopHeader->getTerminator();
2436 Instruction *InnerLoopHeaderBI = InnerLoopHeader->getTerminator();
2437
2438 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
2439 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
2440 !InnerLoopHeaderBI)
2441 return false;
2442
2443 Instruction *InnerLoopLatchPredecessorBI =
2444 InnerLoopLatchPredecessor->getTerminator();
2445 Instruction *OuterLoopPredecessorBI = OuterLoopPredecessor->getTerminator();
2446
2447 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
2448 return false;
2449 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
2450 if (!InnerLoopHeaderSuccessor)
2451 return false;
2452
2453 // Adjust Loop Preheader and headers.
2454 // The branches in the outer loop predecessor and the outer loop header can
2455 // be unconditional branches or conditional branches with duplicates. Consider
2456 // this when updating the successors.
2457 updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
2458 InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
2459 // The outer loop header might or might not branch to the outer latch.
2460 // We are guaranteed to branch to the inner loop preheader.
2461 if (llvm::is_contained(successors(OuterLoopHeaderBI), OuterLoopLatch)) {
2462 // In this case the outerLoopHeader should branch to the InnerLoopLatch.
2463 updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch,
2464 DTUpdates,
2465 /*MustUpdateOnce=*/false);
2466 }
2467 updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
2468 InnerLoopHeaderSuccessor, DTUpdates,
2469 /*MustUpdateOnce=*/false);
2470
2471 // Adjust reduction PHI's now that the incoming block has changed.
2472 InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
2473 OuterLoopHeader);
2474
2475 updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
2476 OuterLoopPreHeader, DTUpdates);
2477
2478 // -------------Adjust loop latches-----------
2479 if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
2480 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
2481 else
2482 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
2483
2484 updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
2485 InnerLoopLatchSuccessor, DTUpdates);
2486
2487 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
2488 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
2489 else
2490 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
2491
2492 updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
2493 OuterLoopLatchSuccessor, DTUpdates);
2494 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
2495 DTUpdates);
2496
2497 DT->applyUpdates(DTUpdates);
2498 restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
2499 OuterLoopPreHeader);
2500
2501 moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
2502 OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),
2503 InnerLoop, LI);
2504 // For PHIs in the exit block of the outer loop, outer's latch has been
2505 // replaced by Inners'.
2506 OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
2507
2508 auto &OuterInnerReductions = LIL.getOuterInnerReductions();
2509 // Now update the reduction PHIs in the inner and outer loop headers.
2510 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
2511 for (PHINode &PHI : InnerLoopHeader->phis())
2512 if (OuterInnerReductions.contains(&PHI))
2513 InnerLoopPHIs.push_back(&PHI);
2514
2515 for (PHINode &PHI : OuterLoopHeader->phis())
2516 if (OuterInnerReductions.contains(&PHI))
2517 OuterLoopPHIs.push_back(&PHI);
2518
2519 // Now move the remaining reduction PHIs from outer to inner loop header and
2520 // vice versa. The PHI nodes must be part of a reduction across the inner and
2521 // outer loop and all the remains to do is and updating the incoming blocks.
2522 for (PHINode *PHI : OuterLoopPHIs) {
2523 LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
2524 PHI->moveBefore(InnerLoopHeader->getFirstNonPHIIt());
2525 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2526 }
2527 for (PHINode *PHI : InnerLoopPHIs) {
2528 LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
2529 PHI->moveBefore(OuterLoopHeader->getFirstNonPHIIt());
2530 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2531 }
2532
2533 // Update the incoming blocks for moved PHI nodes.
2534 OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
2535 OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
2536 InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
2537 InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
2538
2539 // Values defined in the outer loop header could be used in the inner loop
2540 // latch. In that case, we need to create LCSSA phis for them, because after
2541 // interchanging they will be defined in the new inner loop and used in the
2542 // new outer loop.
2543 SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
2544 for (Instruction &I :
2545 make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end())))
2546 MayNeedLCSSAPhis.push_back(&I);
2547 formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE);
2548
2549 return true;
2550}
2551
2552bool LoopInterchangeTransform::adjustLoopLinks() {
2553 // Adjust all branches in the inner and outer loop.
2554 bool Changed = adjustLoopBranches();
2555 if (Changed) {
2556 // We have interchanged the preheaders so we need to interchange the data in
2557 // the preheaders as well. This is because the content of the inner
2558 // preheader was previously executed inside the outer loop.
2559 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2560 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2561 swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
2562 }
2563 return Changed;
2564}
2565
2569 LPMUpdater &U) {
2570 Function &F = *LN.getParent();
2571 SmallVector<Loop *, 8> LoopList(LN.getLoops());
2572
2574
2575 // Ensure minimum depth of the loop nest to do the interchange.
2576 if (!hasSupportedLoopDepth(LoopList, ORE))
2577 return PreservedAnalyses::all();
2578 // Ensure computable loop nest.
2579 if (!isComputableLoopNest(&AR.SE, LoopList)) {
2580 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
2581 return PreservedAnalyses::all();
2582 }
2583
2584 ORE.emit([&]() {
2585 return OptimizationRemarkAnalysis(DEBUG_TYPE, "Dependence",
2588 << "Computed dependence info, invoking the transform.";
2589 });
2590
2591 DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
2592 if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN))
2593 return PreservedAnalyses::all();
2594 U.markLoopNestChanged(true);
2596}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file defines the StringMap class.
Rewrite undef for PHI
ReachingDefInfo InstSet InstSet & Ignore
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
DXIL Resource Access
#define DEBUG_TYPE
static void moveBBContents(BasicBlock &SourceBB, BasicBlock &TargetBB)
Move the contents of SourceBB to before the last instruction of TargetBB.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file defines the interface for the loop cache analysis.
SmallVector< Loop *, 4 > LoopVector
Definition LoopFuse.cpp:362
Loop::LoopBounds::Direction Direction
Definition LoopInfo.cpp:253
static cl::list< RuleTy > Profitabilities("loop-interchange-profitabilities", cl::MiscFlags::CommaSeparated, cl::Hidden, cl::desc("List of profitability heuristics to be used. They are applied in " "the given order"), cl::list_init< RuleTy >({RuleTy::PerInstrOrderCost, RuleTy::ForVectorization}), cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache", "Prioritize loop cache cost"), clEnumValN(RuleTy::PerInstrOrderCost, "instorder", "Prioritize the IVs order of each instruction"), clEnumValN(RuleTy::ForVectorization, "vectorize", "Prioritize vectorization"), clEnumValN(RuleTy::Ignore, "ignore", "Ignore profitability, force interchange (does not " "work with other options)")))
static cl::opt< int > LoopInterchangeCostThreshold("loop-interchange-threshold", cl::init(0), cl::Hidden, cl::desc("Interchange if you gain more than this number"))
static cl::opt< unsigned int > MinLoopNestDepth("loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden, cl::desc("Minimum depth of loop nest considered for the transform"))
static void updateSuccessor(Instruction *Term, BasicBlock *OldBB, BasicBlock *NewBB, std::vector< DominatorTree::UpdateType > &DTUpdates, bool MustUpdateOnce=true)
static cl::opt< bool > EnableReduction2Memory("loop-interchange-reduction-to-mem", cl::init(false), cl::Hidden, cl::desc("Support for the inner-loop reduction pattern."))
static bool isComputableLoopNest(ScalarEvolution *SE, ArrayRef< Loop * > LoopList)
static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop)
This deals with a corner case when a LCSSA phi node appears in a non-exit block: the outer loop latch...
static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, unsigned ToIndx)
static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, BasicBlock *InnerLatch, BasicBlock *OuterHeader, BasicBlock *OuterLatch, BasicBlock *OuterExit, Loop *InnerLoop, LoopInfo *LI)
static void printDepMatrix(CharMatrix &DepMatrix)
static cl::opt< unsigned int > MaxMemInstrRatio("loop-interchange-max-mem-instr-ratio", cl::init(4), cl::Hidden, cl::desc("Maximum number of load/store instructions squared in relation to " "the total number of instructions. Higher value may lead to more " "interchanges at the cost of compile-time"))
static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2)
Swap instructions between BB1 and BB2 but keep terminators intact.
static PHINode * findInnerReductionPhi(Loop *L, Value *V, SmallVectorImpl< Instruction * > &HasNoWrapInsts, SmallVectorImpl< Instruction * > &HasNoInfInsts)
static bool areInnerLoopExitPHIsSupported(Loop *OuterL, Loop *InnerL, SmallPtrSetImpl< PHINode * > &Reductions, PHINode *LcssaReduction)
We currently only support LCSSA PHI nodes in the inner loop exit if their users are either of the fol...
static cl::opt< unsigned int > MaxLoopNestDepth("loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden, cl::desc("Maximum depth of loop nest considered for the transform"))
static bool hasSupportedLoopDepth(ArrayRef< Loop * > LoopList, OptimizationRemarkEmitter &ORE)
static bool inThisOrder(const Instruction *Src, const Instruction *Dst)
Return true if Src appears before Dst in the same basic block.
static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId)
Return true if we can vectorize the loop specified by LoopId.
static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, unsigned InnerLoopId, unsigned OuterLoopId)
#define DEBUG_TYPE
static Value * followLCSSA(Value *SV)
static void populateWorklist(Loop &L, LoopVector &LoopList)
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L, DependenceInfo *DI, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
static std::optional< bool > isLexicographicallyPositive(ArrayRef< char > DV, unsigned Begin, unsigned End)
static bool checkReductionKind(Loop *L, PHINode *PHI, SmallVectorImpl< Instruction * > &HasNoWrapInsts, SmallVectorImpl< Instruction * > &HasNoInfInsts)
static std::optional< const SCEV * > getAddRecCoefficient(ScalarEvolution &SE, const SCEV *S, const Loop *L)
If \S contains an affine addrec for L, return the step recurrence of it.
static bool noDuplicateRulesAndIgnore(ArrayRef< RuleTy > Rules)
This file defines the interface for the loop nest analysis.
This header provides classes for managing a pipeline of passes over loops in LLVM IR.
loop Loop Strength Reduction
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
uint64_t IntrinsicInst * II
#define P(N)
This file contains some templates that are useful if you are working with the STL at all.
SmallVector< Value *, 8 > ValueVector
This file defines the SmallSet class.
This file defines the SmallVector class.
static bool isProfitable(const StableFunctionMap::StableFunctionEntries &SFS)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:119
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:185
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:461
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:530
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI void replacePhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block to refer to basic block New instead of basic block Old.
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:659
static LLVM_ABI std::unique_ptr< CacheCost > getCacheCost(Loop &Root, LoopStandardAnalysisResults &AR, DependenceInfo &DI, std::optional< unsigned > TRT=std::nullopt)
Create a CacheCost for the loop nest rooted by Root.
CacheCostTy getLoopCost(const Loop &L) const
Return the estimated cost of loop L if the given loop is part of the loop nest associated with this o...
Value * getCondition() const
BasicBlock * getSuccessor(unsigned i) const
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:225
iterator end()
Definition DenseMap.h:143
DependenceInfo - This class is the main dependence-analysis driver.
LLVM_ABI std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool UnderRuntimeAssumptions=false)
depends - Tests for a dependence between the Src and Dst instructions.
void applyUpdates(ArrayRef< UpdateType > Updates)
Inform the dominator tree about a sequence of CFG edge insertions and deletions and perform a batch u...
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static LLVM_ABI bool isInductionPHI(PHINode *Phi, const Loop *L, ScalarEvolution *SE, InductionDescriptor &D, ArrayRef< const SCEVPredicate * > NoWrapPreds={}, const SCEV *Expr=nullptr, SmallVectorImpl< Instruction * > *CastsToIgnore=nullptr)
Returns true if Phi is an induction in the loop L.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI void insertBefore(InstListType::iterator InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified position.
LLVM_ABI bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void removeBlockFromLoop(BlockT *BB)
This removes the specified basic block from the current loop, updating the Blocks as appropriate.
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
BlockT * getHeader() const
iterator_range< block_iterator > blocks() const
void addChildLoop(LoopT *NewChild)
Add the specified loop to be a child of this loop.
void addBlockEntry(BlockT *BB)
This adds a basic block directly to the basic block list.
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
iterator begin() const
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
LoopT * removeChildLoop(iterator I)
This removes the specified child from being a subloop of this loop.
void changeTopLevelLoop(LoopT *OldLoop, LoopT *NewLoop)
Replace the specified loop in the top-level loops list with the indicated loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
void changeLoopFor(const BlockT *BB, LoopT *L)
Change the top-level loop that contains BB to the specified loop.
This class represents a loop nest and can be used to query its properties.
static const BasicBlock & skipEmptyBlockUntil(const BasicBlock *From, const BasicBlock *End, bool CheckUniquePred=false)
Recursivelly traverse all empty 'single successor' basic blocks of From (if there are any).
ArrayRef< Loop * > getLoops() const
Get the loops in the nest.
Function * getParent() const
Return the function to which the loop-nest belongs.
Loop & getOutermostLoop() const
Return the outermost loop in the loop nest.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition LoopInfo.cpp:659
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition LoopInfo.cpp:67
StringRef getName() const
Definition LoopInfo.h:407
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
bool isComplete() const
If the PHI node is complete which means all of its parent's predecessors have incoming value in this ...
op_range incoming_values()
void setIncomingBlock(unsigned i, BasicBlock *BB)
void setIncomingValue(unsigned i, Value *V)
static unsigned getIncomingValueNumForOperand(unsigned i)
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Instruction * getExactFPMathInst() const
Returns 1st non-reassociative FP instruction in the PHI node's use-chain.
static LLVM_ABI bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, DemandedBits *DB=nullptr, AssumptionCache *AC=nullptr, DominatorTree *DT=nullptr, ScalarEvolution *SE=nullptr)
Returns true if Phi is a reduction in TheLoop.
LLVM_ABI SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
SCEVUse getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
The main scalar evolution driver.
LLVM_ABI const SCEV * getAbsExpr(const SCEV *Op, bool IsNSW)
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:128
std::pair< iterator, bool > try_emplace(StringRef Key, ArgsTy &&...Args)
Emplace a new element for the specified key into the map if the key isn't already in the map.
Definition StringMap.h:369
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
constexpr size_t size() const
Get the string size.
Definition StringRef.h:144
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:163
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:185
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
list_initializer< Ty > list_init(ArrayRef< Ty > Vals)
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI BasicBlock * InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
InsertPreheaderForLoop - Once we discover that a loop doesn't have a preheader, this method is called...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto successors(const MachineBasicBlock *BB)
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
LLVM_ABI bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition LCSSA.cpp:449
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
auto map_range(ContainerTy &&C, FuncTy F)
Return a range that applies F to the elements of C.
Definition STLExtras.h:365
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2025
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FMinimumNum
FP min with llvm.minimumnum semantics.
@ Or
Bitwise or logical OR of integers.
@ FMinimum
FP min with llvm.minimum semantics.
@ Mul
Product of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMaximum
FP max with llvm.maximum semantics.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ FMaximumNum
FP max with llvm.maximumnum semantics.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2011
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition LCSSA.cpp:308
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
auto predecessors(const MachineBasicBlock *BB)
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:368
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
LLVM_ABI PreservedAnalyses run(LoopNest &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...