LLVM 23.0.0git
LoopInterchange.cpp
Go to the documentation of this file.
1//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This Pass handles loop interchange transform.
10// This pass interchanges loops to provide a more cache-friendly memory access
11// patterns.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/STLExtras.h"
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/ADT/StringMap.h"
21#include "llvm/ADT/StringRef.h"
30#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/Dominators.h"
33#include "llvm/IR/Function.h"
34#include "llvm/IR/IRBuilder.h"
35#include "llvm/IR/InstrTypes.h"
36#include "llvm/IR/Instruction.h"
38#include "llvm/IR/User.h"
39#include "llvm/IR/Value.h"
42#include "llvm/Support/Debug.h"
49#include <cassert>
50#include <utility>
51#include <vector>
52
53using namespace llvm;
54
55#define DEBUG_TYPE "loop-interchange"
56
57STATISTIC(LoopsInterchanged, "Number of loops interchanged");
58
60 "loop-interchange-threshold", cl::init(0), cl::Hidden,
61 cl::desc("Interchange if you gain more than this number"));
62
64 "loop-interchange-max-mem-instr-ratio", cl::init(4), cl::Hidden,
65 cl::desc("Maximum number of load/store instructions squared in relation to "
66 "the total number of instructions. Higher value may lead to more "
67 "interchanges at the cost of compile-time"));
68
69namespace {
70
72
73/// A list of direction vectors. Each entry represents a direction vector
74/// corresponding to one or more dependencies existing in the loop nest. The
75/// length of all direction vectors is equal and is N + 1, where N is the depth
76/// of the loop nest. The first N elements correspond to the dependency
77/// direction of each N loops. The last one indicates whether this entry is
78/// forward dependency ('<') or not ('*'). The term "forward" aligns with what
79/// is defined in LoopAccessAnalysis.
80// TODO: Check if we can use a sparse matrix here.
81using CharMatrix = std::vector<std::vector<char>>;
82
83/// Types of rules used in profitability check.
84enum class RuleTy {
85 PerLoopCacheAnalysis,
86 PerInstrOrderCost,
87 ForVectorization,
88 Ignore
89};
90
91} // end anonymous namespace
92
93// Minimum loop depth supported.
95 "loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden,
96 cl::desc("Minimum depth of loop nest considered for the transform"));
97
98// Maximum loop depth supported.
100 "loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden,
101 cl::desc("Maximum depth of loop nest considered for the transform"));
102
103// We prefer cache cost to vectorization by default.
105 "loop-interchange-profitabilities", cl::MiscFlags::CommaSeparated,
107 cl::desc("List of profitability heuristics to be used. They are applied in "
108 "the given order"),
109 cl::list_init<RuleTy>({RuleTy::PerInstrOrderCost,
110 RuleTy::ForVectorization}),
111 cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache",
112 "Prioritize loop cache cost"),
113 clEnumValN(RuleTy::PerInstrOrderCost, "instorder",
114 "Prioritize the IVs order of each instruction"),
115 clEnumValN(RuleTy::ForVectorization, "vectorize",
116 "Prioritize vectorization"),
117 clEnumValN(RuleTy::Ignore, "ignore",
118 "Ignore profitability, force interchange (does not "
119 "work with other options)")));
120
121// Support for the inner-loop reduction pattern.
123 "loop-interchange-reduction-to-mem", cl::init(false), cl::Hidden,
124 cl::desc("Support for the inner-loop reduction pattern."));
125
126#ifndef NDEBUG
129 for (RuleTy Rule : Rules) {
130 if (!Set.insert(Rule).second)
131 return false;
132 if (Rule == RuleTy::Ignore)
133 return false;
134 }
135 return true;
136}
137
138static void printDepMatrix(CharMatrix &DepMatrix) {
139 for (auto &Row : DepMatrix) {
140 // Drop the last element because it is a flag indicating whether this is
141 // forward dependency or not, which doesn't affect the legality check.
142 for (char D : drop_end(Row))
143 LLVM_DEBUG(dbgs() << D << " ");
144 LLVM_DEBUG(dbgs() << "\n");
145 }
146}
147
148/// Return true if \p Src appears before \p Dst in the same basic block.
149/// Precondition: \p Src and \Dst are distinct instructions within the same
150/// basic block.
151static bool inThisOrder(const Instruction *Src, const Instruction *Dst) {
152 assert(Src->getParent() == Dst->getParent() && Src != Dst &&
153 "Expected Src and Dst to be different instructions in the same BB");
154
155 bool FoundSrc = false;
156 for (const Instruction &I : *(Src->getParent())) {
157 if (&I == Src) {
158 FoundSrc = true;
159 continue;
160 }
161 if (&I == Dst)
162 return FoundSrc;
163 }
164
165 llvm_unreachable("Dst not found");
166}
167#endif
168
169static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
170 Loop *L, DependenceInfo *DI,
171 ScalarEvolution *SE,
174
175 ValueVector MemInstr;
176 unsigned NumInsts = 0;
177
178 // For each block.
179 for (BasicBlock *BB : L->blocks()) {
180 // Scan the BB and collect legal loads and stores.
181 for (Instruction &I : *BB) {
182 if (!isa<Instruction>(I))
183 return false;
184 NumInsts++;
185 if (auto *Ld = dyn_cast<LoadInst>(&I)) {
186 if (!Ld->isSimple())
187 return false;
188 MemInstr.push_back(&I);
189 } else if (auto *St = dyn_cast<StoreInst>(&I)) {
190 if (!St->isSimple())
191 return false;
192 MemInstr.push_back(&I);
193 }
194 }
195 }
196
197 // To populate the dependence matrix, we perform dependence test for each pair
198 // of memory instructions, which has O(NumMemInstr^2) complexity. This implies
199 // that even if the number of memory instructions is small, the analysis can
200 // still be expensive if the most of the instructions in the loop are memory
201 // instructions. On the other hand, if the number of memory instructions is
202 // not small, but the loop is large (i.e., it contains many non-memory
203 // instructions), the analysis can still be affordable.
204 unsigned NumMemInstr = MemInstr.size();
205 LLVM_DEBUG(dbgs() << "Found " << NumMemInstr
206 << " Loads and Stores to analyze\n");
207 if (MaxMemInstrRatio * NumInsts < NumMemInstr * NumMemInstr) {
208 ORE->emit([&]() {
209 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop",
210 L->getStartLoc(), L->getHeader())
211 << "Number of loads/stores exceeded, the supported maximum can be "
212 "increased with option -loop-interchange-max-mem-instr-ratio.";
213 });
214 return false;
215 }
216 ValueVector::iterator I, IE, J, JE;
217
218 // Manage direction vectors that are already seen. Map each direction vector
219 // to an index of DepMatrix at which it is stored.
221
222 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
223 for (J = I, JE = MemInstr.end(); J != JE; ++J) {
224 std::vector<char> Dep;
227 // Ignore Input dependencies.
228 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
229 continue;
230 // Track Output, Flow, and Anti dependencies.
231 if (auto D = DI->depends(Src, Dst)) {
232 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
233 // If the direction vector is negative, normalize it to
234 // make it non-negative.
235 if (D->normalize(SE))
236 LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
237 LLVM_DEBUG(StringRef DepType =
238 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
239 dbgs() << "Found " << DepType
240 << " dependency between Src and Dst\n"
241 << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
242 unsigned Levels = D->getLevels();
243 char Direction;
244 for (unsigned II = 1; II <= Levels; ++II) {
245 // `DVEntry::LE` is converted to `*`. This is because `LE` means `<`
246 // or `=`, for which we don't have an equivalent representation, so
247 // that the conservative approximation is necessary. The same goes for
248 // `DVEntry::GE`.
249 // TODO: Use of fine-grained expressions allows for more accurate
250 // analysis.
251 unsigned Dir = D->getDirection(II);
252 if (Dir == Dependence::DVEntry::LT)
253 Direction = '<';
254 else if (Dir == Dependence::DVEntry::GT)
255 Direction = '>';
256 else if (Dir == Dependence::DVEntry::EQ)
257 Direction = '=';
258 else
259 Direction = '*';
260 Dep.push_back(Direction);
261 }
262
263 // If the Dependence object doesn't have any information, fill the
264 // dependency vector with '*'.
265 if (D->isConfused()) {
266 assert(Dep.empty() && "Expected empty dependency vector");
267 Dep.assign(Level, '*');
268 }
269
270 while (Dep.size() != Level) {
271 Dep.push_back('I');
272 }
273
274 // If all the elements of any direction vector have only '*', legality
275 // can't be proven. Exit early to save compile time.
276 if (all_of(Dep, equal_to('*'))) {
277 ORE->emit([&]() {
278 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
279 L->getStartLoc(), L->getHeader())
280 << "All loops have dependencies in all directions.";
281 });
282 return false;
283 }
284
285 // Test whether the dependency is forward or not.
286 bool IsKnownForward = true;
287 if (Src->getParent() != Dst->getParent()) {
288 // In general, when Src and Dst are in different BBs, the execution
289 // order of them within a single iteration is not guaranteed. Treat
290 // conservatively as not-forward dependency in this case.
291 IsKnownForward = false;
292 } else {
293 // Src and Dst are in the same BB. If they are the different
294 // instructions, Src should appear before Dst in the BB as they are
295 // stored to MemInstr in that order.
296 assert((Src == Dst || inThisOrder(Src, Dst)) &&
297 "Unexpected instructions");
298
299 // If the Dependence object is reversed (due to normalization), it
300 // represents the dependency from Dst to Src, meaning it is a backward
301 // dependency. Otherwise it should be a forward dependency.
302 bool IsReversed = D->getSrc() != Src;
303 if (IsReversed)
304 IsKnownForward = false;
305 }
306
307 // Initialize the last element. Assume forward dependencies only; it
308 // will be updated later if there is any non-forward dependency.
309 Dep.push_back('<');
310
311 // The last element should express the "summary" among one or more
312 // direction vectors whose first N elements are the same (where N is
313 // the depth of the loop nest). Hence we exclude the last element from
314 // the Seen map.
315 auto [Ite, Inserted] = Seen.try_emplace(
316 StringRef(Dep.data(), Dep.size() - 1), DepMatrix.size());
317
318 // Make sure we only add unique entries to the dependency matrix.
319 if (Inserted)
320 DepMatrix.push_back(Dep);
321
322 // If we cannot prove that this dependency is forward, change the last
323 // element of the corresponding entry. Since a `[... *]` dependency
324 // includes a `[... <]` dependency, we do not need to keep both and
325 // change the existing entry instead.
326 if (!IsKnownForward)
327 DepMatrix[Ite->second].back() = '*';
328 }
329 }
330 }
331
332 return true;
333}
334
335// A loop is moved from index 'from' to an index 'to'. Update the Dependence
336// matrix by exchanging the two columns.
337static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
338 unsigned ToIndx) {
339 for (auto &Row : DepMatrix)
340 std::swap(Row[ToIndx], Row[FromIndx]);
341}
342
343// Check if a direction vector is lexicographically positive. Return true if it
344// is positive, nullopt if it is "zero", otherwise false.
345// [Theorem] A permutation of the loops in a perfect nest is legal if and only
346// if the direction matrix, after the same permutation is applied to its
347// columns, has no ">" direction as the leftmost non-"=" direction in any row.
348static std::optional<bool>
349isLexicographicallyPositive(ArrayRef<char> DV, unsigned Begin, unsigned End) {
350 for (unsigned char Direction : DV.slice(Begin, End - Begin)) {
351 if (Direction == '<')
352 return true;
353 if (Direction == '>' || Direction == '*')
354 return false;
355 }
356 return std::nullopt;
357}
358
359// Checks if it is legal to interchange 2 loops.
360static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
361 unsigned InnerLoopId,
362 unsigned OuterLoopId) {
363 unsigned NumRows = DepMatrix.size();
364 std::vector<char> Cur;
365 // For each row check if it is valid to interchange.
366 for (unsigned Row = 0; Row < NumRows; ++Row) {
367 // Create temporary DepVector check its lexicographical order
368 // before and after swapping OuterLoop vs InnerLoop
369 Cur = DepMatrix[Row];
370
371 // If the surrounding loops already ensure that the direction vector is
372 // lexicographically positive, nothing within the loop will be able to break
373 // the dependence. In such a case we can skip the subsequent check.
374 if (isLexicographicallyPositive(Cur, 0, OuterLoopId) == true)
375 continue;
376
377 // Check if the direction vector is lexicographically positive (or zero)
378 // for both before/after exchanged. Ignore the last element because it
379 // doesn't affect the legality.
380 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
381 return false;
382 std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
383 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
384 return false;
385 }
386 return true;
387}
388
389static void populateWorklist(Loop &L, LoopVector &LoopList) {
390 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
391 << L.getHeader()->getParent()->getName() << " Loop: %"
392 << L.getHeader()->getName() << '\n');
393 assert(LoopList.empty() && "LoopList should initially be empty!");
394 Loop *CurrentLoop = &L;
395 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
396 while (!Vec->empty()) {
397 // The current loop has multiple subloops in it hence it is not tightly
398 // nested.
399 // Discard all loops above it added into Worklist.
400 if (Vec->size() != 1) {
401 LoopList = {};
402 return;
403 }
404
405 LoopList.push_back(CurrentLoop);
406 CurrentLoop = Vec->front();
407 Vec = &CurrentLoop->getSubLoops();
408 }
409 LoopList.push_back(CurrentLoop);
410}
411
414 unsigned LoopNestDepth = LoopList.size();
415 if (LoopNestDepth < MinLoopNestDepth || LoopNestDepth > MaxLoopNestDepth) {
416 LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth
417 << ", the supported range is [" << MinLoopNestDepth
418 << ", " << MaxLoopNestDepth << "].\n");
419 Loop *OuterLoop = LoopList.front();
420 ORE.emit([&]() {
421 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoopNestDepth",
422 OuterLoop->getStartLoc(),
423 OuterLoop->getHeader())
424 << "Unsupported depth of loop nest, the supported range is ["
425 << std::to_string(MinLoopNestDepth) << ", "
426 << std::to_string(MaxLoopNestDepth) << "].\n";
427 });
428 return false;
429 }
430 return true;
431}
432
434 ArrayRef<Loop *> LoopList) {
435 for (Loop *L : LoopList) {
436 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
437 if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
438 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
439 return false;
440 }
441 if (L->getNumBackEdges() != 1) {
442 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
443 return false;
444 }
445 if (!L->getExitingBlock()) {
446 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
447 return false;
448 }
449 }
450 return true;
451}
452
453namespace {
454
455/// LoopInterchangeLegality checks if it is legal to interchange the loop.
456class LoopInterchangeLegality {
457public:
458 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
459 OptimizationRemarkEmitter *ORE, DominatorTree *DT)
460 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), DT(DT), ORE(ORE) {}
461
462 /// Check if the loops can be interchanged.
463 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
464 CharMatrix &DepMatrix);
465
466 /// Discover induction PHIs in the header of \p L. Induction
467 /// PHIs are added to \p Inductions.
468 bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions);
469
470 /// Check if the loop structure is understood. We do not handle triangular
471 /// loops for now.
472 bool isLoopStructureUnderstood();
473
474 bool currentLimitations();
475
476 const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
477 return OuterInnerReductions;
478 }
479
480 const ArrayRef<PHINode *> getInnerLoopInductions() const {
481 return InnerLoopInductions;
482 }
483
484 ArrayRef<Instruction *> getHasNoWrapReductions() const {
485 return HasNoWrapReductions;
486 }
487
488 ArrayRef<Instruction *> getHasNoInfInsts() const { return HasNoInfInsts; }
489
490 /// Record reductions in the inner loop. Currently supported reductions:
491 /// - initialized from a constant.
492 /// - reduction PHI node has only one user.
493 /// - located in the innermost loop.
494 struct InnerReduction {
495 /// The reduction itself.
496 PHINode *Reduction;
497 Value *Init;
498 Value *Next;
499 /// The Lcssa PHI.
500 PHINode *LcssaPhi;
501 /// Store reduction result into memory object.
502 StoreInst *LcssaStore;
503 /// The memory Location.
504 Value *MemRef;
505 Type *ElemTy;
506 };
507
508 ArrayRef<InnerReduction> getInnerReductions() const {
509 return InnerReductions;
510 }
511
512private:
513 bool tightlyNested(Loop *Outer, Loop *Inner);
514 bool containsUnsafeInstructions(BasicBlock *BB, Instruction *Skip);
515
516 /// Discover induction and reduction PHIs in the header of \p L. Induction
517 /// PHIs are added to \p Inductions, reductions are added to
518 /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
519 /// to be passed as \p InnerLoop.
520 bool findInductionAndReductions(Loop *L,
521 SmallVector<PHINode *, 8> &Inductions,
522 Loop *InnerLoop);
523
524 /// Detect and record the reduction of the inner loop. Add them to
525 /// InnerReductions.
526 ///
527 /// innerloop:
528 /// Re = phi<0.0, Next>
529 /// Next = Re op ...
530 /// OuterLoopLatch:
531 /// Lcssa = phi<Next> ; lcssa phi
532 /// store Lcssa, MemRef ; LcssaStore
533 ///
534 bool isInnerReduction(Loop *L, PHINode *Phi,
535 SmallVectorImpl<Instruction *> &HasNoWrapInsts);
536
537 Loop *OuterLoop;
538 Loop *InnerLoop;
539
540 ScalarEvolution *SE;
541 DominatorTree *DT;
542
543 /// Interface to emit optimization remarks.
544 OptimizationRemarkEmitter *ORE;
545
546 /// Set of reduction PHIs taking part of a reduction across the inner and
547 /// outer loop.
548 SmallPtrSet<PHINode *, 4> OuterInnerReductions;
549
550 /// Set of inner loop induction PHIs
551 SmallVector<PHINode *, 8> InnerLoopInductions;
552
553 /// Hold instructions that have nuw/nsw flags and involved in reductions,
554 /// like integer addition/multiplication. Those flags must be dropped when
555 /// interchanging the loops.
556 SmallVector<Instruction *, 4> HasNoWrapReductions;
557
558 /// Hold instructions that have ninf flags and involved in reductions. Those
559 /// flags must be dropped when interchanging the loops.
560 SmallVector<Instruction *, 4> HasNoInfInsts;
561
562 /// Vector of reductions in the inner loop.
563 SmallVector<InnerReduction, 8> InnerReductions;
564};
565
566/// Manages information utilized by the profitability check for cache. The main
567/// purpose of this class is to delay the computation of CacheCost until it is
568/// actually needed.
569class CacheCostManager {
570 Loop *OutermostLoop;
571 LoopStandardAnalysisResults *AR;
572 DependenceInfo *DI;
573
574 /// CacheCost for \ref OutermostLoop. Once it is computed, it is cached. Note
575 /// that the result can be nullptr.
576 std::optional<std::unique_ptr<CacheCost>> CC;
577
578 /// Maps each loop to an index representing the optimal position within the
579 /// loop-nest, as determined by the cache cost analysis.
580 DenseMap<const Loop *, unsigned> CostMap;
581
582 void computeIfUnitinialized();
583
584public:
585 CacheCostManager(Loop *OutermostLoop, LoopStandardAnalysisResults *AR,
586 DependenceInfo *DI)
587 : OutermostLoop(OutermostLoop), AR(AR), DI(DI) {}
588 CacheCost *getCacheCost();
589 const DenseMap<const Loop *, unsigned> &getCostMap();
590};
591
592/// LoopInterchangeProfitability checks if it is profitable to interchange the
593/// loop.
594class LoopInterchangeProfitability {
595public:
596 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
597 OptimizationRemarkEmitter *ORE)
598 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
599
600 /// Check if the loop interchange is profitable.
601 bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
602 unsigned InnerLoopId, unsigned OuterLoopId,
603 CharMatrix &DepMatrix, CacheCostManager &CCM);
604
605private:
606 int getInstrOrderCost();
607 std::optional<bool> isProfitablePerLoopCacheAnalysis(
608 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC);
609 std::optional<bool> isProfitablePerInstrOrderCost();
610 std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
611 unsigned OuterLoopId,
612 CharMatrix &DepMatrix);
613 Loop *OuterLoop;
614 Loop *InnerLoop;
615
616 /// Scev analysis.
617 ScalarEvolution *SE;
618
619 /// Interface to emit optimization remarks.
620 OptimizationRemarkEmitter *ORE;
621};
622
623/// LoopInterchangeTransform interchanges the loop.
624class LoopInterchangeTransform {
625public:
626 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
627 LoopInfo *LI, DominatorTree *DT,
628 const LoopInterchangeLegality &LIL)
629 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
630
631 /// Interchange OuterLoop and InnerLoop.
632 bool transform(ArrayRef<Instruction *> DropNoWrapInsts,
633 ArrayRef<Instruction *> DropNoInfInsts);
634 void reduction2Memory();
635 void restructureLoops(Loop *NewInner, Loop *NewOuter,
636 BasicBlock *OrigInnerPreHeader,
637 BasicBlock *OrigOuterPreHeader);
638 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
639
640private:
641 bool adjustLoopLinks();
642 bool adjustLoopBranches();
643
644 Loop *OuterLoop;
645 Loop *InnerLoop;
646
647 /// Scev analysis.
648 ScalarEvolution *SE;
649
650 LoopInfo *LI;
651 DominatorTree *DT;
652
653 const LoopInterchangeLegality &LIL;
654};
655
656struct LoopInterchange {
657 ScalarEvolution *SE = nullptr;
658 LoopInfo *LI = nullptr;
659 DependenceInfo *DI = nullptr;
660 DominatorTree *DT = nullptr;
661 LoopStandardAnalysisResults *AR = nullptr;
662
663 /// Interface to emit optimization remarks.
664 OptimizationRemarkEmitter *ORE;
665
666 LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
667 DominatorTree *DT, LoopStandardAnalysisResults *AR,
668 OptimizationRemarkEmitter *ORE)
669 : SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE) {}
670
671 bool run(Loop *L) {
672 if (L->getParentLoop())
673 return false;
674 SmallVector<Loop *, 8> LoopList;
675 populateWorklist(*L, LoopList);
676 return processLoopList(LoopList);
677 }
678
679 bool run(LoopNest &LN) {
680 SmallVector<Loop *, 8> LoopList(LN.getLoops());
681 for (unsigned I = 1; I < LoopList.size(); ++I)
682 if (LoopList[I]->getParentLoop() != LoopList[I - 1])
683 return false;
684 return processLoopList(LoopList);
685 }
686
687 unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
688 // TODO: Add a better heuristic to select the loop to be interchanged based
689 // on the dependence matrix. Currently we select the innermost loop.
690 return LoopList.size() - 1;
691 }
692
693 bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
694 bool Changed = false;
695
696 // Ensure proper loop nest depth.
697 assert(hasSupportedLoopDepth(LoopList, *ORE) &&
698 "Unsupported depth of loop nest.");
699
700 unsigned LoopNestDepth = LoopList.size();
701
702 LLVM_DEBUG({
703 dbgs() << "Processing LoopList of size = " << LoopNestDepth
704 << " containing the following loops:\n";
705 for (auto *L : LoopList) {
706 dbgs() << " - ";
707 L->print(dbgs());
708 }
709 });
710
711 CharMatrix DependencyMatrix;
712 Loop *OuterMostLoop = *(LoopList.begin());
713 if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
714 OuterMostLoop, DI, SE, ORE)) {
715 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
716 return false;
717 }
718
719 LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n";
720 printDepMatrix(DependencyMatrix));
721
722 // Get the Outermost loop exit.
723 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
724 if (!LoopNestExit) {
725 LLVM_DEBUG(dbgs() << "OuterMostLoop '" << OuterMostLoop->getName()
726 << "' needs an unique exit block");
727 return false;
728 }
729
730 unsigned SelecLoopId = selectLoopForInterchange(LoopList);
731 CacheCostManager CCM(LoopList[0], AR, DI);
732 // We try to achieve the globally optimal memory access for the loopnest,
733 // and do interchange based on a bubble-sort fasion. We start from
734 // the innermost loop, move it outwards to the best possible position
735 // and repeat this process.
736 for (unsigned j = SelecLoopId; j > 0; j--) {
737 bool ChangedPerIter = false;
738 for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
739 bool Interchanged =
740 processLoop(LoopList, i, i - 1, DependencyMatrix, CCM);
741 ChangedPerIter |= Interchanged;
742 Changed |= Interchanged;
743 }
744 // Early abort if there was no interchange during an entire round of
745 // moving loops outwards.
746 if (!ChangedPerIter)
747 break;
748 }
749 return Changed;
750 }
751
752 bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
753 unsigned OuterLoopId,
754 std::vector<std::vector<char>> &DependencyMatrix,
755 CacheCostManager &CCM) {
756 Loop *OuterLoop = LoopList[OuterLoopId];
757 Loop *InnerLoop = LoopList[InnerLoopId];
758 LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
759 << " and OuterLoopId = " << OuterLoopId << "\n");
760 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE, DT);
761 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
762 LLVM_DEBUG(dbgs() << "Cannot prove legality, not interchanging loops '"
763 << OuterLoop->getName() << "' and '"
764 << InnerLoop->getName() << "'\n");
765 return false;
766 }
767 LLVM_DEBUG(dbgs() << "Loops '" << OuterLoop->getName() << "' and '"
768 << InnerLoop->getName()
769 << "' are legal to interchange\n");
770 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
771 if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
772 DependencyMatrix, CCM)) {
773 LLVM_DEBUG(dbgs() << "Interchanging loops '" << OuterLoop->getName()
774 << "' and '" << InnerLoop->getName()
775 << "' not profitable.\n");
776 return false;
777 }
778
779 ORE->emit([&]() {
780 return OptimizationRemark(DEBUG_TYPE, "Interchanged",
781 InnerLoop->getStartLoc(),
782 InnerLoop->getHeader())
783 << "Loop interchanged with enclosing loop.";
784 });
785
786 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
787 LIT.transform(LIL.getHasNoWrapReductions(), LIL.getHasNoInfInsts());
788 LLVM_DEBUG(dbgs() << "Loops interchanged: outer loop '"
789 << OuterLoop->getName() << "' and inner loop '"
790 << InnerLoop->getName() << "'\n");
791 LoopsInterchanged++;
792
793 llvm::formLCSSARecursively(*OuterLoop, *DT, LI, SE);
794
795 // Loops interchanged, update LoopList accordingly.
796 std::swap(LoopList[OuterLoopId], LoopList[InnerLoopId]);
797 // Update the DependencyMatrix
798 interChangeDependencies(DependencyMatrix, InnerLoopId, OuterLoopId);
799
800 LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
801 printDepMatrix(DependencyMatrix));
802
803 return true;
804 }
805};
806
807} // end anonymous namespace
808
809bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB,
810 Instruction *Skip) {
811 return any_of(*BB, [Skip](const Instruction &I) {
812 if (&I == Skip)
813 return false;
814 return I.mayHaveSideEffects() || I.mayReadFromMemory();
815 });
816}
817
818bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
819 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
820 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
821 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
822
823 LLVM_DEBUG(dbgs() << "Checking if loops '" << OuterLoop->getName()
824 << "' and '" << InnerLoop->getName()
825 << "' are tightly nested\n");
826
827 // In a perfectly nested loop the outer header branches only into the inner
828 // loop. If it can also reach the outer latch, it conditionally guards the
829 // inner loop (an imperfect nest), so the inner loop runs on only a subset of
830 // the outer iterations. Interchanging such a nest would run the inner loop on
831 // every outer iteration, including the guarded-off ones, which is illegal
832 // when the inner loop relies on the guard to terminate (e.g. an eq/ne exit
833 // whose trip count is degenerate once the guard is false). Reject by allowing
834 // the outer header to branch only into the inner loop.
835 //
836 // TODO: This is conservative. A guarded nest is still safe to interchange
837 // when the inner loop has a computable trip count that is empty exactly when
838 // the guard is false, e.g.:
839 // for (i = 0; i < N; i++)
840 // if (M > 0) // loop-invariant guard
841 // for (j = 0; j < M; j++) // empty when M <= 0
842 // A[j][i] = ...;
843 // Interchanging is legal here because the inner loop runs zero times on the
844 // guarded-off iterations.
845 for (BasicBlock *Succ : successors(OuterLoopHeader))
846 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader())
847 return false;
848
849 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
850
851 // The inner loop reduction pattern requires storing the LCSSA PHI in
852 // the OuterLoop Latch. Therefore, when reduction2Memory is enabled, skip
853 // that store during checks.
854 Instruction *Skip = nullptr;
855 assert(InnerReductions.size() <= 1 &&
856 "So far we only support at most one reduction.");
857 if (InnerReductions.size() == 1)
858 Skip = InnerReductions[0].LcssaStore;
859
860 // We do not have any basic block in between now make sure the outer header
861 // and outer loop latch doesn't contain any unsafe instructions.
862 if (containsUnsafeInstructions(OuterLoopHeader, Skip) ||
863 containsUnsafeInstructions(OuterLoopLatch, Skip))
864 return false;
865
866 // Also make sure the inner loop preheader does not contain any unsafe
867 // instructions. Note that all instructions in the preheader will be moved to
868 // the outer loop header when interchanging.
869 if (InnerLoopPreHeader != OuterLoopHeader &&
870 containsUnsafeInstructions(InnerLoopPreHeader, Skip))
871 return false;
872
873 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
874 // Ensure the inner loop exit block flows to the outer loop latch possibly
875 // through empty blocks.
876 const BasicBlock &SuccInner =
877 LoopNest::skipEmptyBlockUntil(InnerLoopExit, OuterLoopLatch);
878 if (&SuccInner != OuterLoopLatch) {
879 LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
880 << " does not lead to the outer loop latch.\n";);
881 return false;
882 }
883 // The inner loop exit block does flow to the outer loop latch and not some
884 // other BBs, now make sure it contains safe instructions, since it will be
885 // moved into the (new) inner loop after interchange.
886 if (containsUnsafeInstructions(InnerLoopExit, Skip))
887 return false;
888
889 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
890 // We have a perfect loop nest.
891 return true;
892}
893
894bool LoopInterchangeLegality::isLoopStructureUnderstood() {
895 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
896 for (PHINode *InnerInduction : InnerLoopInductions) {
897 unsigned Num = InnerInduction->getNumOperands();
898 for (unsigned i = 0; i < Num; ++i) {
899 Value *Val = InnerInduction->getOperand(i);
900 if (isa<Constant>(Val))
901 continue;
903 if (!I)
904 return false;
905 // TODO: Handle triangular loops.
906 // e.g. for(int i=0;i<N;i++)
907 // for(int j=i;j<N;j++)
908 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
909 if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
910 InnerLoopPreheader &&
911 !OuterLoop->isLoopInvariant(I)) {
912 return false;
913 }
914 }
915 }
916
917 // TODO: Handle triangular loops of another form.
918 // e.g. for(int i=0;i<N;i++)
919 // for(int j=0;j<i;j++)
920 // or,
921 // for(int i=0;i<N;i++)
922 // for(int j=0;j*i<N;j++)
923 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
924 CondBrInst *InnerLoopLatchBI =
925 dyn_cast<CondBrInst>(InnerLoopLatch->getTerminator());
926 if (!InnerLoopLatchBI)
927 return false;
928 if (CmpInst *InnerLoopCmp =
929 dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition())) {
930 Value *Op0 = InnerLoopCmp->getOperand(0);
931 Value *Op1 = InnerLoopCmp->getOperand(1);
932
933 // LHS and RHS of the inner loop exit condition, e.g.,
934 // in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
935 Value *Left = nullptr;
936 Value *Right = nullptr;
937
938 // Check if V only involves inner loop induction variable.
939 // Return true if V is InnerInduction, or a cast from
940 // InnerInduction, or a binary operator that involves
941 // InnerInduction and a constant.
942 std::function<bool(Value *)> IsPathToInnerIndVar;
943 IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
944 if (llvm::is_contained(InnerLoopInductions, V))
945 return true;
946 if (isa<Constant>(V))
947 return true;
949 if (!I)
950 return false;
951 if (isa<CastInst>(I))
952 return IsPathToInnerIndVar(I->getOperand(0));
954 return IsPathToInnerIndVar(I->getOperand(0)) &&
955 IsPathToInnerIndVar(I->getOperand(1));
956 return false;
957 };
958
959 // In case of multiple inner loop indvars, it is okay if LHS and RHS
960 // are both inner indvar related variables.
961 if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
962 return true;
963
964 // Otherwise we check if the cmp instruction compares an inner indvar
965 // related variable (Left) with a outer loop invariant (Right).
966 if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {
967 Left = Op0;
968 Right = Op1;
969 } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {
970 Left = Op1;
971 Right = Op0;
972 }
973
974 if (Left == nullptr)
975 return false;
976
977 const SCEV *S = SE->getSCEV(Right);
978 if (!SE->isLoopInvariant(S, OuterLoop))
979 return false;
980 }
981
982 return true;
983}
984
985// If SV is a LCSSA PHI node with a single incoming value, return the incoming
986// value.
987static Value *followLCSSA(Value *SV) {
989 if (!PHI)
990 return SV;
991
992 if (PHI->getNumIncomingValues() != 1)
993 return SV;
994 return followLCSSA(PHI->getIncomingValue(0));
995}
996
998 SmallVectorImpl<Instruction *> &HasNoWrapInsts,
999 SmallVectorImpl<Instruction *> &HasNoInfInsts) {
1002 // Detect floating point reduction only when it can be reordered.
1003 if (RD.getExactFPMathInst() != nullptr)
1004 return false;
1005
1006 RecurKind RK = RD.getRecurrenceKind();
1007 switch (RK) {
1008 case RecurKind::Or:
1009 case RecurKind::And:
1010 case RecurKind::Xor:
1011 case RecurKind::SMin:
1012 case RecurKind::SMax:
1013 case RecurKind::UMin:
1014 case RecurKind::UMax:
1015 case RecurKind::AnyOf:
1016 return true;
1017
1018 // Changing the order of floating-point operations may alter the results. If
1019 // a certain instruction has the ninf flag, it means that reordering can
1020 // produce a poison value, which may lead to undefined behavior. To prevent
1021 // this, we must drop the ninf flags if we decide to apply the
1022 // transformation.
1023 case RecurKind::FAdd:
1024 case RecurKind::FMul:
1025 case RecurKind::FMin:
1026 case RecurKind::FMax:
1031 case RecurKind::FMulAdd:
1032 for (Instruction *I : RD.getReductionOpChain(PHI, L))
1033 if (isa<FPMathOperator>(I) && I->hasNoInfs())
1034 HasNoInfInsts.push_back(I);
1035 return true;
1036
1037 // Change the order of integer addition/multiplication may change the
1038 // semantics. Consider the following case:
1039 //
1040 // int A[2][2] = {{ INT_MAX, INT_MAX }, { INT_MIN, INT_MIN }};
1041 // int sum = 0;
1042 // for (int i = 0; i < 2; i++)
1043 // for (int j = 0; j < 2; j++)
1044 // sum += A[j][i];
1045 //
1046 // If the above loops are exchanged, the addition will cause an
1047 // overflow. To prevent this, we must drop the nuw/nsw flags from the
1048 // addition/multiplication instructions when we actually exchanges the
1049 // loops.
1050 case RecurKind::Add:
1051 case RecurKind::Mul: {
1052 unsigned OpCode = RecurrenceDescriptor::getOpcode(RK);
1054
1055 // Bail out when we fail to collect reduction instructions chain.
1056 if (Ops.empty())
1057 return false;
1058
1059 for (Instruction *I : Ops) {
1060 assert(I->getOpcode() == OpCode &&
1061 "Expected the instruction to be the reduction operation");
1062 (void)OpCode;
1063
1064 // If the instruction has nuw/nsw flags, we must drop them when the
1065 // transformation is actually performed.
1066 if (I->hasNoSignedWrap() || I->hasNoUnsignedWrap())
1067 HasNoWrapInsts.push_back(I);
1068 }
1069 return true;
1070 }
1071
1072 default:
1073 return false;
1074 }
1075 } else
1076 return false;
1077}
1078
1079// Check V's users to see if it is involved in a reduction in L.
1080static PHINode *
1082 SmallVectorImpl<Instruction *> &HasNoWrapInsts,
1083 SmallVectorImpl<Instruction *> &HasNoInfInsts) {
1084 // Reduction variables cannot be constants.
1085 if (isa<Constant>(V))
1086 return nullptr;
1087
1088 for (Value *User : V->users()) {
1090 if (PHI->getNumIncomingValues() == 1)
1091 continue;
1092
1093 if (checkReductionKind(L, PHI, HasNoWrapInsts, HasNoInfInsts))
1094 return PHI;
1095 else
1096 return nullptr;
1097 }
1098 }
1099
1100 return nullptr;
1101}
1102
1103bool LoopInterchangeLegality::isInnerReduction(
1104 Loop *L, PHINode *Phi, SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
1105
1106 // Only support reduction2Mem when the loop nest to be interchanged is
1107 // the innermost two loops.
1108 if (!L->isInnermost()) {
1109 LLVM_DEBUG(dbgs() << "Only supported when the loop is the innermost.\n");
1110 ORE->emit([&]() {
1111 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1112 L->getStartLoc(), L->getHeader())
1113 << "Only supported when the loop is the innermost.";
1114 });
1115 return false;
1116 }
1117
1118 if (Phi->getNumIncomingValues() != 2)
1119 return false;
1120
1121 Value *Init = Phi->getIncomingValueForBlock(L->getLoopPreheader());
1122 Value *Next = Phi->getIncomingValueForBlock(L->getLoopLatch());
1123
1124 // So far only supports constant initial value.
1125 if (!isa<Constant>(Init)) {
1126 LLVM_DEBUG(
1127 dbgs()
1128 << "Only supported for the reduction with a constant initial value.\n");
1129 ORE->emit([&]() {
1130 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1131 L->getStartLoc(), L->getHeader())
1132 << "Only supported for the reduction with a constant initial "
1133 "value.";
1134 });
1135 return false;
1136 }
1137
1138 // The reduction result must live in the inner loop.
1139 if (Instruction *I = dyn_cast<Instruction>(Next)) {
1140 BasicBlock *BB = I->getParent();
1141 if (!L->contains(BB))
1142 return false;
1143 }
1144
1145 // The reduction should have only one user.
1146 if (!Phi->hasOneUser())
1147 return false;
1148
1149 // Check the reduction kind.
1150 if (!checkReductionKind(L, Phi, HasNoWrapInsts, HasNoInfInsts))
1151 return false;
1152
1153 // Find lcssa_phi in OuterLoop's Latch
1154 BasicBlock *ExitBlock = L->getExitBlock();
1155 if (!ExitBlock)
1156 return false;
1157
1158 PHINode *Lcssa = NULL;
1159 for (auto *U : Next->users()) {
1160 if (auto *P = dyn_cast<PHINode>(U)) {
1161 if (P == Phi)
1162 continue;
1163
1164 if (Lcssa == NULL && P->getParent() == ExitBlock &&
1165 P->getIncomingValueForBlock(L->getLoopLatch()) == Next)
1166 Lcssa = P;
1167 else
1168 return false;
1169 } else
1170 return false;
1171 }
1172 if (!Lcssa)
1173 return false;
1174
1175 if (!Lcssa->hasOneUser()) {
1176 LLVM_DEBUG(dbgs() << "Only supported when the reduction is used once in "
1177 "the outer loop.\n");
1178 ORE->emit([&]() {
1179 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1180 L->getStartLoc(), L->getHeader())
1181 << "Only supported when the reduction is used once in the outer "
1182 "loop.";
1183 });
1184 return false;
1185 }
1186
1187 StoreInst *LcssaStore =
1189 if (!LcssaStore || LcssaStore->getParent() != ExitBlock)
1190 return false;
1191
1192 Value *MemRef = LcssaStore->getOperand(1);
1193 Type *ElemTy = LcssaStore->getOperand(0)->getType();
1194
1195 // LcssaStore stores the reduction result in BB.
1196 // When the reduction is initialized from a constant value, we need to load
1197 // from the memory object into the target basic block of the inner loop. This
1198 // means the memory reference was used prematurely. So we must ensure that the
1199 // memory reference does not dominate the target basic block.
1200 // TODO: Move the memory reference definition into the loop header.
1201 if (!DT->dominates(dyn_cast<Instruction>(MemRef), L->getHeader())) {
1202 LLVM_DEBUG(dbgs() << "Only supported when memory reference dominate "
1203 "the inner loop.\n");
1204 ORE->emit([&]() {
1205 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1206 L->getStartLoc(), L->getHeader())
1207 << "Only supported when memory reference dominate the inner "
1208 "loop.";
1209 });
1210 return false;
1211 }
1212
1213 // Found a reduction in the inner loop.
1214 InnerReduction SR;
1215 SR.Reduction = Phi;
1216 SR.Init = Init;
1217 SR.Next = Next;
1218 SR.LcssaPhi = Lcssa;
1219 SR.LcssaStore = LcssaStore;
1220 SR.MemRef = MemRef;
1221 SR.ElemTy = ElemTy;
1222
1223 InnerReductions.push_back(SR);
1224 return true;
1225}
1226
1227bool LoopInterchangeLegality::findInductionAndReductions(
1228 Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
1229 if (!L->getLoopLatch() || !L->getLoopPredecessor())
1230 return false;
1231 for (PHINode &PHI : L->getHeader()->phis()) {
1232 InductionDescriptor ID;
1234 Inductions.push_back(&PHI);
1235 else {
1236 // PHIs in inner loops need to be part of a reduction in the outer loop,
1237 // discovered when checking the PHIs of the outer loop earlier.
1238 if (!InnerLoop) {
1239 if (OuterInnerReductions.count(&PHI)) {
1240 LLVM_DEBUG(dbgs() << "Found a reduction across the outer loop.\n");
1241 } else if (EnableReduction2Memory &&
1242 isInnerReduction(L, &PHI, HasNoWrapReductions)) {
1243 LLVM_DEBUG(dbgs() << "Found a reduction in the inner loop: \n"
1244 << PHI << '\n');
1245 } else
1246 return false;
1247 } else {
1248 assert(PHI.getNumIncomingValues() == 2 &&
1249 "Phis in loop header should have exactly 2 incoming values");
1250 // Check if we have a PHI node in the outer loop that has a reduction
1251 // result from the inner loop as an incoming value.
1252 Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
1253 PHINode *InnerRedPhi = findInnerReductionPhi(
1254 InnerLoop, V, HasNoWrapReductions, HasNoInfInsts);
1255 if (!InnerRedPhi ||
1256 !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
1257 LLVM_DEBUG(
1258 dbgs()
1259 << "Failed to recognize PHI as an induction or reduction.\n");
1260 return false;
1261 }
1262 OuterInnerReductions.insert(&PHI);
1263 OuterInnerReductions.insert(InnerRedPhi);
1264 }
1265 }
1266 }
1267
1268 // For now we only support at most one reduction.
1269 if (InnerReductions.size() > 1) {
1270 LLVM_DEBUG(dbgs() << "Only supports at most one reduction.\n");
1271 ORE->emit([&]() {
1272 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1273 L->getStartLoc(), L->getHeader())
1274 << "Only supports at most one reduction.";
1275 });
1276 return false;
1277 }
1278
1279 return true;
1280}
1281
1282// This function indicates the current limitations in the transform as a result
1283// of which we do not proceed.
1284bool LoopInterchangeLegality::currentLimitations() {
1285 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1286
1287 // transform currently expects the loop latches to also be the exiting
1288 // blocks.
1289 if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
1290 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
1291 !isa<CondBrInst>(InnerLoopLatch->getTerminator()) ||
1292 !isa<CondBrInst>(OuterLoop->getLoopLatch()->getTerminator())) {
1293 LLVM_DEBUG(
1294 dbgs() << "Loops where the latch is not the exiting block are not"
1295 << " supported currently.\n");
1296 ORE->emit([&]() {
1297 return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
1298 OuterLoop->getStartLoc(),
1299 OuterLoop->getHeader())
1300 << "Loops where the latch is not the exiting block cannot be"
1301 " interchange currently.";
1302 });
1303 return true;
1304 }
1305
1306 SmallVector<PHINode *, 8> Inductions;
1307 if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
1308 LLVM_DEBUG(
1309 dbgs() << "Only outer loops with induction or reduction PHI nodes "
1310 << "are supported currently.\n");
1311 ORE->emit([&]() {
1312 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
1313 OuterLoop->getStartLoc(),
1314 OuterLoop->getHeader())
1315 << "Only outer loops with induction or reduction PHI nodes can be"
1316 " interchanged currently.";
1317 });
1318 return true;
1319 }
1320
1321 Inductions.clear();
1322 // For multi-level loop nests, make sure that all phi nodes for inner loops
1323 // at all levels can be recognized as a induction or reduction phi. Bail out
1324 // if a phi node at a certain nesting level cannot be properly recognized.
1325 Loop *CurLevelLoop = OuterLoop;
1326 while (!CurLevelLoop->getSubLoops().empty()) {
1327 // We already made sure that the loop nest is tightly nested.
1328 CurLevelLoop = CurLevelLoop->getSubLoops().front();
1329 if (!findInductionAndReductions(CurLevelLoop, Inductions, nullptr)) {
1330 LLVM_DEBUG(
1331 dbgs() << "Only inner loops with induction or reduction PHI nodes "
1332 << "are supported currently.\n");
1333 ORE->emit([&]() {
1334 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
1335 CurLevelLoop->getStartLoc(),
1336 CurLevelLoop->getHeader())
1337 << "Only inner loops with induction or reduction PHI nodes can be"
1338 " interchange currently.";
1339 });
1340 return true;
1341 }
1342 }
1343
1344 // TODO: Triangular loops are not handled for now.
1345 if (!isLoopStructureUnderstood()) {
1346 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
1347 ORE->emit([&]() {
1348 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
1349 InnerLoop->getStartLoc(),
1350 InnerLoop->getHeader())
1351 << "Inner loop structure not understood currently.";
1352 });
1353 return true;
1354 }
1355
1356 return false;
1357}
1358
1359bool LoopInterchangeLegality::findInductions(
1360 Loop *L, SmallVectorImpl<PHINode *> &Inductions) {
1361 for (PHINode &PHI : L->getHeader()->phis()) {
1362 InductionDescriptor ID;
1364 Inductions.push_back(&PHI);
1365 }
1366 return !Inductions.empty();
1367}
1368
1369/// We currently only support LCSSA PHI nodes in the inner loop exit if their
1370/// users are either of the following:
1371///
1372/// - Reduction PHIs
1373/// - PHIs outside the outer loop
1374/// - PHIs belonging to the latch of the outer loop
1375///
1376/// These conditions mean that we are only interested in the final value after
1377/// the inner loop.
1378static bool
1381 PHINode *LcssaReduction) {
1382 BasicBlock *InnerExit = InnerL->getUniqueExitBlock();
1383 for (PHINode &PHI : InnerExit->phis()) {
1384 // The reduction LCSSA PHI will have only one incoming block, which comes
1385 // from the loop latch.
1386 if (PHI.getNumIncomingValues() > 1)
1387 return false;
1388 // The reduction LCSSA PHI's store user is rewritten by reduction2Memory();
1389 // skip its user-check but keep validating the remaining LCSSA PHIs.
1390 if (&PHI == LcssaReduction)
1391 continue;
1392 if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
1393 PHINode *PN = dyn_cast<PHINode>(U);
1394 if (!PN)
1395 return true;
1396 if (Reductions.count(PN))
1397 return false;
1398 BasicBlock *PB = PN->getParent();
1399 if (!OuterL->contains(PB))
1400 return false;
1401 return PB != OuterL->getLoopLatch();
1402 }))
1403 return false;
1404 }
1405 return true;
1406}
1407
1408// We currently support LCSSA PHI nodes in the outer loop exit, if their
1409// incoming values do not come from the outer loop latch or if the
1410// outer loop latch has a single predecessor. In that case, the value will
1411// be available if both the inner and outer loop conditions are true, which
1412// will still be true after interchanging. If we have multiple predecessor,
1413// that may not be the case, e.g. because the outer loop latch may be executed
1414// if the inner loop is not executed.
1415static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1416 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
1417 for (PHINode &PHI : LoopNestExit->phis()) {
1418 for (Value *Incoming : PHI.incoming_values()) {
1419 Instruction *IncomingI = dyn_cast<Instruction>(Incoming);
1420 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
1421 continue;
1422
1423 // The incoming value is defined in the outer loop latch. Currently we
1424 // only support that in case the outer loop latch has a single predecessor.
1425 // This guarantees that the outer loop latch is executed if and only if
1426 // the inner loop is executed (because tightlyNested() guarantees that the
1427 // outer loop header only branches to the inner loop or the outer loop
1428 // latch).
1429 // FIXME: We could weaken this logic and allow multiple predecessors,
1430 // if the values are produced outside the loop latch. We would need
1431 // additional logic to update the PHI nodes in the exit block as
1432 // well.
1433 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
1434 return false;
1435 }
1436 }
1437 return true;
1438}
1439
1440// In case of multi-level nested loops, it may occur that lcssa phis exist in
1441// the latch of InnerLoop, i.e., when defs of the incoming values are further
1442// inside the loopnest. Sometimes those incoming values are not available
1443// after interchange, since the original inner latch will become the new outer
1444// latch which may have predecessor paths that do not include those incoming
1445// values.
1446// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
1447// multi-level loop nests.
1448static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1449 if (InnerLoop->getSubLoops().empty())
1450 return true;
1451 // If the original outer latch has only one predecessor, then values defined
1452 // further inside the looploop, e.g., in the innermost loop, will be available
1453 // at the new outer latch after interchange.
1454 if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)
1455 return true;
1456
1457 // The outer latch has more than one predecessors, i.e., the inner
1458 // exit and the inner header.
1459 // PHI nodes in the inner latch are lcssa phis where the incoming values
1460 // are defined further inside the loopnest. Check if those phis are used
1461 // in the original inner latch. If that is the case then bail out since
1462 // those incoming values may not be available at the new outer latch.
1463 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1464 for (PHINode &PHI : InnerLoopLatch->phis()) {
1465 for (auto *U : PHI.users()) {
1467 if (InnerLoopLatch == UI->getParent())
1468 return false;
1469 }
1470 }
1471 return true;
1472}
1473
1474bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
1475 unsigned OuterLoopId,
1476 CharMatrix &DepMatrix) {
1477 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
1478 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
1479 << " and OuterLoopId = " << OuterLoopId
1480 << " due to dependence\n");
1481 ORE->emit([&]() {
1482 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
1483 InnerLoop->getStartLoc(),
1484 InnerLoop->getHeader())
1485 << "Cannot interchange loops due to dependences.";
1486 });
1487 return false;
1488 }
1489 // Check if outer and inner loop contain legal instructions only.
1490 for (auto *BB : OuterLoop->blocks())
1491 for (Instruction &I : *BB) {
1492 // Loads and stores are checked separately, so we can skip them here.
1494 continue;
1495
1496 // We cannot ignore potential memory reads, e.g., loads inside the called
1497 // function.
1498 if (!I.mayHaveSideEffects() && !I.mayReadFromMemory())
1499 continue;
1500
1501 LLVM_DEBUG(
1502 dbgs()
1503 << "Loops contain instructions that cannot be safely interchanged\n");
1504 ORE->emit([&]() {
1505 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsafeInst",
1506 I.getDebugLoc(), I.getParent())
1507 << "Cannot interchange loops due to instruction that is "
1508 "potentially unsafe to interchange.";
1509 });
1510
1511 return false;
1512 }
1513
1514 if (!findInductions(InnerLoop, InnerLoopInductions)) {
1515 LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n");
1516 return false;
1517 }
1518
1519 if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
1520 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
1521 ORE->emit([&]() {
1522 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI",
1523 InnerLoop->getStartLoc(),
1524 InnerLoop->getHeader())
1525 << "Cannot interchange loops because unsupported PHI nodes found "
1526 "in inner loop latch.";
1527 });
1528 return false;
1529 }
1530
1531 // TODO: The loops could not be interchanged due to current limitations in the
1532 // transform module.
1533 if (currentLimitations()) {
1534 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1535 return false;
1536 }
1537
1538 // Check if the loops are tightly nested.
1539 if (!tightlyNested(OuterLoop, InnerLoop)) {
1540 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1541 ORE->emit([&]() {
1542 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1543 InnerLoop->getStartLoc(),
1544 InnerLoop->getHeader())
1545 << "Cannot interchange loops because they are not tightly "
1546 "nested.";
1547 });
1548 return false;
1549 }
1550
1551 // The LCSSA PHI for the reduction has passed checks before; its user
1552 // is a store instruction.
1553 PHINode *LcssaReduction = nullptr;
1554 assert(InnerReductions.size() <= 1 &&
1555 "So far we only support at most one reduction.");
1556 if (InnerReductions.size() == 1)
1557 LcssaReduction = InnerReductions[0].LcssaPhi;
1558
1559 if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop, OuterInnerReductions,
1560 LcssaReduction)) {
1561 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1562 ORE->emit([&]() {
1563 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1564 InnerLoop->getStartLoc(),
1565 InnerLoop->getHeader())
1566 << "Found unsupported PHI node in loop exit.";
1567 });
1568 return false;
1569 }
1570
1571 if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1572 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1573 ORE->emit([&]() {
1574 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1575 OuterLoop->getStartLoc(),
1576 OuterLoop->getHeader())
1577 << "Found unsupported PHI node in loop exit.";
1578 });
1579 return false;
1580 }
1581
1582 if (any_of(OuterLoop->getLoopLatch()->phis(),
1583 [](PHINode &PHI) { return PHI.getNumIncomingValues() != 1; })) {
1584 LLVM_DEBUG(dbgs() << "Only outer loop latch PHI nodes with one incoming "
1585 "value are supported.\n");
1586 ORE->emit([&]() {
1587 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLatchPHI",
1588 OuterLoop->getStartLoc(),
1589 OuterLoop->getHeader())
1590 << "Only outer loop latch PHI nodes with one incoming value are "
1591 "supported.";
1592 });
1593 return false;
1594 }
1595
1596 return true;
1597}
1598
1599void CacheCostManager::computeIfUnitinialized() {
1600 if (CC.has_value())
1601 return;
1602
1603 LLVM_DEBUG(dbgs() << "Compute CacheCost.\n");
1604 CC = CacheCost::getCacheCost(*OutermostLoop, *AR, *DI);
1605 // Obtain the loop vector returned from loop cache analysis beforehand,
1606 // and put each <Loop, index> pair into a map for constant time query
1607 // later. Indices in loop vector reprsent the optimal order of the
1608 // corresponding loop, e.g., given a loopnest with depth N, index 0
1609 // indicates the loop should be placed as the outermost loop and index N
1610 // indicates the loop should be placed as the innermost loop.
1611 //
1612 // For the old pass manager CacheCost would be null.
1613 if (*CC != nullptr)
1614 for (const auto &[Idx, Cost] : enumerate((*CC)->getLoopCosts()))
1615 CostMap[Cost.first] = Idx;
1616}
1617
1618CacheCost *CacheCostManager::getCacheCost() {
1619 computeIfUnitinialized();
1620 return CC->get();
1621}
1622
1623const DenseMap<const Loop *, unsigned> &CacheCostManager::getCostMap() {
1624 computeIfUnitinialized();
1625 return CostMap;
1626}
1627
1628/// If \S contains an affine addrec for \p L, return the step recurrence of it.
1629/// If \S is loop invariant with respect to \p L, return nullptr. Otherwise,
1630/// return std::nullopt, which indicates we cannot determine the coefficient of
1631/// the addrec for \p L in \S.
1632/// TODO: Handle more complex cases. Maybe using SCEVTraversal is a good way to
1633/// do that.
1634std::optional<const SCEV *> getAddRecCoefficient(ScalarEvolution &SE,
1635 const SCEV *S, const Loop *L) {
1637 if (!AR) {
1638 if (SE.isLoopInvariant(S, L))
1639 return nullptr;
1640 return std::nullopt;
1641 }
1642
1643 if (!AR->isAffine()) {
1644 LLVM_DEBUG(dbgs() << "Unexpected non-affine addrec\n");
1645 return std::nullopt;
1646 }
1647
1648 std::optional<const SCEV *> Coeff =
1649 getAddRecCoefficient(SE, AR->getStart(), L);
1650 if (!Coeff.has_value())
1651 return std::nullopt;
1652
1653 if (AR->getLoop() == L) {
1654 assert(!*Coeff && "Found more than one addrec for the same loop");
1655 Coeff = AR->getStepRecurrence(SE);
1656 }
1657 return Coeff;
1658}
1659
1660int LoopInterchangeProfitability::getInstrOrderCost() {
1661 SmallPtrSet<const SCEV *, 4> GoodBasePtrs, BadBasePtrs;
1662 for (BasicBlock *BB : InnerLoop->blocks()) {
1663 for (Instruction &Ins : *BB) {
1664 if (!isa<LoadInst, StoreInst>(&Ins))
1665 continue;
1666 const SCEV *Access = SE->getSCEV(getLoadStorePointerOperand(&Ins));
1667 const SCEV *BasePtr = SE->getPointerBase(Access);
1668 std::optional<const SCEV *> OuterCoeff =
1669 getAddRecCoefficient(*SE, Access, OuterLoop);
1670 std::optional<const SCEV *> InnerCoeff =
1671 getAddRecCoefficient(*SE, Access, InnerLoop);
1672
1673 if (!OuterCoeff.has_value() || !*OuterCoeff || !InnerCoeff.has_value() ||
1674 !*InnerCoeff)
1675 continue;
1676
1677 // This heuristic assumes that a smaller step recurrence implies that the
1678 // induction variable corresponding to the loop is used in the inner
1679 // dimension of the array. Placing such a loop in the inner position would
1680 // be beneficial in terms of locality. If the array access is of the form
1681 // like `A[3*i + 2*j]`, this heuristic may lead to an unprofitable
1682 // interchange, but we expect such cases to be rare.
1683 const SCEV *OuterStep = SE->getAbsExpr(*OuterCoeff, /*IsNSW=*/false);
1684 const SCEV *InnerStep = SE->getAbsExpr(*InnerCoeff, /*IsNSW=*/false);
1685 // If we find the inner induction after an outer induction e.g.
1686 //
1687 // for(int i=0;i<N;i++)
1688 // for(int j=0;j<N;j++)
1689 // A[i][j] = A[i-1][j-1]+k;
1690 //
1691 //
1692 // then it is a good order. If we find the outer induction after an inner
1693 // induction e.g.
1694 //
1695 // for(int i=0;i<N;i++)
1696 // for(int j=0;j<N;j++)
1697 // A[j][i] = A[j-1][i-1]+k;
1698 //
1699 // then it is a bad order.
1700 //
1701 // To avoid counting the same base pointers multiple times, we deduplicate
1702 // them by using a set of base pointers.
1703 if (SE->isKnownPredicate(ICmpInst::ICMP_SLT, InnerStep, OuterStep))
1704 GoodBasePtrs.insert(BasePtr);
1705 else if (SE->isKnownPredicate(ICmpInst::ICMP_SLT, OuterStep, InnerStep))
1706 BadBasePtrs.insert(BasePtr);
1707 }
1708 }
1709
1710 int GoodOrder = GoodBasePtrs.size();
1711 int BadOrder = BadBasePtrs.size();
1712 return GoodOrder - BadOrder;
1713}
1714
1715std::optional<bool>
1716LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1717 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC) {
1718 // This is the new cost model returned from loop cache analysis.
1719 // A smaller index means the loop should be placed an outer loop, and vice
1720 // versa.
1721 auto InnerLoopIt = CostMap.find(InnerLoop);
1722 if (InnerLoopIt == CostMap.end())
1723 return std::nullopt;
1724 auto OuterLoopIt = CostMap.find(OuterLoop);
1725 if (OuterLoopIt == CostMap.end())
1726 return std::nullopt;
1727
1728 if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))
1729 return std::nullopt;
1730 unsigned InnerIndex = InnerLoopIt->second;
1731 unsigned OuterIndex = OuterLoopIt->second;
1732 LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1733 << ", OuterIndex = " << OuterIndex << "\n");
1734 assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1735 "numbers to each loop");
1736 return std::optional<bool>(InnerIndex < OuterIndex);
1737}
1738
1739std::optional<bool>
1740LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
1741 // Legacy cost model: this is rough cost estimation algorithm. It counts the
1742 // good and bad order of induction variables in the instruction and allows
1743 // reordering if number of bad orders is more than good.
1744 int Cost = getInstrOrderCost();
1745 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1747 return std::optional<bool>(true);
1748
1749 return std::nullopt;
1750}
1751
1752/// Return true if we can vectorize the loop specified by \p LoopId.
1753static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
1754 for (const auto &Dep : DepMatrix) {
1755 char Dir = Dep[LoopId];
1756 char DepType = Dep.back();
1757 assert((DepType == '<' || DepType == '*') &&
1758 "Unexpected element in dependency vector");
1759
1760 // There are no loop-carried dependencies.
1761 if (Dir == '=' || Dir == 'I')
1762 continue;
1763
1764 // DepType being '<' means that this direction vector represents a forward
1765 // dependency. In principle, a loop with '<' direction can be vectorized in
1766 // this case.
1767 if (Dir == '<' && DepType == '<')
1768 continue;
1769
1770 // We cannot prove that the loop is vectorizable.
1771 return false;
1772 }
1773 return true;
1774}
1775
1776std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1777 unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1778 // If the outer loop cannot be vectorized, it is not profitable to move this
1779 // to inner position.
1780 if (!canVectorize(DepMatrix, OuterLoopId))
1781 return false;
1782
1783 // If the inner loop cannot be vectorized but the outer loop can be, then it
1784 // is profitable to interchange to enable inner loop parallelism.
1785 if (!canVectorize(DepMatrix, InnerLoopId))
1786 return true;
1787
1788 // If both the inner and the outer loop can be vectorized, it is necessary to
1789 // check the cost of each vectorized loop for profitability decision. At this
1790 // time we do not have a cost model to estimate them, so return nullopt.
1791 // TODO: Estimate the cost of vectorized loop when both the outer and the
1792 // inner loop can be vectorized.
1793 return std::nullopt;
1794}
1795
1796bool LoopInterchangeProfitability::isProfitable(
1797 const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1798 unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM) {
1799 // Do not consider loops with a backedge that isn't taken, e.g. an
1800 // unconditional branch true/false, as candidates for interchange.
1801 // TODO: when interchange is forced, we should probably also allow
1802 // interchange for these loops, and thus this logic should be moved just
1803 // below the cost-model ignore check below. But this check is done first
1804 // to avoid the issue in #163954.
1805 const SCEV *InnerBTC = SE->getBackedgeTakenCount(InnerLoop);
1806 const SCEV *OuterBTC = SE->getBackedgeTakenCount(OuterLoop);
1807 if (InnerBTC && InnerBTC->isZero()) {
1808 LLVM_DEBUG(dbgs() << "Inner loop back-edge isn't taken, rejecting "
1809 "single iteration loop\n");
1810 return false;
1811 }
1812 if (OuterBTC && OuterBTC->isZero()) {
1813 LLVM_DEBUG(dbgs() << "Outer loop back-edge isn't taken, rejecting "
1814 "single iteration loop\n");
1815 return false;
1816 }
1817
1818 // Return true if interchange is forced and the cost-model ignored.
1819 if (Profitabilities.size() == 1 && Profitabilities[0] == RuleTy::Ignore)
1820 return true;
1822 "Duplicate rules and option 'ignore' are not allowed");
1823
1824 // isProfitable() is structured to avoid endless loop interchange. If the
1825 // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
1826 // decide the profitability then, profitability check will stop and return the
1827 // analysis result. If it failed to determine it (e.g., cache analysis failed
1828 // to analyze the loopnest due to delinearization issues) then go ahead the
1829 // second highest priority rule (isProfitablePerInstrOrderCost by default).
1830 // Likewise, if it failed to analysis the profitability then only, the last
1831 // rule (isProfitableForVectorization by default) will decide.
1832 std::optional<bool> shouldInterchange;
1833 for (RuleTy RT : Profitabilities) {
1834 switch (RT) {
1835 case RuleTy::PerLoopCacheAnalysis: {
1836 CacheCost *CC = CCM.getCacheCost();
1837 const DenseMap<const Loop *, unsigned> &CostMap = CCM.getCostMap();
1838 shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
1839 break;
1840 }
1841 case RuleTy::PerInstrOrderCost:
1842 shouldInterchange = isProfitablePerInstrOrderCost();
1843 break;
1844 case RuleTy::ForVectorization:
1845 shouldInterchange =
1846 isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1847 break;
1848 case RuleTy::Ignore:
1849 llvm_unreachable("Option 'ignore' is not supported with other options");
1850 break;
1851 }
1852
1853 // If this rule could determine the profitability, don't call subsequent
1854 // rules.
1855 if (shouldInterchange.has_value())
1856 break;
1857 }
1858
1859 if (!shouldInterchange.has_value()) {
1860 ORE->emit([&]() {
1861 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1862 InnerLoop->getStartLoc(),
1863 InnerLoop->getHeader())
1864 << "Insufficient information to calculate the cost of loop for "
1865 "interchange.";
1866 });
1867 return false;
1868 } else if (!shouldInterchange.value()) {
1869 ORE->emit([&]() {
1870 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1871 InnerLoop->getStartLoc(),
1872 InnerLoop->getHeader())
1873 << "Interchanging loops is not considered to improve cache "
1874 "locality nor vectorization.";
1875 });
1876 return false;
1877 }
1878 return true;
1879}
1880
1881void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1882 Loop *InnerLoop) {
1883 for (Loop *L : *OuterLoop)
1884 if (L == InnerLoop) {
1885 OuterLoop->removeChildLoop(L);
1886 return;
1887 }
1888 llvm_unreachable("Couldn't find loop");
1889}
1890
1891/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1892/// new inner and outer loop after interchanging: NewInner is the original
1893/// outer loop and NewOuter is the original inner loop.
1894///
1895/// Before interchanging, we have the following structure
1896/// Outer preheader
1897// Outer header
1898// Inner preheader
1899// Inner header
1900// Inner body
1901// Inner latch
1902// outer bbs
1903// Outer latch
1904//
1905// After interchanging:
1906// Inner preheader
1907// Inner header
1908// Outer preheader
1909// Outer header
1910// Inner body
1911// outer bbs
1912// Outer latch
1913// Inner latch
1914void LoopInterchangeTransform::restructureLoops(
1915 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1916 BasicBlock *OrigOuterPreHeader) {
1917 Loop *OuterLoopParent = OuterLoop->getParentLoop();
1918 // The original inner loop preheader moves from the new inner loop to
1919 // the parent loop, if there is one.
1920 NewInner->removeBlockFromLoop(OrigInnerPreHeader);
1921 LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
1922
1923 // Switch the loop levels.
1924 if (OuterLoopParent) {
1925 // Remove the loop from its parent loop.
1926 removeChildLoop(OuterLoopParent, NewInner);
1927 removeChildLoop(NewInner, NewOuter);
1928 OuterLoopParent->addChildLoop(NewOuter);
1929 } else {
1930 removeChildLoop(NewInner, NewOuter);
1931 LI->changeTopLevelLoop(NewInner, NewOuter);
1932 }
1933 while (!NewOuter->isInnermost())
1934 NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
1935 NewOuter->addChildLoop(NewInner);
1936
1937 // BBs from the original inner loop.
1938 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
1939
1940 // Add BBs from the original outer loop to the original inner loop (excluding
1941 // BBs already in inner loop)
1942 for (BasicBlock *BB : NewInner->blocks())
1943 if (LI->getLoopFor(BB) == NewInner)
1944 NewOuter->addBlockEntry(BB);
1945
1946 // Now remove inner loop header and latch from the new inner loop and move
1947 // other BBs (the loop body) to the new inner loop.
1948 BasicBlock *OuterHeader = NewOuter->getHeader();
1949 BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1950 for (BasicBlock *BB : OrigInnerBBs) {
1951 // Nothing will change for BBs in child loops.
1952 if (LI->getLoopFor(BB) != NewOuter)
1953 continue;
1954 // Remove the new outer loop header and latch from the new inner loop.
1955 if (BB == OuterHeader || BB == OuterLatch)
1956 NewInner->removeBlockFromLoop(BB);
1957 else
1958 LI->changeLoopFor(BB, NewInner);
1959 }
1960
1961 // The preheader of the original outer loop becomes part of the new
1962 // outer loop.
1963 NewOuter->addBlockEntry(OrigOuterPreHeader);
1964 LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
1965
1966 // Tell SE that we move the loops around.
1967 SE->forgetLoop(NewOuter);
1968}
1969
1970/// User can write, or optimizers can generate the reduction for inner loop.
1971/// To make the interchange valid, apply Reduction2Mem by moving the
1972/// initializer and store instructions into the inner loop. So far we only
1973/// handle cases where the reduction variable is initialized to a constant.
1974/// For example, below code:
1975///
1976/// loop:
1977/// re = phi<0.0, next>
1978/// next = re op ...
1979/// endloop
1980/// reduc_sum = phi<next> // lcssa phi
1981/// MEM_REF[idx] = reduc_sum // LcssaStore
1982///
1983/// is transformed into:
1984///
1985/// loop:
1986/// tmp = MEM_REF[idx];
1987/// new_var = !first_iteration ? tmp : 0.0;
1988/// next = new_var op ...
1989/// MEM_REF[idx] = next; // after moving
1990/// endloop
1991///
1992/// In this way the initial const is used in the first iteration of loop.
1993void LoopInterchangeTransform::reduction2Memory() {
1995 LIL.getInnerReductions();
1996
1997 assert(InnerReductions.size() == 1 &&
1998 "So far we only support at most one reduction.");
1999
2000 LoopInterchangeLegality::InnerReduction SR = InnerReductions[0];
2001 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2002 IRBuilder<> Builder(&*(InnerLoopHeader->getFirstNonPHIIt()));
2003
2004 // Check if it's the first iteration.
2005 LLVMContext &Context = InnerLoopHeader->getContext();
2006 PHINode *FirstIter =
2007 Builder.CreatePHI(Type::getInt1Ty(Context), 2, "first.iter");
2008 FirstIter->addIncoming(ConstantInt::get(Type::getInt1Ty(Context), 1),
2009 InnerLoop->getLoopPreheader());
2010 FirstIter->addIncoming(ConstantInt::get(Type::getInt1Ty(Context), 0),
2011 InnerLoop->getLoopLatch());
2012 assert(FirstIter->isComplete() && "The FirstIter PHI node is not complete.");
2013
2014 // When the reduction is initialized from a constant value, we need to add
2015 // a stmt loading from the memory object to target basic block in inner
2016 // loop.
2017 Instruction *LoadMem = Builder.CreateLoad(SR.ElemTy, SR.MemRef);
2018
2019 // Init new_var to MEM_REF or CONST depending on if it is the first iteration.
2020 Value *NewVar = Builder.CreateSelect(FirstIter, SR.Init, LoadMem, "new.var");
2021
2022 // Replace all uses of the reduction variable with a new variable.
2023 SR.Reduction->replaceAllUsesWith(NewVar);
2024
2025 // Move store instruction into inner loop, just after reduction next's
2026 // definition.
2027 SR.LcssaStore->setOperand(0, SR.Next);
2028 SR.LcssaStore->moveAfter(dyn_cast<Instruction>(SR.Next));
2029}
2030
2031bool LoopInterchangeTransform::transform(
2032 ArrayRef<Instruction *> DropNoWrapInsts,
2033 ArrayRef<Instruction *> DropNoInfInsts) {
2034 bool Transformed = false;
2035
2037 LIL.getInnerReductions();
2038 if (InnerReductions.size() == 1)
2039 reduction2Memory();
2040
2041 LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
2042 auto &InductionPHIs = LIL.getInnerLoopInductions();
2043 if (InductionPHIs.empty()) {
2044 LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
2045 return false;
2046 }
2047
2048 SmallVector<Instruction *, 8> InnerIndexVarList;
2049 for (PHINode *CurInductionPHI : InductionPHIs) {
2050 Instruction *IncomingValue = dyn_cast<Instruction>(
2051 CurInductionPHI->getIncomingValueForBlock(InnerLoop->getLoopLatch()));
2052 assert(IncomingValue &&
2053 "Incoming value from loop latch isn't an instruction");
2054 if (is_contained(InductionPHIs, IncomingValue))
2055 continue;
2056 InnerIndexVarList.push_back(IncomingValue);
2057 }
2058
2059 // Create a new latch block for the inner loop. We split at the
2060 // current latch's terminator and then move the condition and all
2061 // operands that are not either loop-invariant or the induction PHI into the
2062 // new latch block.
2063 BasicBlock *NewLatch =
2064 SplitBlock(InnerLoop->getLoopLatch(),
2065 InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
2066
2067 SmallSetVector<Instruction *, 4> WorkList;
2068 unsigned i = 0;
2069 auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
2070 for (; i < WorkList.size(); i++) {
2071 // Duplicate instruction and move it to the new latch. Update uses that
2072 // have been moved.
2073 Instruction *NewI = WorkList[i]->clone();
2074 NewI->insertBefore(NewLatch->getFirstNonPHIIt());
2075 assert(!NewI->mayHaveSideEffects() &&
2076 "Moving instructions with side-effects may change behavior of "
2077 "the loop nest!");
2078 for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
2079 Instruction *UserI = cast<Instruction>(U.getUser());
2080 if (!InnerLoop->contains(UserI->getParent()) ||
2081 UserI->getParent() == NewLatch ||
2082 llvm::is_contained(InductionPHIs, UserI))
2083 U.set(NewI);
2084 }
2085 // Add operands of moved instruction to the worklist, except if they are
2086 // outside the inner loop or are the induction PHI.
2087 for (Value *Op : WorkList[i]->operands()) {
2089 if (!OpI || this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
2090 llvm::is_contained(InductionPHIs, OpI))
2091 continue;
2092 WorkList.insert(OpI);
2093 }
2094 }
2095 };
2096
2097 // FIXME: Should we interchange when we have a constant condition?
2100 ->getCondition());
2101 if (CondI)
2102 WorkList.insert(CondI);
2103 MoveInstructions();
2104 for (Instruction *InnerIndexVar : InnerIndexVarList)
2105 WorkList.insert(cast<Instruction>(InnerIndexVar));
2106 MoveInstructions();
2107
2108 // Ensure the inner loop phi nodes have a separate basic block.
2109 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2110 if (&*InnerLoopHeader->getFirstNonPHIIt() !=
2111 InnerLoopHeader->getTerminator()) {
2112 SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHIIt(), DT, LI);
2113 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
2114 }
2115
2116 // Instructions in the original inner loop preheader may depend on values
2117 // defined in the outer loop header. Move them there, because the original
2118 // inner loop preheader will become the entry into the interchanged loop nest.
2119 // Currently we move all instructions and rely on LICM to move invariant
2120 // instructions outside the loop nest.
2121 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2122 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2123
2124 if (InnerLoopPreHeader != OuterLoopHeader) {
2125 // Eliminate PHIs in the inner-loop preheader.
2126 for (PHINode &P : make_early_inc_range(InnerLoopPreHeader->phis())) {
2127 assert(P.getNumIncomingValues() == 1 &&
2128 "Expected single-incoming PHIs in inner loop preheader");
2129 P.replaceAllUsesWith(P.getIncomingValue(0));
2130 P.eraseFromParent();
2131 }
2132 for (Instruction &I :
2133 make_early_inc_range(make_range(InnerLoopPreHeader->begin(),
2134 std::prev(InnerLoopPreHeader->end()))))
2135 I.moveBeforePreserving(OuterLoopHeader->getTerminator()->getIterator());
2136 }
2137
2138 Transformed |= adjustLoopLinks();
2139 if (!Transformed) {
2140 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
2141 return false;
2142 }
2143
2144 // Finally, drop the nsw/nuw/ninf flags from the instructions for reduction
2145 // calculations.
2146 for (Instruction *Reduction : DropNoWrapInsts) {
2147 Reduction->setHasNoSignedWrap(false);
2148 Reduction->setHasNoUnsignedWrap(false);
2149 }
2150 for (Instruction *I : DropNoInfInsts)
2151 I->setHasNoInfs(false);
2152
2153 return true;
2154}
2155
2156/// \brief Move all instructions except the terminator from FromBB right before
2157/// InsertBefore
2158static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
2159 BasicBlock *ToBB = InsertBefore->getParent();
2160
2161 ToBB->splice(InsertBefore->getIterator(), FromBB, FromBB->begin(),
2162 FromBB->getTerminator()->getIterator());
2163}
2164
2165/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
2166static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
2167 // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
2168 // from BB1 afterwards.
2169 auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
2170 SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
2171 for (Instruction *I : TempInstrs)
2172 I->removeFromParent();
2173
2174 // Move instructions from BB2 to BB1.
2175 moveBBContents(BB2, BB1->getTerminator());
2176
2177 // Move instructions from TempInstrs to BB2.
2178 for (Instruction *I : TempInstrs)
2179 I->insertBefore(BB2->getTerminator()->getIterator());
2180}
2181
2182// Update BI to jump to NewBB instead of OldBB. Records updates to the
2183// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
2184// \p OldBB is exactly once in BI's successor list.
2185static void updateSuccessor(Instruction *Term, BasicBlock *OldBB,
2186 BasicBlock *NewBB,
2187 std::vector<DominatorTree::UpdateType> &DTUpdates,
2188 bool MustUpdateOnce = true) {
2189 assert((!MustUpdateOnce || llvm::count(successors(Term), OldBB) == 1) &&
2190 "BI must jump to OldBB exactly once.");
2191 bool Changed = false;
2192 for (Use &Op : Term->operands())
2193 if (Op == OldBB) {
2194 Op.set(NewBB);
2195 Changed = true;
2196 }
2197
2198 if (Changed) {
2199 DTUpdates.push_back(
2200 {DominatorTree::UpdateKind::Insert, Term->getParent(), NewBB});
2201 DTUpdates.push_back(
2202 {DominatorTree::UpdateKind::Delete, Term->getParent(), OldBB});
2203 }
2204 assert(Changed && "Expected a successor to be updated");
2205}
2206
2207// Move Lcssa PHIs to the right place.
2208static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
2209 BasicBlock *InnerLatch, BasicBlock *OuterHeader,
2210 BasicBlock *OuterLatch, BasicBlock *OuterExit,
2211 Loop *InnerLoop, LoopInfo *LI) {
2212
2213 // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
2214 // defined either in the header or latch. Those blocks will become header and
2215 // latch of the new outer loop, and the only possible users can PHI nodes
2216 // in the exit block of the loop nest or the outer loop header (reduction
2217 // PHIs, in that case, the incoming value must be defined in the inner loop
2218 // header). We can just substitute the user with the incoming value and remove
2219 // the PHI.
2220 for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
2221 assert(P.getNumIncomingValues() == 1 &&
2222 "Only loops with a single exit are supported!");
2223
2224 Value *IncomingValue = P.getIncomingValueForBlock(InnerLatch);
2225 auto *IncI = dyn_cast<Instruction>(IncomingValue);
2226 if (!IncI) {
2227 // If the incoming value is not an instruction, it must be loop invariant.
2228 // In that case, we can just replace the PHI with the incoming value and
2229 // remove the PHI.
2230 assert(InnerLoop->isLoopInvariant(IncomingValue) &&
2231 "Expected non-instruction incoming value to be loop invariant");
2232 P.replaceAllUsesWith(IncomingValue);
2233 P.eraseFromParent();
2234 continue;
2235 }
2236
2237 // In case of multi-level nested loops, follow LCSSA to find the incoming
2238 // value defined from the innermost loop.
2239 auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI));
2240 // Skip phis with incoming values from the inner loop body, excluding the
2241 // header and latch.
2242 if (IncIInnerMost->getParent() != InnerLatch &&
2243 IncIInnerMost->getParent() != InnerHeader)
2244 continue;
2245
2246 assert(all_of(P.users(),
2247 [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
2248 if (!isa<PHINode>(U))
2249 return true;
2250 return (cast<PHINode>(U)->getParent() == OuterHeader &&
2251 IncI->getParent() == InnerHeader) ||
2252 cast<PHINode>(U)->getParent() == OuterExit;
2253 }) &&
2254 "Can only replace phis iff the phi-uses are in the loop nest exit "
2255 "or the incoming value is defined in the inner header (it will "
2256 "dominate all loop blocks after interchanging)");
2257 P.replaceAllUsesWith(IncI);
2258 P.eraseFromParent();
2259 }
2260
2261 SmallVector<PHINode *, 8> LcssaInnerExit(
2262 llvm::make_pointer_range(InnerExit->phis()));
2263
2264 SmallVector<PHINode *, 8> LcssaInnerLatch(
2265 llvm::make_pointer_range(InnerLatch->phis()));
2266
2267 // Lcssa PHIs for values used outside the inner loop are in InnerExit.
2268 // If a PHI node has users outside of InnerExit, it has a use outside the
2269 // interchanged loop and we have to preserve it. We move these to
2270 // InnerLatch, which will become the new exit block for the innermost
2271 // loop after interchanging.
2272 for (PHINode *P : LcssaInnerExit)
2273 P->moveBefore(InnerLatch->getFirstNonPHIIt());
2274
2275 // If the inner loop latch contains LCSSA PHIs, those come from a child loop
2276 // and we have to move them to the new inner latch.
2277 for (PHINode *P : LcssaInnerLatch)
2278 P->moveBefore(InnerExit->getFirstNonPHIIt());
2279
2280 // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
2281 // incoming values defined in the outer loop, we have to add a new PHI
2282 // in the inner loop latch, which became the exit block of the outer loop,
2283 // after interchanging.
2284 if (OuterExit) {
2285 for (PHINode &P : OuterExit->phis()) {
2286 if (P.getNumIncomingValues() != 1)
2287 continue;
2288 // Skip Phis with incoming values defined in the inner loop. Those should
2289 // already have been updated.
2290 auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
2291 if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)
2292 continue;
2293
2294 PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
2295 NewPhi->setIncomingValue(0, P.getIncomingValue(0));
2296 NewPhi->setIncomingBlock(0, OuterLatch);
2297 // We might have incoming edges from other BBs, i.e., the original outer
2298 // header.
2299 for (auto *Pred : predecessors(InnerLatch)) {
2300 if (Pred == OuterLatch)
2301 continue;
2302 NewPhi->addIncoming(P.getIncomingValue(0), Pred);
2303 }
2304 NewPhi->insertBefore(InnerLatch->getFirstNonPHIIt());
2305 P.setIncomingValue(0, NewPhi);
2306 }
2307 }
2308
2309 // Now adjust the incoming blocks for the LCSSA PHIs.
2310 // For PHIs moved from Inner's exit block, we need to replace Inner's latch
2311 // with the new latch.
2312 InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
2313}
2314
2315/// This deals with a corner case when a LCSSA phi node appears in a non-exit
2316/// block: the outer loop latch block does not need to be exit block of the
2317/// inner loop. Consider a loop that was in LCSSA form, but then some
2318/// transformation like loop-unswitch comes along and creates an empty block,
2319/// where BB5 in this example is the outer loop latch block:
2320///
2321/// BB4:
2322/// br label %BB5
2323/// BB5:
2324/// %old.cond.lcssa = phi i16 [ %cond, %BB4 ]
2325/// br outer.header
2326///
2327/// Interchange then brings it in LCSSA form again resulting in this chain of
2328/// single-input phi nodes:
2329///
2330/// BB4:
2331/// %new.cond.lcssa = phi i16 [ %cond, %BB3 ]
2332/// br label %BB5
2333/// BB5:
2334/// %old.cond.lcssa = phi i16 [ %new.cond.lcssa, %BB4 ]
2335///
2336/// The problem is that interchange can reoder blocks BB4 and BB5 placing the
2337/// use before the def if we don't check this. The solution is to simplify
2338/// lcssa phi nodes (remove) if they appear in non-exit blocks.
2339///
2340static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop) {
2341 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
2342 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2343
2344 // Do not modify lcssa phis where they actually belong, i.e. in exit blocks.
2345 if (OuterLoopLatch == InnerLoopExit)
2346 return;
2347
2348 // Collect and remove phis in non-exit blocks if they have 1 input.
2350 llvm::make_pointer_range(OuterLoopLatch->phis()));
2351 for (PHINode *Phi : Phis) {
2352 assert(Phi->getNumIncomingValues() == 1 && "Single input phi expected");
2353 LLVM_DEBUG(dbgs() << "Removing 1-input phi in non-exit block: " << *Phi
2354 << "\n");
2355 Phi->replaceAllUsesWith(Phi->getIncomingValue(0));
2356 Phi->eraseFromParent();
2357 }
2358}
2359
2360bool LoopInterchangeTransform::adjustLoopBranches() {
2361 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
2362 std::vector<DominatorTree::UpdateType> DTUpdates;
2363
2364 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2365 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2366
2367 assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
2368 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
2369 InnerLoopPreHeader && "Guaranteed by loop-simplify form");
2370
2371 simplifyLCSSAPhis(OuterLoop, InnerLoop);
2372
2373 // Ensure that both preheaders do not contain PHI nodes and have single
2374 // predecessors. This allows us to move them easily. We use
2375 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
2376 // preheaders do not satisfy those conditions.
2377 if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
2378 !OuterLoopPreHeader->getUniquePredecessor())
2379 OuterLoopPreHeader =
2380 InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
2381 if (InnerLoopPreHeader == OuterLoop->getHeader())
2382 InnerLoopPreHeader =
2383 InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
2384
2385 // Adjust the loop preheader
2386 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2387 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2388 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
2389 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2390 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
2391 BasicBlock *InnerLoopLatchPredecessor =
2392 InnerLoopLatch->getUniquePredecessor();
2393 BasicBlock *InnerLoopLatchSuccessor;
2394 BasicBlock *OuterLoopLatchSuccessor;
2395
2396 CondBrInst *OuterLoopLatchBI =
2397 dyn_cast<CondBrInst>(OuterLoopLatch->getTerminator());
2398 CondBrInst *InnerLoopLatchBI =
2399 dyn_cast<CondBrInst>(InnerLoopLatch->getTerminator());
2400 Instruction *OuterLoopHeaderBI = OuterLoopHeader->getTerminator();
2401 Instruction *InnerLoopHeaderBI = InnerLoopHeader->getTerminator();
2402
2403 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
2404 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
2405 !InnerLoopHeaderBI)
2406 return false;
2407
2408 Instruction *InnerLoopLatchPredecessorBI =
2409 InnerLoopLatchPredecessor->getTerminator();
2410 Instruction *OuterLoopPredecessorBI = OuterLoopPredecessor->getTerminator();
2411
2412 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
2413 return false;
2414 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
2415 if (!InnerLoopHeaderSuccessor)
2416 return false;
2417
2418 // Adjust Loop Preheader and headers.
2419 // The branches in the outer loop predecessor and the outer loop header can
2420 // be unconditional branches or conditional branches with duplicates. Consider
2421 // this when updating the successors.
2422 updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
2423 InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
2424 // The outer loop header might or might not branch to the outer latch.
2425 // We are guaranteed to branch to the inner loop preheader.
2426 if (llvm::is_contained(successors(OuterLoopHeaderBI), OuterLoopLatch)) {
2427 // In this case the outerLoopHeader should branch to the InnerLoopLatch.
2428 updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch,
2429 DTUpdates,
2430 /*MustUpdateOnce=*/false);
2431 }
2432 updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
2433 InnerLoopHeaderSuccessor, DTUpdates,
2434 /*MustUpdateOnce=*/false);
2435
2436 // Adjust reduction PHI's now that the incoming block has changed.
2437 InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
2438 OuterLoopHeader);
2439
2440 updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
2441 OuterLoopPreHeader, DTUpdates);
2442
2443 // -------------Adjust loop latches-----------
2444 if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
2445 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
2446 else
2447 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
2448
2449 updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
2450 InnerLoopLatchSuccessor, DTUpdates);
2451
2452 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
2453 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
2454 else
2455 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
2456
2457 updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
2458 OuterLoopLatchSuccessor, DTUpdates);
2459 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
2460 DTUpdates);
2461
2462 DT->applyUpdates(DTUpdates);
2463 restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
2464 OuterLoopPreHeader);
2465
2466 moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
2467 OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),
2468 InnerLoop, LI);
2469 // For PHIs in the exit block of the outer loop, outer's latch has been
2470 // replaced by Inners'.
2471 OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
2472
2473 auto &OuterInnerReductions = LIL.getOuterInnerReductions();
2474 // Now update the reduction PHIs in the inner and outer loop headers.
2475 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
2476 for (PHINode &PHI : InnerLoopHeader->phis())
2477 if (OuterInnerReductions.contains(&PHI))
2478 InnerLoopPHIs.push_back(&PHI);
2479
2480 for (PHINode &PHI : OuterLoopHeader->phis())
2481 if (OuterInnerReductions.contains(&PHI))
2482 OuterLoopPHIs.push_back(&PHI);
2483
2484 // Now move the remaining reduction PHIs from outer to inner loop header and
2485 // vice versa. The PHI nodes must be part of a reduction across the inner and
2486 // outer loop and all the remains to do is and updating the incoming blocks.
2487 for (PHINode *PHI : OuterLoopPHIs) {
2488 LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
2489 PHI->moveBefore(InnerLoopHeader->getFirstNonPHIIt());
2490 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2491 }
2492 for (PHINode *PHI : InnerLoopPHIs) {
2493 LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
2494 PHI->moveBefore(OuterLoopHeader->getFirstNonPHIIt());
2495 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2496 }
2497
2498 // Update the incoming blocks for moved PHI nodes.
2499 OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
2500 OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
2501 InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
2502 InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
2503
2504 // Values defined in the outer loop header could be used in the inner loop
2505 // latch. In that case, we need to create LCSSA phis for them, because after
2506 // interchanging they will be defined in the new inner loop and used in the
2507 // new outer loop.
2508 SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
2509 for (Instruction &I :
2510 make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end())))
2511 MayNeedLCSSAPhis.push_back(&I);
2512 formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE);
2513
2514 return true;
2515}
2516
2517bool LoopInterchangeTransform::adjustLoopLinks() {
2518 // Adjust all branches in the inner and outer loop.
2519 bool Changed = adjustLoopBranches();
2520 if (Changed) {
2521 // We have interchanged the preheaders so we need to interchange the data in
2522 // the preheaders as well. This is because the content of the inner
2523 // preheader was previously executed inside the outer loop.
2524 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2525 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2526 swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
2527 }
2528 return Changed;
2529}
2530
2534 LPMUpdater &U) {
2535 Function &F = *LN.getParent();
2536 SmallVector<Loop *, 8> LoopList(LN.getLoops());
2537
2539
2540 // Ensure minimum depth of the loop nest to do the interchange.
2541 if (!hasSupportedLoopDepth(LoopList, ORE))
2542 return PreservedAnalyses::all();
2543 // Ensure computable loop nest.
2544 if (!isComputableLoopNest(&AR.SE, LoopList)) {
2545 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
2546 return PreservedAnalyses::all();
2547 }
2548
2549 ORE.emit([&]() {
2550 return OptimizationRemarkAnalysis(DEBUG_TYPE, "Dependence",
2553 << "Computed dependence info, invoking the transform.";
2554 });
2555
2556 DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
2557 if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN))
2558 return PreservedAnalyses::all();
2559 U.markLoopNestChanged(true);
2561}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file defines the StringMap class.
Rewrite undef for PHI
ReachingDefInfo InstSet InstSet & Ignore
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
DXIL Resource Access
#define DEBUG_TYPE
static void moveBBContents(BasicBlock &SourceBB, BasicBlock &TargetBB)
Move the contents of SourceBB to before the last instruction of TargetBB.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file defines the interface for the loop cache analysis.
SmallVector< Loop *, 4 > LoopVector
Definition LoopFuse.cpp:348
Loop::LoopBounds::Direction Direction
Definition LoopInfo.cpp:253
static cl::list< RuleTy > Profitabilities("loop-interchange-profitabilities", cl::MiscFlags::CommaSeparated, cl::Hidden, cl::desc("List of profitability heuristics to be used. They are applied in " "the given order"), cl::list_init< RuleTy >({RuleTy::PerInstrOrderCost, RuleTy::ForVectorization}), cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache", "Prioritize loop cache cost"), clEnumValN(RuleTy::PerInstrOrderCost, "instorder", "Prioritize the IVs order of each instruction"), clEnumValN(RuleTy::ForVectorization, "vectorize", "Prioritize vectorization"), clEnumValN(RuleTy::Ignore, "ignore", "Ignore profitability, force interchange (does not " "work with other options)")))
static cl::opt< int > LoopInterchangeCostThreshold("loop-interchange-threshold", cl::init(0), cl::Hidden, cl::desc("Interchange if you gain more than this number"))
static cl::opt< unsigned int > MinLoopNestDepth("loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden, cl::desc("Minimum depth of loop nest considered for the transform"))
static void updateSuccessor(Instruction *Term, BasicBlock *OldBB, BasicBlock *NewBB, std::vector< DominatorTree::UpdateType > &DTUpdates, bool MustUpdateOnce=true)
static cl::opt< bool > EnableReduction2Memory("loop-interchange-reduction-to-mem", cl::init(false), cl::Hidden, cl::desc("Support for the inner-loop reduction pattern."))
static bool isComputableLoopNest(ScalarEvolution *SE, ArrayRef< Loop * > LoopList)
std::optional< const SCEV * > getAddRecCoefficient(ScalarEvolution &SE, const SCEV *S, const Loop *L)
If \S contains an affine addrec for L, return the step recurrence of it.
static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop)
This deals with a corner case when a LCSSA phi node appears in a non-exit block: the outer loop latch...
static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, unsigned ToIndx)
static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, BasicBlock *InnerLatch, BasicBlock *OuterHeader, BasicBlock *OuterLatch, BasicBlock *OuterExit, Loop *InnerLoop, LoopInfo *LI)
static void printDepMatrix(CharMatrix &DepMatrix)
static cl::opt< unsigned int > MaxMemInstrRatio("loop-interchange-max-mem-instr-ratio", cl::init(4), cl::Hidden, cl::desc("Maximum number of load/store instructions squared in relation to " "the total number of instructions. Higher value may lead to more " "interchanges at the cost of compile-time"))
static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2)
Swap instructions between BB1 and BB2 but keep terminators intact.
static PHINode * findInnerReductionPhi(Loop *L, Value *V, SmallVectorImpl< Instruction * > &HasNoWrapInsts, SmallVectorImpl< Instruction * > &HasNoInfInsts)
static bool areInnerLoopExitPHIsSupported(Loop *OuterL, Loop *InnerL, SmallPtrSetImpl< PHINode * > &Reductions, PHINode *LcssaReduction)
We currently only support LCSSA PHI nodes in the inner loop exit if their users are either of the fol...
static cl::opt< unsigned int > MaxLoopNestDepth("loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden, cl::desc("Maximum depth of loop nest considered for the transform"))
static bool hasSupportedLoopDepth(ArrayRef< Loop * > LoopList, OptimizationRemarkEmitter &ORE)
static bool inThisOrder(const Instruction *Src, const Instruction *Dst)
Return true if Src appears before Dst in the same basic block.
static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId)
Return true if we can vectorize the loop specified by LoopId.
static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, unsigned InnerLoopId, unsigned OuterLoopId)
#define DEBUG_TYPE
static Value * followLCSSA(Value *SV)
static void populateWorklist(Loop &L, LoopVector &LoopList)
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L, DependenceInfo *DI, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
static std::optional< bool > isLexicographicallyPositive(ArrayRef< char > DV, unsigned Begin, unsigned End)
static bool checkReductionKind(Loop *L, PHINode *PHI, SmallVectorImpl< Instruction * > &HasNoWrapInsts, SmallVectorImpl< Instruction * > &HasNoInfInsts)
static bool noDuplicateRulesAndIgnore(ArrayRef< RuleTy > Rules)
This file defines the interface for the loop nest analysis.
This header provides classes for managing a pipeline of passes over loops in LLVM IR.
loop Loop Strength Reduction
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
uint64_t IntrinsicInst * II
#define P(N)
This file contains some templates that are useful if you are working with the STL at all.
SmallVector< Value *, 8 > ValueVector
This file defines the SmallSet class.
This file defines the SmallVector class.
static bool isProfitable(const StableFunctionMap::StableFunctionEntries &SFS)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:119
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:185
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:461
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:530
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI void replacePhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block to refer to basic block New instead of basic block Old.
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:659
static LLVM_ABI std::unique_ptr< CacheCost > getCacheCost(Loop &Root, LoopStandardAnalysisResults &AR, DependenceInfo &DI, std::optional< unsigned > TRT=std::nullopt)
Create a CacheCost for the loop nest rooted by Root.
CacheCostTy getLoopCost(const Loop &L) const
Return the estimated cost of loop L if the given loop is part of the loop nest associated with this o...
Value * getCondition() const
BasicBlock * getSuccessor(unsigned i) const
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:225
iterator end()
Definition DenseMap.h:143
DependenceInfo - This class is the main dependence-analysis driver.
LLVM_ABI std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool UnderRuntimeAssumptions=false)
depends - Tests for a dependence between the Src and Dst instructions.
void applyUpdates(ArrayRef< UpdateType > Updates)
Inform the dominator tree about a sequence of CFG edge insertions and deletions and perform a batch u...
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static LLVM_ABI bool isInductionPHI(PHINode *Phi, const Loop *L, ScalarEvolution *SE, InductionDescriptor &D, ArrayRef< const SCEVPredicate * > NoWrapPreds={}, const SCEV *Expr=nullptr, SmallVectorImpl< Instruction * > *CastsToIgnore=nullptr)
Returns true if Phi is an induction in the loop L.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI void insertBefore(InstListType::iterator InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified position.
LLVM_ABI bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void removeBlockFromLoop(BlockT *BB)
This removes the specified basic block from the current loop, updating the Blocks as appropriate.
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
BlockT * getHeader() const
iterator_range< block_iterator > blocks() const
void addChildLoop(LoopT *NewChild)
Add the specified loop to be a child of this loop.
void addBlockEntry(BlockT *BB)
This adds a basic block directly to the basic block list.
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
iterator begin() const
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
LoopT * removeChildLoop(iterator I)
This removes the specified child from being a subloop of this loop.
void changeTopLevelLoop(LoopT *OldLoop, LoopT *NewLoop)
Replace the specified loop in the top-level loops list with the indicated loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
void changeLoopFor(const BlockT *BB, LoopT *L)
Change the top-level loop that contains BB to the specified loop.
This class represents a loop nest and can be used to query its properties.
static const BasicBlock & skipEmptyBlockUntil(const BasicBlock *From, const BasicBlock *End, bool CheckUniquePred=false)
Recursivelly traverse all empty 'single successor' basic blocks of From (if there are any).
ArrayRef< Loop * > getLoops() const
Get the loops in the nest.
Function * getParent() const
Return the function to which the loop-nest belongs.
Loop & getOutermostLoop() const
Return the outermost loop in the loop nest.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition LoopInfo.cpp:659
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition LoopInfo.cpp:67
StringRef getName() const
Definition LoopInfo.h:407
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
bool isComplete() const
If the PHI node is complete which means all of its parent's predecessors have incoming value in this ...
op_range incoming_values()
void setIncomingBlock(unsigned i, BasicBlock *BB)
void setIncomingValue(unsigned i, Value *V)
static unsigned getIncomingValueNumForOperand(unsigned i)
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Instruction * getExactFPMathInst() const
Returns 1st non-reassociative FP instruction in the PHI node's use-chain.
static LLVM_ABI bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, DemandedBits *DB=nullptr, AssumptionCache *AC=nullptr, DominatorTree *DT=nullptr, ScalarEvolution *SE=nullptr)
Returns true if Phi is a reduction in TheLoop.
LLVM_ABI SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
SCEVUse getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
The main scalar evolution driver.
LLVM_ABI const SCEV * getAbsExpr(const SCEV *Op, bool IsNSW)
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
std::pair< iterator, bool > try_emplace(StringRef Key, ArgsTy &&...Args)
Emplace a new element for the specified key into the map if the key isn't already in the map.
Definition StringMap.h:381
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
constexpr size_t size() const
Get the string size.
Definition StringRef.h:144
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:162
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:552
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:184
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
list_initializer< Ty > list_init(ArrayRef< Ty > Vals)
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI BasicBlock * InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
InsertPreheaderForLoop - Once we discover that a loop doesn't have a preheader, this method is called...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto successors(const MachineBasicBlock *BB)
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
LLVM_ABI bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition LCSSA.cpp:449
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
auto map_range(ContainerTy &&C, FuncTy F)
Return a range that applies F to the elements of C.
Definition STLExtras.h:365
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2025
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FMinimumNum
FP min with llvm.minimumnum semantics.
@ Or
Bitwise or logical OR of integers.
@ FMinimum
FP min with llvm.minimum semantics.
@ Mul
Product of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMaximum
FP max with llvm.maximum semantics.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ FMaximumNum
FP max with llvm.maximumnum semantics.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2011
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition LCSSA.cpp:308
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
auto predecessors(const MachineBasicBlock *BB)
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:368
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:863
LLVM_ABI PreservedAnalyses run(LoopNest &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...