LLVM 23.0.0git
LoopInterchange.cpp
Go to the documentation of this file.
1//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This Pass handles loop interchange transform.
10// This pass interchanges loops to provide a more cache-friendly memory access
11// patterns.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/STLExtras.h"
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/ADT/StringMap.h"
21#include "llvm/ADT/StringRef.h"
30#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/Dominators.h"
33#include "llvm/IR/Function.h"
34#include "llvm/IR/IRBuilder.h"
35#include "llvm/IR/InstrTypes.h"
36#include "llvm/IR/Instruction.h"
38#include "llvm/IR/User.h"
39#include "llvm/IR/Value.h"
42#include "llvm/Support/Debug.h"
49#include <cassert>
50#include <utility>
51#include <vector>
52
53using namespace llvm;
54
55#define DEBUG_TYPE "loop-interchange"
56
57STATISTIC(LoopsInterchanged, "Number of loops interchanged");
58
60 "loop-interchange-threshold", cl::init(0), cl::Hidden,
61 cl::desc("Interchange if you gain more than this number"));
62
63// Maximum number of load-stores that can be handled in the dependency matrix.
65 "loop-interchange-max-meminstr-count", cl::init(64), cl::Hidden,
67 "Maximum number of load-store instructions that should be handled "
68 "in the dependency matrix. Higher value may lead to more interchanges "
69 "at the cost of compile-time"));
70
71namespace {
72
74
75/// A list of direction vectors. Each entry represents a direction vector
76/// corresponding to one or more dependencies existing in the loop nest. The
77/// length of all direction vectors is equal and is N + 1, where N is the depth
78/// of the loop nest. The first N elements correspond to the dependency
79/// direction of each N loops. The last one indicates whether this entry is
80/// forward dependency ('<') or not ('*'). The term "forward" aligns with what
81/// is defined in LoopAccessAnalysis.
82// TODO: Check if we can use a sparse matrix here.
83using CharMatrix = std::vector<std::vector<char>>;
84
85/// Types of rules used in profitability check.
86enum class RuleTy {
87 PerLoopCacheAnalysis,
88 PerInstrOrderCost,
89 ForVectorization,
90 Ignore
91};
92
93} // end anonymous namespace
94
95// Minimum loop depth supported.
97 "loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden,
98 cl::desc("Minimum depth of loop nest considered for the transform"));
99
100// Maximum loop depth supported.
102 "loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden,
103 cl::desc("Maximum depth of loop nest considered for the transform"));
104
105// We prefer cache cost to vectorization by default.
107 "loop-interchange-profitabilities", cl::MiscFlags::CommaSeparated,
109 cl::desc("List of profitability heuristics to be used. They are applied in "
110 "the given order"),
111 cl::list_init<RuleTy>({RuleTy::PerLoopCacheAnalysis,
112 RuleTy::PerInstrOrderCost,
113 RuleTy::ForVectorization}),
114 cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache",
115 "Prioritize loop cache cost"),
116 clEnumValN(RuleTy::PerInstrOrderCost, "instorder",
117 "Prioritize the IVs order of each instruction"),
118 clEnumValN(RuleTy::ForVectorization, "vectorize",
119 "Prioritize vectorization"),
120 clEnumValN(RuleTy::Ignore, "ignore",
121 "Ignore profitability, force interchange (does not "
122 "work with other options)")));
123
124// Support for the inner-loop reduction pattern.
126 "loop-interchange-reduction-to-mem", cl::init(false), cl::Hidden,
127 cl::desc("Support for the inner-loop reduction pattern."));
128
129#ifndef NDEBUG
132 for (RuleTy Rule : Rules) {
133 if (!Set.insert(Rule).second)
134 return false;
135 if (Rule == RuleTy::Ignore)
136 return false;
137 }
138 return true;
139}
140
141static void printDepMatrix(CharMatrix &DepMatrix) {
142 for (auto &Row : DepMatrix) {
143 // Drop the last element because it is a flag indicating whether this is
144 // forward dependency or not, which doesn't affect the legality check.
145 for (char D : drop_end(Row))
146 LLVM_DEBUG(dbgs() << D << " ");
147 LLVM_DEBUG(dbgs() << "\n");
148 }
149}
150
151/// Return true if \p Src appears before \p Dst in the same basic block.
152/// Precondition: \p Src and \Dst are distinct instructions within the same
153/// basic block.
154static bool inThisOrder(const Instruction *Src, const Instruction *Dst) {
155 assert(Src->getParent() == Dst->getParent() && Src != Dst &&
156 "Expected Src and Dst to be different instructions in the same BB");
157
158 bool FoundSrc = false;
159 for (const Instruction &I : *(Src->getParent())) {
160 if (&I == Src) {
161 FoundSrc = true;
162 continue;
163 }
164 if (&I == Dst)
165 return FoundSrc;
166 }
167
168 llvm_unreachable("Dst not found");
169}
170#endif
171
172static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
173 Loop *L, DependenceInfo *DI,
174 ScalarEvolution *SE,
177
178 ValueVector MemInstr;
179
180 // For each block.
181 for (BasicBlock *BB : L->blocks()) {
182 // Scan the BB and collect legal loads and stores.
183 for (Instruction &I : *BB) {
184 if (!isa<Instruction>(I))
185 return false;
186 if (auto *Ld = dyn_cast<LoadInst>(&I)) {
187 if (!Ld->isSimple())
188 return false;
189 MemInstr.push_back(&I);
190 } else if (auto *St = dyn_cast<StoreInst>(&I)) {
191 if (!St->isSimple())
192 return false;
193 MemInstr.push_back(&I);
194 }
195 }
196 }
197
198 LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
199 << " Loads and Stores to analyze\n");
200 if (MemInstr.size() > MaxMemInstrCount) {
201 LLVM_DEBUG(dbgs() << "The transform doesn't support more than "
202 << MaxMemInstrCount << " load/stores in a loop\n");
203 ORE->emit([&]() {
204 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop",
205 L->getStartLoc(), L->getHeader())
206 << "Number of loads/stores exceeded, the supported maximum "
207 "can be increased with option "
208 "-loop-interchange-maxmeminstr-count.";
209 });
210 return false;
211 }
212 ValueVector::iterator I, IE, J, JE;
213
214 // Manage direction vectors that are already seen. Map each direction vector
215 // to an index of DepMatrix at which it is stored.
217
218 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
219 for (J = I, JE = MemInstr.end(); J != JE; ++J) {
220 std::vector<char> Dep;
223 // Ignore Input dependencies.
224 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
225 continue;
226 // Track Output, Flow, and Anti dependencies.
227 if (auto D = DI->depends(Src, Dst)) {
228 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
229 // If the direction vector is negative, normalize it to
230 // make it non-negative.
231 if (D->normalize(SE))
232 LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
233 LLVM_DEBUG(StringRef DepType =
234 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
235 dbgs() << "Found " << DepType
236 << " dependency between Src and Dst\n"
237 << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
238 unsigned Levels = D->getLevels();
239 char Direction;
240 for (unsigned II = 1; II <= Levels; ++II) {
241 // `DVEntry::LE` is converted to `*`. This is because `LE` means `<`
242 // or `=`, for which we don't have an equivalent representation, so
243 // that the conservative approximation is necessary. The same goes for
244 // `DVEntry::GE`.
245 // TODO: Use of fine-grained expressions allows for more accurate
246 // analysis.
247 unsigned Dir = D->getDirection(II);
248 if (Dir == Dependence::DVEntry::LT)
249 Direction = '<';
250 else if (Dir == Dependence::DVEntry::GT)
251 Direction = '>';
252 else if (Dir == Dependence::DVEntry::EQ)
253 Direction = '=';
254 else
255 Direction = '*';
256 Dep.push_back(Direction);
257 }
258
259 // If the Dependence object doesn't have any information, fill the
260 // dependency vector with '*'.
261 if (D->isConfused()) {
262 assert(Dep.empty() && "Expected empty dependency vector");
263 Dep.assign(Level, '*');
264 }
265
266 while (Dep.size() != Level) {
267 Dep.push_back('I');
268 }
269
270 // If all the elements of any direction vector have only '*', legality
271 // can't be proven. Exit early to save compile time.
272 if (all_of(Dep, equal_to('*'))) {
273 ORE->emit([&]() {
274 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
275 L->getStartLoc(), L->getHeader())
276 << "All loops have dependencies in all directions.";
277 });
278 return false;
279 }
280
281 // Test whether the dependency is forward or not.
282 bool IsKnownForward = true;
283 if (Src->getParent() != Dst->getParent()) {
284 // In general, when Src and Dst are in different BBs, the execution
285 // order of them within a single iteration is not guaranteed. Treat
286 // conservatively as not-forward dependency in this case.
287 IsKnownForward = false;
288 } else {
289 // Src and Dst are in the same BB. If they are the different
290 // instructions, Src should appear before Dst in the BB as they are
291 // stored to MemInstr in that order.
292 assert((Src == Dst || inThisOrder(Src, Dst)) &&
293 "Unexpected instructions");
294
295 // If the Dependence object is reversed (due to normalization), it
296 // represents the dependency from Dst to Src, meaning it is a backward
297 // dependency. Otherwise it should be a forward dependency.
298 bool IsReversed = D->getSrc() != Src;
299 if (IsReversed)
300 IsKnownForward = false;
301 }
302
303 // Initialize the last element. Assume forward dependencies only; it
304 // will be updated later if there is any non-forward dependency.
305 Dep.push_back('<');
306
307 // The last element should express the "summary" among one or more
308 // direction vectors whose first N elements are the same (where N is
309 // the depth of the loop nest). Hence we exclude the last element from
310 // the Seen map.
311 auto [Ite, Inserted] = Seen.try_emplace(
312 StringRef(Dep.data(), Dep.size() - 1), DepMatrix.size());
313
314 // Make sure we only add unique entries to the dependency matrix.
315 if (Inserted)
316 DepMatrix.push_back(Dep);
317
318 // If we cannot prove that this dependency is forward, change the last
319 // element of the corresponding entry. Since a `[... *]` dependency
320 // includes a `[... <]` dependency, we do not need to keep both and
321 // change the existing entry instead.
322 if (!IsKnownForward)
323 DepMatrix[Ite->second].back() = '*';
324 }
325 }
326 }
327
328 return true;
329}
330
331// A loop is moved from index 'from' to an index 'to'. Update the Dependence
332// matrix by exchanging the two columns.
333static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
334 unsigned ToIndx) {
335 for (auto &Row : DepMatrix)
336 std::swap(Row[ToIndx], Row[FromIndx]);
337}
338
339// Check if a direction vector is lexicographically positive. Return true if it
340// is positive, nullopt if it is "zero", otherwise false.
341// [Theorem] A permutation of the loops in a perfect nest is legal if and only
342// if the direction matrix, after the same permutation is applied to its
343// columns, has no ">" direction as the leftmost non-"=" direction in any row.
344static std::optional<bool>
345isLexicographicallyPositive(ArrayRef<char> DV, unsigned Begin, unsigned End) {
346 for (unsigned char Direction : DV.slice(Begin, End - Begin)) {
347 if (Direction == '<')
348 return true;
349 if (Direction == '>' || Direction == '*')
350 return false;
351 }
352 return std::nullopt;
353}
354
355// Checks if it is legal to interchange 2 loops.
356static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
357 unsigned InnerLoopId,
358 unsigned OuterLoopId) {
359 unsigned NumRows = DepMatrix.size();
360 std::vector<char> Cur;
361 // For each row check if it is valid to interchange.
362 for (unsigned Row = 0; Row < NumRows; ++Row) {
363 // Create temporary DepVector check its lexicographical order
364 // before and after swapping OuterLoop vs InnerLoop
365 Cur = DepMatrix[Row];
366
367 // If the surrounding loops already ensure that the direction vector is
368 // lexicographically positive, nothing within the loop will be able to break
369 // the dependence. In such a case we can skip the subsequent check.
370 if (isLexicographicallyPositive(Cur, 0, OuterLoopId) == true)
371 continue;
372
373 // Check if the direction vector is lexicographically positive (or zero)
374 // for both before/after exchanged. Ignore the last element because it
375 // doesn't affect the legality.
376 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
377 return false;
378 std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
379 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
380 return false;
381 }
382 return true;
383}
384
385static void populateWorklist(Loop &L, LoopVector &LoopList) {
386 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
387 << L.getHeader()->getParent()->getName() << " Loop: %"
388 << L.getHeader()->getName() << '\n');
389 assert(LoopList.empty() && "LoopList should initially be empty!");
390 Loop *CurrentLoop = &L;
391 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
392 while (!Vec->empty()) {
393 // The current loop has multiple subloops in it hence it is not tightly
394 // nested.
395 // Discard all loops above it added into Worklist.
396 if (Vec->size() != 1) {
397 LoopList = {};
398 return;
399 }
400
401 LoopList.push_back(CurrentLoop);
402 CurrentLoop = Vec->front();
403 Vec = &CurrentLoop->getSubLoops();
404 }
405 LoopList.push_back(CurrentLoop);
406}
407
410 unsigned LoopNestDepth = LoopList.size();
411 if (LoopNestDepth < MinLoopNestDepth || LoopNestDepth > MaxLoopNestDepth) {
412 LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth
413 << ", the supported range is [" << MinLoopNestDepth
414 << ", " << MaxLoopNestDepth << "].\n");
415 Loop *OuterLoop = LoopList.front();
416 ORE.emit([&]() {
417 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoopNestDepth",
418 OuterLoop->getStartLoc(),
419 OuterLoop->getHeader())
420 << "Unsupported depth of loop nest, the supported range is ["
421 << std::to_string(MinLoopNestDepth) << ", "
422 << std::to_string(MaxLoopNestDepth) << "].\n";
423 });
424 return false;
425 }
426 return true;
427}
428
430 ArrayRef<Loop *> LoopList) {
431 for (Loop *L : LoopList) {
432 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
433 if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
434 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
435 return false;
436 }
437 if (L->getNumBackEdges() != 1) {
438 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
439 return false;
440 }
441 if (!L->getExitingBlock()) {
442 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
443 return false;
444 }
445 }
446 return true;
447}
448
449namespace {
450
451/// LoopInterchangeLegality checks if it is legal to interchange the loop.
452class LoopInterchangeLegality {
453public:
454 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
455 OptimizationRemarkEmitter *ORE, DominatorTree *DT)
456 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), DT(DT), ORE(ORE) {}
457
458 /// Check if the loops can be interchanged.
459 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
460 CharMatrix &DepMatrix);
461
462 /// Discover induction PHIs in the header of \p L. Induction
463 /// PHIs are added to \p Inductions.
464 bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions);
465
466 /// Check if the loop structure is understood. We do not handle triangular
467 /// loops for now.
468 bool isLoopStructureUnderstood();
469
470 bool currentLimitations();
471
472 const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
473 return OuterInnerReductions;
474 }
475
476 const ArrayRef<PHINode *> getInnerLoopInductions() const {
477 return InnerLoopInductions;
478 }
479
480 ArrayRef<Instruction *> getHasNoWrapReductions() const {
481 return HasNoWrapReductions;
482 }
483
484 /// Record reductions in the inner loop. Currently supported reductions:
485 /// - initialized from a constant.
486 /// - reduction PHI node has only one user.
487 /// - located in the innermost loop.
488 struct InnerReduction {
489 /// The reduction itself.
490 PHINode *Reduction;
491 Value *Init;
492 Value *Next;
493 /// The Lcssa PHI.
494 PHINode *LcssaPhi;
495 /// Store reduction result into memory object.
496 StoreInst *LcssaStore;
497 /// The memory Location.
498 Value *MemRef;
499 Type *ElemTy;
500 };
501
502 ArrayRef<InnerReduction> getInnerReductions() const {
503 return InnerReductions;
504 }
505
506private:
507 bool tightlyNested(Loop *Outer, Loop *Inner);
508 bool containsUnsafeInstructions(BasicBlock *BB, Instruction *Skip);
509
510 /// Discover induction and reduction PHIs in the header of \p L. Induction
511 /// PHIs are added to \p Inductions, reductions are added to
512 /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
513 /// to be passed as \p InnerLoop.
514 bool findInductionAndReductions(Loop *L,
515 SmallVector<PHINode *, 8> &Inductions,
516 Loop *InnerLoop);
517
518 /// Detect and record the reduction of the inner loop. Add them to
519 /// InnerReductions.
520 ///
521 /// innerloop:
522 /// Re = phi<0.0, Next>
523 /// Next = Re op ...
524 /// OuterLoopLatch:
525 /// Lcssa = phi<Next> ; lcssa phi
526 /// store Lcssa, MemRef ; LcssaStore
527 ///
528 bool isInnerReduction(Loop *L, PHINode *Phi,
529 SmallVectorImpl<Instruction *> &HasNoWrapInsts);
530
531 Loop *OuterLoop;
532 Loop *InnerLoop;
533
534 ScalarEvolution *SE;
535 DominatorTree *DT;
536
537 /// Interface to emit optimization remarks.
538 OptimizationRemarkEmitter *ORE;
539
540 /// Set of reduction PHIs taking part of a reduction across the inner and
541 /// outer loop.
542 SmallPtrSet<PHINode *, 4> OuterInnerReductions;
543
544 /// Set of inner loop induction PHIs
545 SmallVector<PHINode *, 8> InnerLoopInductions;
546
547 /// Hold instructions that have nuw/nsw flags and involved in reductions,
548 /// like integer addition/multiplication. Those flags must be dropped when
549 /// interchanging the loops.
550 SmallVector<Instruction *, 4> HasNoWrapReductions;
551
552 /// Vector of reductions in the inner loop.
553 SmallVector<InnerReduction, 8> InnerReductions;
554};
555
556/// Manages information utilized by the profitability check for cache. The main
557/// purpose of this class is to delay the computation of CacheCost until it is
558/// actually needed.
559class CacheCostManager {
560 Loop *OutermostLoop;
561 LoopStandardAnalysisResults *AR;
562 DependenceInfo *DI;
563
564 /// CacheCost for \ref OutermostLoop. Once it is computed, it is cached. Note
565 /// that the result can be nullptr.
566 std::optional<std::unique_ptr<CacheCost>> CC;
567
568 /// Maps each loop to an index representing the optimal position within the
569 /// loop-nest, as determined by the cache cost analysis.
570 DenseMap<const Loop *, unsigned> CostMap;
571
572 void computeIfUnitinialized();
573
574public:
575 CacheCostManager(Loop *OutermostLoop, LoopStandardAnalysisResults *AR,
576 DependenceInfo *DI)
577 : OutermostLoop(OutermostLoop), AR(AR), DI(DI) {}
578 CacheCost *getCacheCost();
579 const DenseMap<const Loop *, unsigned> &getCostMap();
580};
581
582/// LoopInterchangeProfitability checks if it is profitable to interchange the
583/// loop.
584class LoopInterchangeProfitability {
585public:
586 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
587 OptimizationRemarkEmitter *ORE)
588 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
589
590 /// Check if the loop interchange is profitable.
591 bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
592 unsigned InnerLoopId, unsigned OuterLoopId,
593 CharMatrix &DepMatrix, CacheCostManager &CCM);
594
595private:
596 int getInstrOrderCost();
597 std::optional<bool> isProfitablePerLoopCacheAnalysis(
598 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC);
599 std::optional<bool> isProfitablePerInstrOrderCost();
600 std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
601 unsigned OuterLoopId,
602 CharMatrix &DepMatrix);
603 Loop *OuterLoop;
604 Loop *InnerLoop;
605
606 /// Scev analysis.
607 ScalarEvolution *SE;
608
609 /// Interface to emit optimization remarks.
610 OptimizationRemarkEmitter *ORE;
611};
612
613/// LoopInterchangeTransform interchanges the loop.
614class LoopInterchangeTransform {
615public:
616 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
617 LoopInfo *LI, DominatorTree *DT,
618 const LoopInterchangeLegality &LIL)
619 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
620
621 /// Interchange OuterLoop and InnerLoop.
622 bool transform(ArrayRef<Instruction *> DropNoWrapInsts);
623 void reduction2Memory();
624 void restructureLoops(Loop *NewInner, Loop *NewOuter,
625 BasicBlock *OrigInnerPreHeader,
626 BasicBlock *OrigOuterPreHeader);
627 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
628
629private:
630 bool adjustLoopLinks();
631 bool adjustLoopBranches();
632
633 Loop *OuterLoop;
634 Loop *InnerLoop;
635
636 /// Scev analysis.
637 ScalarEvolution *SE;
638
639 LoopInfo *LI;
640 DominatorTree *DT;
641
642 const LoopInterchangeLegality &LIL;
643};
644
645struct LoopInterchange {
646 ScalarEvolution *SE = nullptr;
647 LoopInfo *LI = nullptr;
648 DependenceInfo *DI = nullptr;
649 DominatorTree *DT = nullptr;
650 LoopStandardAnalysisResults *AR = nullptr;
651
652 /// Interface to emit optimization remarks.
653 OptimizationRemarkEmitter *ORE;
654
655 LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
656 DominatorTree *DT, LoopStandardAnalysisResults *AR,
657 OptimizationRemarkEmitter *ORE)
658 : SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE) {}
659
660 bool run(Loop *L) {
661 if (L->getParentLoop())
662 return false;
663 SmallVector<Loop *, 8> LoopList;
664 populateWorklist(*L, LoopList);
665 return processLoopList(LoopList);
666 }
667
668 bool run(LoopNest &LN) {
669 SmallVector<Loop *, 8> LoopList(LN.getLoops());
670 for (unsigned I = 1; I < LoopList.size(); ++I)
671 if (LoopList[I]->getParentLoop() != LoopList[I - 1])
672 return false;
673 return processLoopList(LoopList);
674 }
675
676 unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
677 // TODO: Add a better heuristic to select the loop to be interchanged based
678 // on the dependence matrix. Currently we select the innermost loop.
679 return LoopList.size() - 1;
680 }
681
682 bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
683 bool Changed = false;
684
685 // Ensure proper loop nest depth.
686 assert(hasSupportedLoopDepth(LoopList, *ORE) &&
687 "Unsupported depth of loop nest.");
688
689 unsigned LoopNestDepth = LoopList.size();
690
691 LLVM_DEBUG({
692 dbgs() << "Processing LoopList of size = " << LoopNestDepth
693 << " containing the following loops:\n";
694 for (auto *L : LoopList) {
695 dbgs() << " - ";
696 L->print(dbgs());
697 }
698 });
699
700 CharMatrix DependencyMatrix;
701 Loop *OuterMostLoop = *(LoopList.begin());
702 if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
703 OuterMostLoop, DI, SE, ORE)) {
704 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
705 return false;
706 }
707
708 LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n";
709 printDepMatrix(DependencyMatrix));
710
711 // Get the Outermost loop exit.
712 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
713 if (!LoopNestExit) {
714 LLVM_DEBUG(dbgs() << "OuterMostLoop '" << OuterMostLoop->getName()
715 << "' needs an unique exit block");
716 return false;
717 }
718
719 unsigned SelecLoopId = selectLoopForInterchange(LoopList);
720 CacheCostManager CCM(LoopList[0], AR, DI);
721 // We try to achieve the globally optimal memory access for the loopnest,
722 // and do interchange based on a bubble-sort fasion. We start from
723 // the innermost loop, move it outwards to the best possible position
724 // and repeat this process.
725 for (unsigned j = SelecLoopId; j > 0; j--) {
726 bool ChangedPerIter = false;
727 for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
728 bool Interchanged =
729 processLoop(LoopList, i, i - 1, DependencyMatrix, CCM);
730 ChangedPerIter |= Interchanged;
731 Changed |= Interchanged;
732 }
733 // Early abort if there was no interchange during an entire round of
734 // moving loops outwards.
735 if (!ChangedPerIter)
736 break;
737 }
738 return Changed;
739 }
740
741 bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
742 unsigned OuterLoopId,
743 std::vector<std::vector<char>> &DependencyMatrix,
744 CacheCostManager &CCM) {
745 Loop *OuterLoop = LoopList[OuterLoopId];
746 Loop *InnerLoop = LoopList[InnerLoopId];
747 LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
748 << " and OuterLoopId = " << OuterLoopId << "\n");
749 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE, DT);
750 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
751 LLVM_DEBUG(dbgs() << "Cannot prove legality, not interchanging loops '"
752 << OuterLoop->getName() << "' and '"
753 << InnerLoop->getName() << "'\n");
754 return false;
755 }
756 LLVM_DEBUG(dbgs() << "Loops '" << OuterLoop->getName() << "' and '"
757 << InnerLoop->getName()
758 << "' are legal to interchange\n");
759 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
760 if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
761 DependencyMatrix, CCM)) {
762 LLVM_DEBUG(dbgs() << "Interchanging loops '" << OuterLoop->getName()
763 << "' and '" << InnerLoop->getName()
764 << "' not profitable.\n");
765 return false;
766 }
767
768 ORE->emit([&]() {
769 return OptimizationRemark(DEBUG_TYPE, "Interchanged",
770 InnerLoop->getStartLoc(),
771 InnerLoop->getHeader())
772 << "Loop interchanged with enclosing loop.";
773 });
774
775 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
776 LIT.transform(LIL.getHasNoWrapReductions());
777 LLVM_DEBUG(dbgs() << "Loops interchanged: outer loop '"
778 << OuterLoop->getName() << "' and inner loop '"
779 << InnerLoop->getName() << "'\n");
780 LoopsInterchanged++;
781
782 llvm::formLCSSARecursively(*OuterLoop, *DT, LI, SE);
783
784 // Loops interchanged, update LoopList accordingly.
785 std::swap(LoopList[OuterLoopId], LoopList[InnerLoopId]);
786 // Update the DependencyMatrix
787 interChangeDependencies(DependencyMatrix, InnerLoopId, OuterLoopId);
788
789 LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
790 printDepMatrix(DependencyMatrix));
791
792 return true;
793 }
794};
795
796} // end anonymous namespace
797
798bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB,
799 Instruction *Skip) {
800 return any_of(*BB, [Skip](const Instruction &I) {
801 if (&I == Skip)
802 return false;
803 return I.mayHaveSideEffects() || I.mayReadFromMemory();
804 });
805}
806
807bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
808 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
809 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
810 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
811
812 LLVM_DEBUG(dbgs() << "Checking if loops '" << OuterLoop->getName()
813 << "' and '" << InnerLoop->getName()
814 << "' are tightly nested\n");
815
816 // A perfectly nested loop will not have any branch in between the outer and
817 // inner block i.e. outer header will branch to either inner preheader and
818 // outerloop latch.
819 for (BasicBlock *Succ : successors(OuterLoopHeader))
820 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
821 Succ != OuterLoopLatch)
822 return false;
823
824 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
825
826 // The inner loop reduction pattern requires storing the LCSSA PHI in
827 // the OuterLoop Latch. Therefore, when reduction2Memory is enabled, skip
828 // that store during checks.
829 Instruction *Skip = nullptr;
830 assert(InnerReductions.size() <= 1 &&
831 "So far we only support at most one reduction.");
832 if (InnerReductions.size() == 1)
833 Skip = InnerReductions[0].LcssaStore;
834
835 // We do not have any basic block in between now make sure the outer header
836 // and outer loop latch doesn't contain any unsafe instructions.
837 if (containsUnsafeInstructions(OuterLoopHeader, Skip) ||
838 containsUnsafeInstructions(OuterLoopLatch, Skip))
839 return false;
840
841 // Also make sure the inner loop preheader does not contain any unsafe
842 // instructions. Note that all instructions in the preheader will be moved to
843 // the outer loop header when interchanging.
844 if (InnerLoopPreHeader != OuterLoopHeader &&
845 containsUnsafeInstructions(InnerLoopPreHeader, Skip))
846 return false;
847
848 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
849 // Ensure the inner loop exit block flows to the outer loop latch possibly
850 // through empty blocks.
851 const BasicBlock &SuccInner =
852 LoopNest::skipEmptyBlockUntil(InnerLoopExit, OuterLoopLatch);
853 if (&SuccInner != OuterLoopLatch) {
854 LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
855 << " does not lead to the outer loop latch.\n";);
856 return false;
857 }
858 // The inner loop exit block does flow to the outer loop latch and not some
859 // other BBs, now make sure it contains safe instructions, since it will be
860 // moved into the (new) inner loop after interchange.
861 if (containsUnsafeInstructions(InnerLoopExit, Skip))
862 return false;
863
864 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
865 // We have a perfect loop nest.
866 return true;
867}
868
869bool LoopInterchangeLegality::isLoopStructureUnderstood() {
870 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
871 for (PHINode *InnerInduction : InnerLoopInductions) {
872 unsigned Num = InnerInduction->getNumOperands();
873 for (unsigned i = 0; i < Num; ++i) {
874 Value *Val = InnerInduction->getOperand(i);
875 if (isa<Constant>(Val))
876 continue;
878 if (!I)
879 return false;
880 // TODO: Handle triangular loops.
881 // e.g. for(int i=0;i<N;i++)
882 // for(int j=i;j<N;j++)
883 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
884 if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
885 InnerLoopPreheader &&
886 !OuterLoop->isLoopInvariant(I)) {
887 return false;
888 }
889 }
890 }
891
892 // TODO: Handle triangular loops of another form.
893 // e.g. for(int i=0;i<N;i++)
894 // for(int j=0;j<i;j++)
895 // or,
896 // for(int i=0;i<N;i++)
897 // for(int j=0;j*i<N;j++)
898 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
899 CondBrInst *InnerLoopLatchBI =
900 dyn_cast<CondBrInst>(InnerLoopLatch->getTerminator());
901 if (!InnerLoopLatchBI)
902 return false;
903 if (CmpInst *InnerLoopCmp =
904 dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition())) {
905 Value *Op0 = InnerLoopCmp->getOperand(0);
906 Value *Op1 = InnerLoopCmp->getOperand(1);
907
908 // LHS and RHS of the inner loop exit condition, e.g.,
909 // in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
910 Value *Left = nullptr;
911 Value *Right = nullptr;
912
913 // Check if V only involves inner loop induction variable.
914 // Return true if V is InnerInduction, or a cast from
915 // InnerInduction, or a binary operator that involves
916 // InnerInduction and a constant.
917 std::function<bool(Value *)> IsPathToInnerIndVar;
918 IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
919 if (llvm::is_contained(InnerLoopInductions, V))
920 return true;
921 if (isa<Constant>(V))
922 return true;
924 if (!I)
925 return false;
926 if (isa<CastInst>(I))
927 return IsPathToInnerIndVar(I->getOperand(0));
929 return IsPathToInnerIndVar(I->getOperand(0)) &&
930 IsPathToInnerIndVar(I->getOperand(1));
931 return false;
932 };
933
934 // In case of multiple inner loop indvars, it is okay if LHS and RHS
935 // are both inner indvar related variables.
936 if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
937 return true;
938
939 // Otherwise we check if the cmp instruction compares an inner indvar
940 // related variable (Left) with a outer loop invariant (Right).
941 if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {
942 Left = Op0;
943 Right = Op1;
944 } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {
945 Left = Op1;
946 Right = Op0;
947 }
948
949 if (Left == nullptr)
950 return false;
951
952 const SCEV *S = SE->getSCEV(Right);
953 if (!SE->isLoopInvariant(S, OuterLoop))
954 return false;
955 }
956
957 return true;
958}
959
960// If SV is a LCSSA PHI node with a single incoming value, return the incoming
961// value.
962static Value *followLCSSA(Value *SV) {
964 if (!PHI)
965 return SV;
966
967 if (PHI->getNumIncomingValues() != 1)
968 return SV;
969 return followLCSSA(PHI->getIncomingValue(0));
970}
971
973 SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
976 // Detect floating point reduction only when it can be reordered.
977 if (RD.getExactFPMathInst() != nullptr)
978 return false;
979
981 switch (RK) {
982 case RecurKind::Or:
983 case RecurKind::And:
984 case RecurKind::Xor:
985 case RecurKind::SMin:
986 case RecurKind::SMax:
987 case RecurKind::UMin:
988 case RecurKind::UMax:
989 case RecurKind::FAdd:
990 case RecurKind::FMul:
991 case RecurKind::FMin:
992 case RecurKind::FMax:
998 case RecurKind::AnyOf:
999 return true;
1000
1001 // Change the order of integer addition/multiplication may change the
1002 // semantics. Consider the following case:
1003 //
1004 // int A[2][2] = {{ INT_MAX, INT_MAX }, { INT_MIN, INT_MIN }};
1005 // int sum = 0;
1006 // for (int i = 0; i < 2; i++)
1007 // for (int j = 0; j < 2; j++)
1008 // sum += A[j][i];
1009 //
1010 // If the above loops are exchanged, the addition will cause an
1011 // overflow. To prevent this, we must drop the nuw/nsw flags from the
1012 // addition/multiplication instructions when we actually exchanges the
1013 // loops.
1014 case RecurKind::Add:
1015 case RecurKind::Mul: {
1016 unsigned OpCode = RecurrenceDescriptor::getOpcode(RK);
1018
1019 // Bail out when we fail to collect reduction instructions chain.
1020 if (Ops.empty())
1021 return false;
1022
1023 for (Instruction *I : Ops) {
1024 assert(I->getOpcode() == OpCode &&
1025 "Expected the instruction to be the reduction operation");
1026 (void)OpCode;
1027
1028 // If the instruction has nuw/nsw flags, we must drop them when the
1029 // transformation is actually performed.
1030 if (I->hasNoSignedWrap() || I->hasNoUnsignedWrap())
1031 HasNoWrapInsts.push_back(I);
1032 }
1033 return true;
1034 }
1035
1036 default:
1037 return false;
1038 }
1039 } else
1040 return false;
1041}
1042
1043// Check V's users to see if it is involved in a reduction in L.
1044static PHINode *
1046 SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
1047 // Reduction variables cannot be constants.
1048 if (isa<Constant>(V))
1049 return nullptr;
1050
1051 for (Value *User : V->users()) {
1053 if (PHI->getNumIncomingValues() == 1)
1054 continue;
1055
1056 if (checkReductionKind(L, PHI, HasNoWrapInsts))
1057 return PHI;
1058 else
1059 return nullptr;
1060 }
1061 }
1062
1063 return nullptr;
1064}
1065
1066bool LoopInterchangeLegality::isInnerReduction(
1067 Loop *L, PHINode *Phi, SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
1068
1069 // Only support reduction2Mem when the loop nest to be interchanged is
1070 // the innermost two loops.
1071 if (!L->isInnermost()) {
1072 LLVM_DEBUG(dbgs() << "Only supported when the loop is the innermost.\n");
1073 ORE->emit([&]() {
1074 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1075 L->getStartLoc(), L->getHeader())
1076 << "Only supported when the loop is the innermost.";
1077 });
1078 return false;
1079 }
1080
1081 if (Phi->getNumIncomingValues() != 2)
1082 return false;
1083
1084 Value *Init = Phi->getIncomingValueForBlock(L->getLoopPreheader());
1085 Value *Next = Phi->getIncomingValueForBlock(L->getLoopLatch());
1086
1087 // So far only supports constant initial value.
1088 if (!isa<Constant>(Init)) {
1089 LLVM_DEBUG(
1090 dbgs()
1091 << "Only supported for the reduction with a constant initial value.\n");
1092 ORE->emit([&]() {
1093 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1094 L->getStartLoc(), L->getHeader())
1095 << "Only supported for the reduction with a constant initial "
1096 "value.";
1097 });
1098 return false;
1099 }
1100
1101 // The reduction result must live in the inner loop.
1102 if (Instruction *I = dyn_cast<Instruction>(Next)) {
1103 BasicBlock *BB = I->getParent();
1104 if (!L->contains(BB))
1105 return false;
1106 }
1107
1108 // The reduction should have only one user.
1109 if (!Phi->hasOneUser())
1110 return false;
1111
1112 // Check the reduction kind.
1113 if (!checkReductionKind(L, Phi, HasNoWrapInsts))
1114 return false;
1115
1116 // Find lcssa_phi in OuterLoop's Latch
1117 BasicBlock *ExitBlock = L->getExitBlock();
1118 if (!ExitBlock)
1119 return false;
1120
1121 PHINode *Lcssa = NULL;
1122 for (auto *U : Next->users()) {
1123 if (auto *P = dyn_cast<PHINode>(U)) {
1124 if (P == Phi)
1125 continue;
1126
1127 if (Lcssa == NULL && P->getParent() == ExitBlock &&
1128 P->getIncomingValueForBlock(L->getLoopLatch()) == Next)
1129 Lcssa = P;
1130 else
1131 return false;
1132 } else
1133 return false;
1134 }
1135 if (!Lcssa)
1136 return false;
1137
1138 if (!Lcssa->hasOneUser()) {
1139 LLVM_DEBUG(dbgs() << "Only supported when the reduction is used once in "
1140 "the outer loop.\n");
1141 ORE->emit([&]() {
1142 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1143 L->getStartLoc(), L->getHeader())
1144 << "Only supported when the reduction is used once in the outer "
1145 "loop.";
1146 });
1147 return false;
1148 }
1149
1150 StoreInst *LcssaStore =
1152 if (!LcssaStore || LcssaStore->getParent() != ExitBlock)
1153 return false;
1154
1155 Value *MemRef = LcssaStore->getOperand(1);
1156 Type *ElemTy = LcssaStore->getOperand(0)->getType();
1157
1158 // LcssaStore stores the reduction result in BB.
1159 // When the reduction is initialized from a constant value, we need to load
1160 // from the memory object into the target basic block of the inner loop. This
1161 // means the memory reference was used prematurely. So we must ensure that the
1162 // memory reference does not dominate the target basic block.
1163 // TODO: Move the memory reference definition into the loop header.
1164 if (!DT->dominates(dyn_cast<Instruction>(MemRef), L->getHeader())) {
1165 LLVM_DEBUG(dbgs() << "Only supported when memory reference dominate "
1166 "the inner loop.\n");
1167 ORE->emit([&]() {
1168 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1169 L->getStartLoc(), L->getHeader())
1170 << "Only supported when memory reference dominate the inner "
1171 "loop.";
1172 });
1173 return false;
1174 }
1175
1176 // Found a reduction in the inner loop.
1177 InnerReduction SR;
1178 SR.Reduction = Phi;
1179 SR.Init = Init;
1180 SR.Next = Next;
1181 SR.LcssaPhi = Lcssa;
1182 SR.LcssaStore = LcssaStore;
1183 SR.MemRef = MemRef;
1184 SR.ElemTy = ElemTy;
1185
1186 InnerReductions.push_back(SR);
1187 return true;
1188}
1189
1190bool LoopInterchangeLegality::findInductionAndReductions(
1191 Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
1192 if (!L->getLoopLatch() || !L->getLoopPredecessor())
1193 return false;
1194 for (PHINode &PHI : L->getHeader()->phis()) {
1195 InductionDescriptor ID;
1197 Inductions.push_back(&PHI);
1198 else {
1199 // PHIs in inner loops need to be part of a reduction in the outer loop,
1200 // discovered when checking the PHIs of the outer loop earlier.
1201 if (!InnerLoop) {
1202 if (OuterInnerReductions.count(&PHI)) {
1203 LLVM_DEBUG(dbgs() << "Found a reduction across the outer loop.\n");
1204 } else if (EnableReduction2Memory &&
1205 isInnerReduction(L, &PHI, HasNoWrapReductions)) {
1206 LLVM_DEBUG(dbgs() << "Found a reduction in the inner loop: \n"
1207 << PHI << '\n');
1208 } else
1209 return false;
1210 } else {
1211 assert(PHI.getNumIncomingValues() == 2 &&
1212 "Phis in loop header should have exactly 2 incoming values");
1213 // Check if we have a PHI node in the outer loop that has a reduction
1214 // result from the inner loop as an incoming value.
1215 Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
1216 PHINode *InnerRedPhi =
1217 findInnerReductionPhi(InnerLoop, V, HasNoWrapReductions);
1218 if (!InnerRedPhi ||
1219 !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
1220 LLVM_DEBUG(
1221 dbgs()
1222 << "Failed to recognize PHI as an induction or reduction.\n");
1223 return false;
1224 }
1225 OuterInnerReductions.insert(&PHI);
1226 OuterInnerReductions.insert(InnerRedPhi);
1227 }
1228 }
1229 }
1230
1231 // For now we only support at most one reduction.
1232 if (InnerReductions.size() > 1) {
1233 LLVM_DEBUG(dbgs() << "Only supports at most one reduction.\n");
1234 ORE->emit([&]() {
1235 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1236 L->getStartLoc(), L->getHeader())
1237 << "Only supports at most one reduction.";
1238 });
1239 return false;
1240 }
1241
1242 return true;
1243}
1244
1245// This function indicates the current limitations in the transform as a result
1246// of which we do not proceed.
1247bool LoopInterchangeLegality::currentLimitations() {
1248 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1249
1250 // transform currently expects the loop latches to also be the exiting
1251 // blocks.
1252 if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
1253 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
1254 !isa<CondBrInst>(InnerLoopLatch->getTerminator()) ||
1255 !isa<CondBrInst>(OuterLoop->getLoopLatch()->getTerminator())) {
1256 LLVM_DEBUG(
1257 dbgs() << "Loops where the latch is not the exiting block are not"
1258 << " supported currently.\n");
1259 ORE->emit([&]() {
1260 return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
1261 OuterLoop->getStartLoc(),
1262 OuterLoop->getHeader())
1263 << "Loops where the latch is not the exiting block cannot be"
1264 " interchange currently.";
1265 });
1266 return true;
1267 }
1268
1269 SmallVector<PHINode *, 8> Inductions;
1270 if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
1271 LLVM_DEBUG(
1272 dbgs() << "Only outer loops with induction or reduction PHI nodes "
1273 << "are supported currently.\n");
1274 ORE->emit([&]() {
1275 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
1276 OuterLoop->getStartLoc(),
1277 OuterLoop->getHeader())
1278 << "Only outer loops with induction or reduction PHI nodes can be"
1279 " interchanged currently.";
1280 });
1281 return true;
1282 }
1283
1284 Inductions.clear();
1285 // For multi-level loop nests, make sure that all phi nodes for inner loops
1286 // at all levels can be recognized as a induction or reduction phi. Bail out
1287 // if a phi node at a certain nesting level cannot be properly recognized.
1288 Loop *CurLevelLoop = OuterLoop;
1289 while (!CurLevelLoop->getSubLoops().empty()) {
1290 // We already made sure that the loop nest is tightly nested.
1291 CurLevelLoop = CurLevelLoop->getSubLoops().front();
1292 if (!findInductionAndReductions(CurLevelLoop, Inductions, nullptr)) {
1293 LLVM_DEBUG(
1294 dbgs() << "Only inner loops with induction or reduction PHI nodes "
1295 << "are supported currently.\n");
1296 ORE->emit([&]() {
1297 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
1298 CurLevelLoop->getStartLoc(),
1299 CurLevelLoop->getHeader())
1300 << "Only inner loops with induction or reduction PHI nodes can be"
1301 " interchange currently.";
1302 });
1303 return true;
1304 }
1305 }
1306
1307 // TODO: Triangular loops are not handled for now.
1308 if (!isLoopStructureUnderstood()) {
1309 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
1310 ORE->emit([&]() {
1311 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
1312 InnerLoop->getStartLoc(),
1313 InnerLoop->getHeader())
1314 << "Inner loop structure not understood currently.";
1315 });
1316 return true;
1317 }
1318
1319 return false;
1320}
1321
1322bool LoopInterchangeLegality::findInductions(
1323 Loop *L, SmallVectorImpl<PHINode *> &Inductions) {
1324 for (PHINode &PHI : L->getHeader()->phis()) {
1325 InductionDescriptor ID;
1327 Inductions.push_back(&PHI);
1328 }
1329 return !Inductions.empty();
1330}
1331
1332// We currently only support LCSSA PHI nodes in the inner loop exit, if their
1333// users are either reduction PHIs or PHIs outside the outer loop (which means
1334// the we are only interested in the final value after the loop).
1335static bool
1338 PHINode *LcssaReduction) {
1339 BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
1340 for (PHINode &PHI : InnerExit->phis()) {
1341 // The reduction LCSSA PHI will have only one incoming block, which comes
1342 // from the loop latch.
1343 if (PHI.getNumIncomingValues() > 1)
1344 return false;
1345 if (&PHI == LcssaReduction)
1346 return true;
1347 if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
1348 PHINode *PN = dyn_cast<PHINode>(U);
1349 return !PN ||
1350 (!Reductions.count(PN) && OuterL->contains(PN->getParent()));
1351 })) {
1352 return false;
1353 }
1354 }
1355 return true;
1356}
1357
1358// We currently support LCSSA PHI nodes in the outer loop exit, if their
1359// incoming values do not come from the outer loop latch or if the
1360// outer loop latch has a single predecessor. In that case, the value will
1361// be available if both the inner and outer loop conditions are true, which
1362// will still be true after interchanging. If we have multiple predecessor,
1363// that may not be the case, e.g. because the outer loop latch may be executed
1364// if the inner loop is not executed.
1365static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1366 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
1367 for (PHINode &PHI : LoopNestExit->phis()) {
1368 for (Value *Incoming : PHI.incoming_values()) {
1370 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
1371 continue;
1372
1373 // The incoming value is defined in the outer loop latch. Currently we
1374 // only support that in case the outer loop latch has a single predecessor.
1375 // This guarantees that the outer loop latch is executed if and only if
1376 // the inner loop is executed (because tightlyNested() guarantees that the
1377 // outer loop header only branches to the inner loop or the outer loop
1378 // latch).
1379 // FIXME: We could weaken this logic and allow multiple predecessors,
1380 // if the values are produced outside the loop latch. We would need
1381 // additional logic to update the PHI nodes in the exit block as
1382 // well.
1383 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
1384 return false;
1385 }
1386 }
1387 return true;
1388}
1389
1390// In case of multi-level nested loops, it may occur that lcssa phis exist in
1391// the latch of InnerLoop, i.e., when defs of the incoming values are further
1392// inside the loopnest. Sometimes those incoming values are not available
1393// after interchange, since the original inner latch will become the new outer
1394// latch which may have predecessor paths that do not include those incoming
1395// values.
1396// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
1397// multi-level loop nests.
1398static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1399 if (InnerLoop->getSubLoops().empty())
1400 return true;
1401 // If the original outer latch has only one predecessor, then values defined
1402 // further inside the looploop, e.g., in the innermost loop, will be available
1403 // at the new outer latch after interchange.
1404 if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)
1405 return true;
1406
1407 // The outer latch has more than one predecessors, i.e., the inner
1408 // exit and the inner header.
1409 // PHI nodes in the inner latch are lcssa phis where the incoming values
1410 // are defined further inside the loopnest. Check if those phis are used
1411 // in the original inner latch. If that is the case then bail out since
1412 // those incoming values may not be available at the new outer latch.
1413 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1414 for (PHINode &PHI : InnerLoopLatch->phis()) {
1415 for (auto *U : PHI.users()) {
1417 if (InnerLoopLatch == UI->getParent())
1418 return false;
1419 }
1420 }
1421 return true;
1422}
1423
1424bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
1425 unsigned OuterLoopId,
1426 CharMatrix &DepMatrix) {
1427 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
1428 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
1429 << " and OuterLoopId = " << OuterLoopId
1430 << " due to dependence\n");
1431 ORE->emit([&]() {
1432 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
1433 InnerLoop->getStartLoc(),
1434 InnerLoop->getHeader())
1435 << "Cannot interchange loops due to dependences.";
1436 });
1437 return false;
1438 }
1439 // Check if outer and inner loop contain legal instructions only.
1440 for (auto *BB : OuterLoop->blocks())
1441 for (Instruction &I : BB->instructionsWithoutDebug())
1442 if (CallInst *CI = dyn_cast<CallInst>(&I)) {
1443 // readnone functions do not prevent interchanging.
1444 if (CI->onlyWritesMemory())
1445 continue;
1446 LLVM_DEBUG(
1447 dbgs() << "Loops with call instructions cannot be interchanged "
1448 << "safely.");
1449 ORE->emit([&]() {
1450 return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
1451 CI->getDebugLoc(),
1452 CI->getParent())
1453 << "Cannot interchange loops due to call instruction.";
1454 });
1455
1456 return false;
1457 }
1458
1459 if (!findInductions(InnerLoop, InnerLoopInductions)) {
1460 LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n");
1461 return false;
1462 }
1463
1464 if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
1465 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
1466 ORE->emit([&]() {
1467 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI",
1468 InnerLoop->getStartLoc(),
1469 InnerLoop->getHeader())
1470 << "Cannot interchange loops because unsupported PHI nodes found "
1471 "in inner loop latch.";
1472 });
1473 return false;
1474 }
1475
1476 // TODO: The loops could not be interchanged due to current limitations in the
1477 // transform module.
1478 if (currentLimitations()) {
1479 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1480 return false;
1481 }
1482
1483 // Check if the loops are tightly nested.
1484 if (!tightlyNested(OuterLoop, InnerLoop)) {
1485 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1486 ORE->emit([&]() {
1487 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1488 InnerLoop->getStartLoc(),
1489 InnerLoop->getHeader())
1490 << "Cannot interchange loops because they are not tightly "
1491 "nested.";
1492 });
1493 return false;
1494 }
1495
1496 // The LCSSA PHI for the reduction has passed checks before; its user
1497 // is a store instruction.
1498 PHINode *LcssaReduction = nullptr;
1499 assert(InnerReductions.size() <= 1 &&
1500 "So far we only support at most one reduction.");
1501 if (InnerReductions.size() == 1)
1502 LcssaReduction = InnerReductions[0].LcssaPhi;
1503
1504 if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop, OuterInnerReductions,
1505 LcssaReduction)) {
1506 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1507 ORE->emit([&]() {
1508 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1509 InnerLoop->getStartLoc(),
1510 InnerLoop->getHeader())
1511 << "Found unsupported PHI node in loop exit.";
1512 });
1513 return false;
1514 }
1515
1516 if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1517 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1518 ORE->emit([&]() {
1519 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1520 OuterLoop->getStartLoc(),
1521 OuterLoop->getHeader())
1522 << "Found unsupported PHI node in loop exit.";
1523 });
1524 return false;
1525 }
1526
1527 return true;
1528}
1529
1530void CacheCostManager::computeIfUnitinialized() {
1531 if (CC.has_value())
1532 return;
1533
1534 LLVM_DEBUG(dbgs() << "Compute CacheCost.\n");
1535 CC = CacheCost::getCacheCost(*OutermostLoop, *AR, *DI);
1536 // Obtain the loop vector returned from loop cache analysis beforehand,
1537 // and put each <Loop, index> pair into a map for constant time query
1538 // later. Indices in loop vector reprsent the optimal order of the
1539 // corresponding loop, e.g., given a loopnest with depth N, index 0
1540 // indicates the loop should be placed as the outermost loop and index N
1541 // indicates the loop should be placed as the innermost loop.
1542 //
1543 // For the old pass manager CacheCost would be null.
1544 if (*CC != nullptr)
1545 for (const auto &[Idx, Cost] : enumerate((*CC)->getLoopCosts()))
1546 CostMap[Cost.first] = Idx;
1547}
1548
1549CacheCost *CacheCostManager::getCacheCost() {
1550 computeIfUnitinialized();
1551 return CC->get();
1552}
1553
1554const DenseMap<const Loop *, unsigned> &CacheCostManager::getCostMap() {
1555 computeIfUnitinialized();
1556 return CostMap;
1557}
1558
1559int LoopInterchangeProfitability::getInstrOrderCost() {
1560 unsigned GoodOrder, BadOrder;
1561 BadOrder = GoodOrder = 0;
1562 for (BasicBlock *BB : InnerLoop->blocks()) {
1563 for (Instruction &Ins : *BB) {
1564 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
1565 bool FoundInnerInduction = false;
1566 bool FoundOuterInduction = false;
1567 for (Value *Op : GEP->operands()) {
1568 // Skip operands that are not SCEV-able.
1569 if (!SE->isSCEVable(Op->getType()))
1570 continue;
1571
1572 const SCEV *OperandVal = SE->getSCEV(Op);
1573 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
1574 if (!AR)
1575 continue;
1576
1577 // If we find the inner induction after an outer induction e.g.
1578 // for(int i=0;i<N;i++)
1579 // for(int j=0;j<N;j++)
1580 // A[i][j] = A[i-1][j-1]+k;
1581 // then it is a good order.
1582 if (AR->getLoop() == InnerLoop) {
1583 // We found an InnerLoop induction after OuterLoop induction. It is
1584 // a good order.
1585 FoundInnerInduction = true;
1586 if (FoundOuterInduction) {
1587 GoodOrder++;
1588 break;
1589 }
1590 }
1591 // If we find the outer induction after an inner induction e.g.
1592 // for(int i=0;i<N;i++)
1593 // for(int j=0;j<N;j++)
1594 // A[j][i] = A[j-1][i-1]+k;
1595 // then it is a bad order.
1596 if (AR->getLoop() == OuterLoop) {
1597 // We found an OuterLoop induction after InnerLoop induction. It is
1598 // a bad order.
1599 FoundOuterInduction = true;
1600 if (FoundInnerInduction) {
1601 BadOrder++;
1602 break;
1603 }
1604 }
1605 }
1606 }
1607 }
1608 }
1609 return GoodOrder - BadOrder;
1610}
1611
1612std::optional<bool>
1613LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1614 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC) {
1615 // This is the new cost model returned from loop cache analysis.
1616 // A smaller index means the loop should be placed an outer loop, and vice
1617 // versa.
1618 auto InnerLoopIt = CostMap.find(InnerLoop);
1619 if (InnerLoopIt == CostMap.end())
1620 return std::nullopt;
1621 auto OuterLoopIt = CostMap.find(OuterLoop);
1622 if (OuterLoopIt == CostMap.end())
1623 return std::nullopt;
1624
1625 if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))
1626 return std::nullopt;
1627 unsigned InnerIndex = InnerLoopIt->second;
1628 unsigned OuterIndex = OuterLoopIt->second;
1629 LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1630 << ", OuterIndex = " << OuterIndex << "\n");
1631 assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1632 "numbers to each loop");
1633 return std::optional<bool>(InnerIndex < OuterIndex);
1634}
1635
1636std::optional<bool>
1637LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
1638 // Legacy cost model: this is rough cost estimation algorithm. It counts the
1639 // good and bad order of induction variables in the instruction and allows
1640 // reordering if number of bad orders is more than good.
1641 int Cost = getInstrOrderCost();
1642 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1644 return std::optional<bool>(true);
1645
1646 return std::nullopt;
1647}
1648
1649/// Return true if we can vectorize the loop specified by \p LoopId.
1650static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
1651 for (const auto &Dep : DepMatrix) {
1652 char Dir = Dep[LoopId];
1653 char DepType = Dep.back();
1654 assert((DepType == '<' || DepType == '*') &&
1655 "Unexpected element in dependency vector");
1656
1657 // There are no loop-carried dependencies.
1658 if (Dir == '=' || Dir == 'I')
1659 continue;
1660
1661 // DepType being '<' means that this direction vector represents a forward
1662 // dependency. In principle, a loop with '<' direction can be vectorized in
1663 // this case.
1664 if (Dir == '<' && DepType == '<')
1665 continue;
1666
1667 // We cannot prove that the loop is vectorizable.
1668 return false;
1669 }
1670 return true;
1671}
1672
1673std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1674 unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1675 // If the outer loop cannot be vectorized, it is not profitable to move this
1676 // to inner position.
1677 if (!canVectorize(DepMatrix, OuterLoopId))
1678 return false;
1679
1680 // If the inner loop cannot be vectorized but the outer loop can be, then it
1681 // is profitable to interchange to enable inner loop parallelism.
1682 if (!canVectorize(DepMatrix, InnerLoopId))
1683 return true;
1684
1685 // If both the inner and the outer loop can be vectorized, it is necessary to
1686 // check the cost of each vectorized loop for profitability decision. At this
1687 // time we do not have a cost model to estimate them, so return nullopt.
1688 // TODO: Estimate the cost of vectorized loop when both the outer and the
1689 // inner loop can be vectorized.
1690 return std::nullopt;
1691}
1692
1693bool LoopInterchangeProfitability::isProfitable(
1694 const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1695 unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM) {
1696 // Do not consider loops with a backedge that isn't taken, e.g. an
1697 // unconditional branch true/false, as candidates for interchange.
1698 // TODO: when interchange is forced, we should probably also allow
1699 // interchange for these loops, and thus this logic should be moved just
1700 // below the cost-model ignore check below. But this check is done first
1701 // to avoid the issue in #163954.
1702 const SCEV *InnerBTC = SE->getBackedgeTakenCount(InnerLoop);
1703 const SCEV *OuterBTC = SE->getBackedgeTakenCount(OuterLoop);
1704 if (InnerBTC && InnerBTC->isZero()) {
1705 LLVM_DEBUG(dbgs() << "Inner loop back-edge isn't taken, rejecting "
1706 "single iteration loop\n");
1707 return false;
1708 }
1709 if (OuterBTC && OuterBTC->isZero()) {
1710 LLVM_DEBUG(dbgs() << "Outer loop back-edge isn't taken, rejecting "
1711 "single iteration loop\n");
1712 return false;
1713 }
1714
1715 // Return true if interchange is forced and the cost-model ignored.
1716 if (Profitabilities.size() == 1 && Profitabilities[0] == RuleTy::Ignore)
1717 return true;
1719 "Duplicate rules and option 'ignore' are not allowed");
1720
1721 // isProfitable() is structured to avoid endless loop interchange. If the
1722 // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
1723 // decide the profitability then, profitability check will stop and return the
1724 // analysis result. If it failed to determine it (e.g., cache analysis failed
1725 // to analyze the loopnest due to delinearization issues) then go ahead the
1726 // second highest priority rule (isProfitablePerInstrOrderCost by default).
1727 // Likewise, if it failed to analysis the profitability then only, the last
1728 // rule (isProfitableForVectorization by default) will decide.
1729 std::optional<bool> shouldInterchange;
1730 for (RuleTy RT : Profitabilities) {
1731 switch (RT) {
1732 case RuleTy::PerLoopCacheAnalysis: {
1733 CacheCost *CC = CCM.getCacheCost();
1734 const DenseMap<const Loop *, unsigned> &CostMap = CCM.getCostMap();
1735 shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
1736 break;
1737 }
1738 case RuleTy::PerInstrOrderCost:
1739 shouldInterchange = isProfitablePerInstrOrderCost();
1740 break;
1741 case RuleTy::ForVectorization:
1742 shouldInterchange =
1743 isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1744 break;
1745 case RuleTy::Ignore:
1746 llvm_unreachable("Option 'ignore' is not supported with other options");
1747 break;
1748 }
1749
1750 // If this rule could determine the profitability, don't call subsequent
1751 // rules.
1752 if (shouldInterchange.has_value())
1753 break;
1754 }
1755
1756 if (!shouldInterchange.has_value()) {
1757 ORE->emit([&]() {
1758 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1759 InnerLoop->getStartLoc(),
1760 InnerLoop->getHeader())
1761 << "Insufficient information to calculate the cost of loop for "
1762 "interchange.";
1763 });
1764 return false;
1765 } else if (!shouldInterchange.value()) {
1766 ORE->emit([&]() {
1767 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1768 InnerLoop->getStartLoc(),
1769 InnerLoop->getHeader())
1770 << "Interchanging loops is not considered to improve cache "
1771 "locality nor vectorization.";
1772 });
1773 return false;
1774 }
1775 return true;
1776}
1777
1778void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1779 Loop *InnerLoop) {
1780 for (Loop *L : *OuterLoop)
1781 if (L == InnerLoop) {
1782 OuterLoop->removeChildLoop(L);
1783 return;
1784 }
1785 llvm_unreachable("Couldn't find loop");
1786}
1787
1788/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1789/// new inner and outer loop after interchanging: NewInner is the original
1790/// outer loop and NewOuter is the original inner loop.
1791///
1792/// Before interchanging, we have the following structure
1793/// Outer preheader
1794// Outer header
1795// Inner preheader
1796// Inner header
1797// Inner body
1798// Inner latch
1799// outer bbs
1800// Outer latch
1801//
1802// After interchanging:
1803// Inner preheader
1804// Inner header
1805// Outer preheader
1806// Outer header
1807// Inner body
1808// outer bbs
1809// Outer latch
1810// Inner latch
1811void LoopInterchangeTransform::restructureLoops(
1812 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1813 BasicBlock *OrigOuterPreHeader) {
1814 Loop *OuterLoopParent = OuterLoop->getParentLoop();
1815 // The original inner loop preheader moves from the new inner loop to
1816 // the parent loop, if there is one.
1817 NewInner->removeBlockFromLoop(OrigInnerPreHeader);
1818 LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
1819
1820 // Switch the loop levels.
1821 if (OuterLoopParent) {
1822 // Remove the loop from its parent loop.
1823 removeChildLoop(OuterLoopParent, NewInner);
1824 removeChildLoop(NewInner, NewOuter);
1825 OuterLoopParent->addChildLoop(NewOuter);
1826 } else {
1827 removeChildLoop(NewInner, NewOuter);
1828 LI->changeTopLevelLoop(NewInner, NewOuter);
1829 }
1830 while (!NewOuter->isInnermost())
1831 NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
1832 NewOuter->addChildLoop(NewInner);
1833
1834 // BBs from the original inner loop.
1835 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
1836
1837 // Add BBs from the original outer loop to the original inner loop (excluding
1838 // BBs already in inner loop)
1839 for (BasicBlock *BB : NewInner->blocks())
1840 if (LI->getLoopFor(BB) == NewInner)
1841 NewOuter->addBlockEntry(BB);
1842
1843 // Now remove inner loop header and latch from the new inner loop and move
1844 // other BBs (the loop body) to the new inner loop.
1845 BasicBlock *OuterHeader = NewOuter->getHeader();
1846 BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1847 for (BasicBlock *BB : OrigInnerBBs) {
1848 // Nothing will change for BBs in child loops.
1849 if (LI->getLoopFor(BB) != NewOuter)
1850 continue;
1851 // Remove the new outer loop header and latch from the new inner loop.
1852 if (BB == OuterHeader || BB == OuterLatch)
1853 NewInner->removeBlockFromLoop(BB);
1854 else
1855 LI->changeLoopFor(BB, NewInner);
1856 }
1857
1858 // The preheader of the original outer loop becomes part of the new
1859 // outer loop.
1860 NewOuter->addBlockEntry(OrigOuterPreHeader);
1861 LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
1862
1863 // Tell SE that we move the loops around.
1864 SE->forgetLoop(NewOuter);
1865}
1866
1867/// User can write, or optimizers can generate the reduction for inner loop.
1868/// To make the interchange valid, apply Reduction2Mem by moving the
1869/// initializer and store instructions into the inner loop. So far we only
1870/// handle cases where the reduction variable is initialized to a constant.
1871/// For example, below code:
1872///
1873/// loop:
1874/// re = phi<0.0, next>
1875/// next = re op ...
1876/// endloop
1877/// reduc_sum = phi<next> // lcssa phi
1878/// MEM_REF[idx] = reduc_sum // LcssaStore
1879///
1880/// is transformed into:
1881///
1882/// loop:
1883/// tmp = MEM_REF[idx];
1884/// new_var = !first_iteration ? tmp : 0.0;
1885/// next = new_var op ...
1886/// MEM_REF[idx] = next; // after moving
1887/// endloop
1888///
1889/// In this way the initial const is used in the first iteration of loop.
1890void LoopInterchangeTransform::reduction2Memory() {
1892 LIL.getInnerReductions();
1893
1894 assert(InnerReductions.size() == 1 &&
1895 "So far we only support at most one reduction.");
1896
1897 LoopInterchangeLegality::InnerReduction SR = InnerReductions[0];
1898 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1899 IRBuilder<> Builder(&*(InnerLoopHeader->getFirstNonPHIIt()));
1900
1901 // Check if it's the first iteration.
1902 LLVMContext &Context = InnerLoopHeader->getContext();
1903 PHINode *FirstIter =
1904 Builder.CreatePHI(Type::getInt1Ty(Context), 2, "first.iter");
1905 FirstIter->addIncoming(ConstantInt::get(Type::getInt1Ty(Context), 1),
1906 InnerLoop->getLoopPreheader());
1907 FirstIter->addIncoming(ConstantInt::get(Type::getInt1Ty(Context), 0),
1908 InnerLoop->getLoopLatch());
1909 assert(FirstIter->isComplete() && "The FirstIter PHI node is not complete.");
1910
1911 // When the reduction is initialized from a constant value, we need to add
1912 // a stmt loading from the memory object to target basic block in inner
1913 // loop.
1914 Instruction *LoadMem = Builder.CreateLoad(SR.ElemTy, SR.MemRef);
1915
1916 // Init new_var to MEM_REF or CONST depending on if it is the first iteration.
1917 Value *NewVar = Builder.CreateSelect(FirstIter, SR.Init, LoadMem, "new.var");
1918
1919 // Replace all uses of the reduction variable with a new variable.
1920 SR.Reduction->replaceAllUsesWith(NewVar);
1921
1922 // Move store instruction into inner loop, just after reduction next's
1923 // definition.
1924 SR.LcssaStore->setOperand(0, SR.Next);
1925 SR.LcssaStore->moveAfter(dyn_cast<Instruction>(SR.Next));
1926}
1927
1928bool LoopInterchangeTransform::transform(
1929 ArrayRef<Instruction *> DropNoWrapInsts) {
1930 bool Transformed = false;
1931
1933 LIL.getInnerReductions();
1934 if (InnerReductions.size() == 1)
1935 reduction2Memory();
1936
1937 if (InnerLoop->getSubLoops().empty()) {
1938 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1939 LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
1940 auto &InductionPHIs = LIL.getInnerLoopInductions();
1941 if (InductionPHIs.empty()) {
1942 LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
1943 return false;
1944 }
1945
1946 SmallVector<Instruction *, 8> InnerIndexVarList;
1947 for (PHINode *CurInductionPHI : InductionPHIs) {
1948 if (CurInductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
1949 InnerIndexVarList.push_back(
1950 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(1)));
1951 else
1952 InnerIndexVarList.push_back(
1953 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(0)));
1954 }
1955
1956 // Create a new latch block for the inner loop. We split at the
1957 // current latch's terminator and then move the condition and all
1958 // operands that are not either loop-invariant or the induction PHI into the
1959 // new latch block.
1960 BasicBlock *NewLatch =
1961 SplitBlock(InnerLoop->getLoopLatch(),
1962 InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
1963
1964 SmallSetVector<Instruction *, 4> WorkList;
1965 unsigned i = 0;
1966 auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
1967 for (; i < WorkList.size(); i++) {
1968 // Duplicate instruction and move it the new latch. Update uses that
1969 // have been moved.
1970 Instruction *NewI = WorkList[i]->clone();
1971 NewI->insertBefore(NewLatch->getFirstNonPHIIt());
1972 assert(!NewI->mayHaveSideEffects() &&
1973 "Moving instructions with side-effects may change behavior of "
1974 "the loop nest!");
1975 for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
1976 Instruction *UserI = cast<Instruction>(U.getUser());
1977 if (!InnerLoop->contains(UserI->getParent()) ||
1978 UserI->getParent() == NewLatch ||
1979 llvm::is_contained(InductionPHIs, UserI))
1980 U.set(NewI);
1981 }
1982 // Add operands of moved instruction to the worklist, except if they are
1983 // outside the inner loop or are the induction PHI.
1984 for (Value *Op : WorkList[i]->operands()) {
1986 if (!OpI ||
1987 this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
1988 llvm::is_contained(InductionPHIs, OpI))
1989 continue;
1990 WorkList.insert(OpI);
1991 }
1992 }
1993 };
1994
1995 // FIXME: Should we interchange when we have a constant condition?
1998 ->getCondition());
1999 if (CondI)
2000 WorkList.insert(CondI);
2001 MoveInstructions();
2002 for (Instruction *InnerIndexVar : InnerIndexVarList)
2003 WorkList.insert(cast<Instruction>(InnerIndexVar));
2004 MoveInstructions();
2005 }
2006
2007 // Ensure the inner loop phi nodes have a separate basic block.
2008 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2009 if (&*InnerLoopHeader->getFirstNonPHIIt() !=
2010 InnerLoopHeader->getTerminator()) {
2011 SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHIIt(), DT, LI);
2012 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
2013 }
2014
2015 // Instructions in the original inner loop preheader may depend on values
2016 // defined in the outer loop header. Move them there, because the original
2017 // inner loop preheader will become the entry into the interchanged loop nest.
2018 // Currently we move all instructions and rely on LICM to move invariant
2019 // instructions outside the loop nest.
2020 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2021 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2022 if (InnerLoopPreHeader != OuterLoopHeader) {
2023 for (Instruction &I :
2024 make_early_inc_range(make_range(InnerLoopPreHeader->begin(),
2025 std::prev(InnerLoopPreHeader->end()))))
2026 I.moveBeforePreserving(OuterLoopHeader->getTerminator()->getIterator());
2027 }
2028
2029 Transformed |= adjustLoopLinks();
2030 if (!Transformed) {
2031 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
2032 return false;
2033 }
2034
2035 // Finally, drop the nsw/nuw flags from the instructions for reduction
2036 // calculations.
2037 for (Instruction *Reduction : DropNoWrapInsts) {
2038 Reduction->setHasNoSignedWrap(false);
2039 Reduction->setHasNoUnsignedWrap(false);
2040 }
2041
2042 return true;
2043}
2044
2045/// \brief Move all instructions except the terminator from FromBB right before
2046/// InsertBefore
2047static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
2048 BasicBlock *ToBB = InsertBefore->getParent();
2049
2050 ToBB->splice(InsertBefore->getIterator(), FromBB, FromBB->begin(),
2051 FromBB->getTerminator()->getIterator());
2052}
2053
2054/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
2055static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
2056 // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
2057 // from BB1 afterwards.
2058 auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
2059 SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
2060 for (Instruction *I : TempInstrs)
2061 I->removeFromParent();
2062
2063 // Move instructions from BB2 to BB1.
2064 moveBBContents(BB2, BB1->getTerminator());
2065
2066 // Move instructions from TempInstrs to BB2.
2067 for (Instruction *I : TempInstrs)
2068 I->insertBefore(BB2->getTerminator()->getIterator());
2069}
2070
2071// Update BI to jump to NewBB instead of OldBB. Records updates to the
2072// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
2073// \p OldBB is exactly once in BI's successor list.
2074static void updateSuccessor(Instruction *Term, BasicBlock *OldBB,
2075 BasicBlock *NewBB,
2076 std::vector<DominatorTree::UpdateType> &DTUpdates,
2077 bool MustUpdateOnce = true) {
2078 assert((!MustUpdateOnce || llvm::count(successors(Term), OldBB) == 1) &&
2079 "BI must jump to OldBB exactly once.");
2080 bool Changed = false;
2081 for (Use &Op : Term->operands())
2082 if (Op == OldBB) {
2083 Op.set(NewBB);
2084 Changed = true;
2085 }
2086
2087 if (Changed) {
2088 DTUpdates.push_back(
2089 {DominatorTree::UpdateKind::Insert, Term->getParent(), NewBB});
2090 DTUpdates.push_back(
2091 {DominatorTree::UpdateKind::Delete, Term->getParent(), OldBB});
2092 }
2093 assert(Changed && "Expected a successor to be updated");
2094}
2095
2096// Move Lcssa PHIs to the right place.
2097static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
2098 BasicBlock *InnerLatch, BasicBlock *OuterHeader,
2099 BasicBlock *OuterLatch, BasicBlock *OuterExit,
2100 Loop *InnerLoop, LoopInfo *LI) {
2101
2102 // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
2103 // defined either in the header or latch. Those blocks will become header and
2104 // latch of the new outer loop, and the only possible users can PHI nodes
2105 // in the exit block of the loop nest or the outer loop header (reduction
2106 // PHIs, in that case, the incoming value must be defined in the inner loop
2107 // header). We can just substitute the user with the incoming value and remove
2108 // the PHI.
2109 for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
2110 assert(P.getNumIncomingValues() == 1 &&
2111 "Only loops with a single exit are supported!");
2112
2113 // Incoming values are guaranteed be instructions currently.
2114 auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));
2115 // In case of multi-level nested loops, follow LCSSA to find the incoming
2116 // value defined from the innermost loop.
2117 auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI));
2118 // Skip phis with incoming values from the inner loop body, excluding the
2119 // header and latch.
2120 if (IncIInnerMost->getParent() != InnerLatch &&
2121 IncIInnerMost->getParent() != InnerHeader)
2122 continue;
2123
2124 assert(all_of(P.users(),
2125 [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
2126 return (cast<PHINode>(U)->getParent() == OuterHeader &&
2127 IncI->getParent() == InnerHeader) ||
2128 cast<PHINode>(U)->getParent() == OuterExit;
2129 }) &&
2130 "Can only replace phis iff the uses are in the loop nest exit or "
2131 "the incoming value is defined in the inner header (it will "
2132 "dominate all loop blocks after interchanging)");
2133 P.replaceAllUsesWith(IncI);
2134 P.eraseFromParent();
2135 }
2136
2137 SmallVector<PHINode *, 8> LcssaInnerExit(
2138 llvm::make_pointer_range(InnerExit->phis()));
2139
2140 SmallVector<PHINode *, 8> LcssaInnerLatch(
2141 llvm::make_pointer_range(InnerLatch->phis()));
2142
2143 // Lcssa PHIs for values used outside the inner loop are in InnerExit.
2144 // If a PHI node has users outside of InnerExit, it has a use outside the
2145 // interchanged loop and we have to preserve it. We move these to
2146 // InnerLatch, which will become the new exit block for the innermost
2147 // loop after interchanging.
2148 for (PHINode *P : LcssaInnerExit)
2149 P->moveBefore(InnerLatch->getFirstNonPHIIt());
2150
2151 // If the inner loop latch contains LCSSA PHIs, those come from a child loop
2152 // and we have to move them to the new inner latch.
2153 for (PHINode *P : LcssaInnerLatch)
2154 P->moveBefore(InnerExit->getFirstNonPHIIt());
2155
2156 // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
2157 // incoming values defined in the outer loop, we have to add a new PHI
2158 // in the inner loop latch, which became the exit block of the outer loop,
2159 // after interchanging.
2160 if (OuterExit) {
2161 for (PHINode &P : OuterExit->phis()) {
2162 if (P.getNumIncomingValues() != 1)
2163 continue;
2164 // Skip Phis with incoming values defined in the inner loop. Those should
2165 // already have been updated.
2166 auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
2167 if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)
2168 continue;
2169
2170 PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
2171 NewPhi->setIncomingValue(0, P.getIncomingValue(0));
2172 NewPhi->setIncomingBlock(0, OuterLatch);
2173 // We might have incoming edges from other BBs, i.e., the original outer
2174 // header.
2175 for (auto *Pred : predecessors(InnerLatch)) {
2176 if (Pred == OuterLatch)
2177 continue;
2178 NewPhi->addIncoming(P.getIncomingValue(0), Pred);
2179 }
2180 NewPhi->insertBefore(InnerLatch->getFirstNonPHIIt());
2181 P.setIncomingValue(0, NewPhi);
2182 }
2183 }
2184
2185 // Now adjust the incoming blocks for the LCSSA PHIs.
2186 // For PHIs moved from Inner's exit block, we need to replace Inner's latch
2187 // with the new latch.
2188 InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
2189}
2190
2191/// This deals with a corner case when a LCSSA phi node appears in a non-exit
2192/// block: the outer loop latch block does not need to be exit block of the
2193/// inner loop. Consider a loop that was in LCSSA form, but then some
2194/// transformation like loop-unswitch comes along and creates an empty block,
2195/// where BB5 in this example is the outer loop latch block:
2196///
2197/// BB4:
2198/// br label %BB5
2199/// BB5:
2200/// %old.cond.lcssa = phi i16 [ %cond, %BB4 ]
2201/// br outer.header
2202///
2203/// Interchange then brings it in LCSSA form again resulting in this chain of
2204/// single-input phi nodes:
2205///
2206/// BB4:
2207/// %new.cond.lcssa = phi i16 [ %cond, %BB3 ]
2208/// br label %BB5
2209/// BB5:
2210/// %old.cond.lcssa = phi i16 [ %new.cond.lcssa, %BB4 ]
2211///
2212/// The problem is that interchange can reoder blocks BB4 and BB5 placing the
2213/// use before the def if we don't check this. The solution is to simplify
2214/// lcssa phi nodes (remove) if they appear in non-exit blocks.
2215///
2216static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop) {
2217 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
2218 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2219
2220 // Do not modify lcssa phis where they actually belong, i.e. in exit blocks.
2221 if (OuterLoopLatch == InnerLoopExit)
2222 return;
2223
2224 // Collect and remove phis in non-exit blocks if they have 1 input.
2226 llvm::make_pointer_range(OuterLoopLatch->phis()));
2227 for (PHINode *Phi : Phis) {
2228 assert(Phi->getNumIncomingValues() == 1 && "Single input phi expected");
2229 LLVM_DEBUG(dbgs() << "Removing 1-input phi in non-exit block: " << *Phi
2230 << "\n");
2231 Phi->replaceAllUsesWith(Phi->getIncomingValue(0));
2232 Phi->eraseFromParent();
2233 }
2234}
2235
2236bool LoopInterchangeTransform::adjustLoopBranches() {
2237 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
2238 std::vector<DominatorTree::UpdateType> DTUpdates;
2239
2240 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2241 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2242
2243 assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
2244 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
2245 InnerLoopPreHeader && "Guaranteed by loop-simplify form");
2246
2247 simplifyLCSSAPhis(OuterLoop, InnerLoop);
2248
2249 // Ensure that both preheaders do not contain PHI nodes and have single
2250 // predecessors. This allows us to move them easily. We use
2251 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
2252 // preheaders do not satisfy those conditions.
2253 if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
2254 !OuterLoopPreHeader->getUniquePredecessor())
2255 OuterLoopPreHeader =
2256 InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
2257 if (InnerLoopPreHeader == OuterLoop->getHeader())
2258 InnerLoopPreHeader =
2259 InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
2260
2261 // Adjust the loop preheader
2262 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2263 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2264 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
2265 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2266 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
2267 BasicBlock *InnerLoopLatchPredecessor =
2268 InnerLoopLatch->getUniquePredecessor();
2269 BasicBlock *InnerLoopLatchSuccessor;
2270 BasicBlock *OuterLoopLatchSuccessor;
2271
2272 CondBrInst *OuterLoopLatchBI =
2273 dyn_cast<CondBrInst>(OuterLoopLatch->getTerminator());
2274 CondBrInst *InnerLoopLatchBI =
2275 dyn_cast<CondBrInst>(InnerLoopLatch->getTerminator());
2276 Instruction *OuterLoopHeaderBI = OuterLoopHeader->getTerminator();
2277 Instruction *InnerLoopHeaderBI = InnerLoopHeader->getTerminator();
2278
2279 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
2280 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
2281 !InnerLoopHeaderBI)
2282 return false;
2283
2284 Instruction *InnerLoopLatchPredecessorBI =
2285 InnerLoopLatchPredecessor->getTerminator();
2286 Instruction *OuterLoopPredecessorBI = OuterLoopPredecessor->getTerminator();
2287
2288 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
2289 return false;
2290 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
2291 if (!InnerLoopHeaderSuccessor)
2292 return false;
2293
2294 // Adjust Loop Preheader and headers.
2295 // The branches in the outer loop predecessor and the outer loop header can
2296 // be unconditional branches or conditional branches with duplicates. Consider
2297 // this when updating the successors.
2298 updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
2299 InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
2300 // The outer loop header might or might not branch to the outer latch.
2301 // We are guaranteed to branch to the inner loop preheader.
2302 if (llvm::is_contained(successors(OuterLoopHeaderBI), OuterLoopLatch)) {
2303 // In this case the outerLoopHeader should branch to the InnerLoopLatch.
2304 updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch,
2305 DTUpdates,
2306 /*MustUpdateOnce=*/false);
2307 }
2308 updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
2309 InnerLoopHeaderSuccessor, DTUpdates,
2310 /*MustUpdateOnce=*/false);
2311
2312 // Adjust reduction PHI's now that the incoming block has changed.
2313 InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
2314 OuterLoopHeader);
2315
2316 updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
2317 OuterLoopPreHeader, DTUpdates);
2318
2319 // -------------Adjust loop latches-----------
2320 if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
2321 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
2322 else
2323 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
2324
2325 updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
2326 InnerLoopLatchSuccessor, DTUpdates);
2327
2328 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
2329 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
2330 else
2331 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
2332
2333 updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
2334 OuterLoopLatchSuccessor, DTUpdates);
2335 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
2336 DTUpdates);
2337
2338 DT->applyUpdates(DTUpdates);
2339 restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
2340 OuterLoopPreHeader);
2341
2342 moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
2343 OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),
2344 InnerLoop, LI);
2345 // For PHIs in the exit block of the outer loop, outer's latch has been
2346 // replaced by Inners'.
2347 OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
2348
2349 auto &OuterInnerReductions = LIL.getOuterInnerReductions();
2350 // Now update the reduction PHIs in the inner and outer loop headers.
2351 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
2352 for (PHINode &PHI : InnerLoopHeader->phis())
2353 if (OuterInnerReductions.contains(&PHI))
2354 InnerLoopPHIs.push_back(&PHI);
2355
2356 for (PHINode &PHI : OuterLoopHeader->phis())
2357 if (OuterInnerReductions.contains(&PHI))
2358 OuterLoopPHIs.push_back(&PHI);
2359
2360 // Now move the remaining reduction PHIs from outer to inner loop header and
2361 // vice versa. The PHI nodes must be part of a reduction across the inner and
2362 // outer loop and all the remains to do is and updating the incoming blocks.
2363 for (PHINode *PHI : OuterLoopPHIs) {
2364 LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
2365 PHI->moveBefore(InnerLoopHeader->getFirstNonPHIIt());
2366 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2367 }
2368 for (PHINode *PHI : InnerLoopPHIs) {
2369 LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
2370 PHI->moveBefore(OuterLoopHeader->getFirstNonPHIIt());
2371 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2372 }
2373
2374 // Update the incoming blocks for moved PHI nodes.
2375 OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
2376 OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
2377 InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
2378 InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
2379
2380 // Values defined in the outer loop header could be used in the inner loop
2381 // latch. In that case, we need to create LCSSA phis for them, because after
2382 // interchanging they will be defined in the new inner loop and used in the
2383 // new outer loop.
2384 SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
2385 for (Instruction &I :
2386 make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end())))
2387 MayNeedLCSSAPhis.push_back(&I);
2388 formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE);
2389
2390 return true;
2391}
2392
2393bool LoopInterchangeTransform::adjustLoopLinks() {
2394 // Adjust all branches in the inner and outer loop.
2395 bool Changed = adjustLoopBranches();
2396 if (Changed) {
2397 // We have interchanged the preheaders so we need to interchange the data in
2398 // the preheaders as well. This is because the content of the inner
2399 // preheader was previously executed inside the outer loop.
2400 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2401 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2402 swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
2403 }
2404 return Changed;
2405}
2406
2410 LPMUpdater &U) {
2411 Function &F = *LN.getParent();
2412 SmallVector<Loop *, 8> LoopList(LN.getLoops());
2413
2414 if (MaxMemInstrCount < 1) {
2415 LLVM_DEBUG(dbgs() << "MaxMemInstrCount should be at least 1");
2416 return PreservedAnalyses::all();
2417 }
2419
2420 // Ensure minimum depth of the loop nest to do the interchange.
2421 if (!hasSupportedLoopDepth(LoopList, ORE))
2422 return PreservedAnalyses::all();
2423 // Ensure computable loop nest.
2424 if (!isComputableLoopNest(&AR.SE, LoopList)) {
2425 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
2426 return PreservedAnalyses::all();
2427 }
2428
2429 ORE.emit([&]() {
2430 return OptimizationRemarkAnalysis(DEBUG_TYPE, "Dependence",
2433 << "Computed dependence info, invoking the transform.";
2434 });
2435
2436 DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
2437 if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN))
2438 return PreservedAnalyses::all();
2439 U.markLoopNestChanged(true);
2441}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file defines the StringMap class.
Rewrite undef for PHI
ReachingDefInfo InstSet InstSet & Ignore
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define DEBUG_TYPE
Hexagon Common GEP
static void moveBBContents(BasicBlock &SourceBB, BasicBlock &TargetBB)
Move the contents of SourceBB to before the last instruction of TargetBB.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file defines the interface for the loop cache analysis.
SmallVector< Loop *, 4 > LoopVector
Definition LoopFuse.cpp:379
Loop::LoopBounds::Direction Direction
Definition LoopInfo.cpp:252
static cl::opt< unsigned int > MaxMemInstrCount("loop-interchange-max-meminstr-count", cl::init(64), cl::Hidden, cl::desc("Maximum number of load-store instructions that should be handled " "in the dependency matrix. Higher value may lead to more interchanges " "at the cost of compile-time"))
static cl::opt< int > LoopInterchangeCostThreshold("loop-interchange-threshold", cl::init(0), cl::Hidden, cl::desc("Interchange if you gain more than this number"))
static PHINode * findInnerReductionPhi(Loop *L, Value *V, SmallVectorImpl< Instruction * > &HasNoWrapInsts)
static cl::opt< unsigned int > MinLoopNestDepth("loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden, cl::desc("Minimum depth of loop nest considered for the transform"))
static void updateSuccessor(Instruction *Term, BasicBlock *OldBB, BasicBlock *NewBB, std::vector< DominatorTree::UpdateType > &DTUpdates, bool MustUpdateOnce=true)
static cl::opt< bool > EnableReduction2Memory("loop-interchange-reduction-to-mem", cl::init(false), cl::Hidden, cl::desc("Support for the inner-loop reduction pattern."))
static bool isComputableLoopNest(ScalarEvolution *SE, ArrayRef< Loop * > LoopList)
static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop)
This deals with a corner case when a LCSSA phi node appears in a non-exit block: the outer loop latch...
static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, unsigned ToIndx)
static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, BasicBlock *InnerLatch, BasicBlock *OuterHeader, BasicBlock *OuterLatch, BasicBlock *OuterExit, Loop *InnerLoop, LoopInfo *LI)
static void printDepMatrix(CharMatrix &DepMatrix)
static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2)
Swap instructions between BB1 and BB2 but keep terminators intact.
static bool areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL, SmallPtrSetImpl< PHINode * > &Reductions, PHINode *LcssaReduction)
static cl::opt< unsigned int > MaxLoopNestDepth("loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden, cl::desc("Maximum depth of loop nest considered for the transform"))
static bool hasSupportedLoopDepth(ArrayRef< Loop * > LoopList, OptimizationRemarkEmitter &ORE)
static bool inThisOrder(const Instruction *Src, const Instruction *Dst)
Return true if Src appears before Dst in the same basic block.
static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId)
Return true if we can vectorize the loop specified by LoopId.
static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, unsigned InnerLoopId, unsigned OuterLoopId)
#define DEBUG_TYPE
static Value * followLCSSA(Value *SV)
static void populateWorklist(Loop &L, LoopVector &LoopList)
static bool checkReductionKind(Loop *L, PHINode *PHI, SmallVectorImpl< Instruction * > &HasNoWrapInsts)
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L, DependenceInfo *DI, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
static cl::list< RuleTy > Profitabilities("loop-interchange-profitabilities", cl::MiscFlags::CommaSeparated, cl::Hidden, cl::desc("List of profitability heuristics to be used. They are applied in " "the given order"), cl::list_init< RuleTy >({RuleTy::PerLoopCacheAnalysis, RuleTy::PerInstrOrderCost, RuleTy::ForVectorization}), cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache", "Prioritize loop cache cost"), clEnumValN(RuleTy::PerInstrOrderCost, "instorder", "Prioritize the IVs order of each instruction"), clEnumValN(RuleTy::ForVectorization, "vectorize", "Prioritize vectorization"), clEnumValN(RuleTy::Ignore, "ignore", "Ignore profitability, force interchange (does not " "work with other options)")))
static std::optional< bool > isLexicographicallyPositive(ArrayRef< char > DV, unsigned Begin, unsigned End)
static bool noDuplicateRulesAndIgnore(ArrayRef< RuleTy > Rules)
This file defines the interface for the loop nest analysis.
This header provides classes for managing a pipeline of passes over loops in LLVM IR.
loop Loop Strength Reduction
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
uint64_t IntrinsicInst * II
#define P(N)
This file contains some templates that are useful if you are working with the STL at all.
SmallVector< Value *, 8 > ValueVector
This file defines the SmallSet class.
This file defines the SmallVector class.
static bool isProfitable(const StableFunctionMap::StableFunctionEntries &SFS)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:186
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:483
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:470
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:539
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug(bool SkipPseudoOp=true) const
Return a const iterator range over the instructions in the block, skipping any debug instructions.
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI void replacePhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block to refer to basic block New instead of basic block Old.
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:668
static std::unique_ptr< CacheCost > getCacheCost(Loop &Root, LoopStandardAnalysisResults &AR, DependenceInfo &DI, std::optional< unsigned > TRT=std::nullopt)
Create a CacheCost for the loop nest rooted by Root.
CacheCostTy getLoopCost(const Loop &L) const
Return the estimated cost of loop L if the given loop is part of the loop nest associated with this o...
Value * getCondition() const
BasicBlock * getSuccessor(unsigned i) const
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
iterator end()
Definition DenseMap.h:81
DependenceInfo - This class is the main dependence-analysis driver.
LLVM_ABI std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool UnderRuntimeAssumptions=false)
depends - Tests for a dependence between the Src and Dst instructions.
void applyUpdates(ArrayRef< UpdateType > Updates)
Inform the dominator tree about a sequence of CFG edge insertions and deletions and perform a batch u...
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static LLVM_ABI bool isInductionPHI(PHINode *Phi, const Loop *L, ScalarEvolution *SE, InductionDescriptor &D, const SCEV *Expr=nullptr, SmallVectorImpl< Instruction * > *CastsToIgnore=nullptr)
Returns true if Phi is an induction in the loop L.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI void insertBefore(InstListType::iterator InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified position.
LLVM_ABI bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void removeBlockFromLoop(BlockT *BB)
This removes the specified basic block from the current loop, updating the Blocks as appropriate.
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
BlockT * getHeader() const
iterator_range< block_iterator > blocks() const
void addChildLoop(LoopT *NewChild)
Add the specified loop to be a child of this loop.
void addBlockEntry(BlockT *BB)
This adds a basic block directly to the basic block list.
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
iterator begin() const
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
LoopT * removeChildLoop(iterator I)
This removes the specified child from being a subloop of this loop.
void changeTopLevelLoop(LoopT *OldLoop, LoopT *NewLoop)
Replace the specified loop in the top-level loops list with the indicated loop.
void changeLoopFor(BlockT *BB, LoopT *L)
Change the top-level loop that contains BB to the specified loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
static const BasicBlock & skipEmptyBlockUntil(const BasicBlock *From, const BasicBlock *End, bool CheckUniquePred=false)
Recursivelly traverse all empty 'single successor' basic blocks of From (if there are any).
ArrayRef< Loop * > getLoops() const
Get the loops in the nest.
Function * getParent() const
Return the function to which the loop-nest belongs.
Loop & getOutermostLoop() const
Return the outermost loop in the loop nest.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition LoopInfo.cpp:652
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition LoopInfo.cpp:66
StringRef getName() const
Definition LoopInfo.h:389
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
bool isComplete() const
If the PHI node is complete which means all of its parent's predecessors have incoming value in this ...
op_range incoming_values()
void setIncomingBlock(unsigned i, BasicBlock *BB)
void setIncomingValue(unsigned i, Value *V)
static unsigned getIncomingValueNumForOperand(unsigned i)
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Instruction * getExactFPMathInst() const
Returns 1st non-reassociative FP instruction in the PHI node's use-chain.
static LLVM_ABI bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, DemandedBits *DB=nullptr, AssumptionCache *AC=nullptr, DominatorTree *DT=nullptr, ScalarEvolution *SE=nullptr)
Returns true if Phi is a reduction in TheLoop.
LLVM_ABI SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
RecurKind getRecurrenceKind() const
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
std::pair< iterator, bool > try_emplace(StringRef Key, ArgsTy &&...Args)
Emplace a new element for the specified key into the map if the key isn't already in the map.
Definition StringMap.h:381
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:143
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:166
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
iterator_range< user_iterator > users()
Definition Value.h:427
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:188
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
list_initializer< Ty > list_init(ArrayRef< Ty > Vals)
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI BasicBlock * InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
InsertPreheaderForLoop - Once we discover that a loop doesn't have a preheader, this method is called...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto successors(const MachineBasicBlock *BB)
LLVM_ABI bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition LCSSA.cpp:449
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
auto map_range(ContainerTy &&C, FuncTy F)
Return a range that applies F to the elements of C.
Definition STLExtras.h:366
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2026
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FMinimumNum
FP min with llvm.minimumnum semantics.
@ Or
Bitwise or logical OR of integers.
@ FMinimum
FP min with llvm.minimum semantics.
@ Mul
Product of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMaximum
FP max with llvm.maximum semantics.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ FMaximumNum
FP max with llvm.maximumnum semantics.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition LCSSA.cpp:308
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
auto predecessors(const MachineBasicBlock *BB)
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:368
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
PreservedAnalyses run(LoopNest &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...