LLVM 20.0.0git
LoopLoadElimination.cpp
Go to the documentation of this file.
1//===- LoopLoadElimination.cpp - Loop Load Elimination Pass ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implement a loop-aware load elimination pass.
10//
11// It uses LoopAccessAnalysis to identify loop-carried dependences with a
12// distance of one between stores and loads. These form the candidates for the
13// transformation. The source value of each store then propagated to the user
14// of the corresponding load. This makes the load dead.
15//
16// The pass can also version the loop and add memchecks in order to prove that
17// may-aliasing stores can't change the value in memory before it's read by the
18// load.
19//
20//===----------------------------------------------------------------------===//
21
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/DenseMap.h"
26#include "llvm/ADT/STLExtras.h"
29#include "llvm/ADT/Statistic.h"
42#include "llvm/IR/DataLayout.h"
43#include "llvm/IR/Dominators.h"
45#include "llvm/IR/Module.h"
46#include "llvm/IR/PassManager.h"
47#include "llvm/IR/Type.h"
48#include "llvm/IR/Value.h"
51#include "llvm/Support/Debug.h"
58#include <algorithm>
59#include <cassert>
60#include <forward_list>
61#include <tuple>
62#include <utility>
63
64using namespace llvm;
65
66#define LLE_OPTION "loop-load-elim"
67#define DEBUG_TYPE LLE_OPTION
68
70 "runtime-check-per-loop-load-elim", cl::Hidden,
71 cl::desc("Max number of memchecks allowed per eliminated load on average"),
72 cl::init(1));
73
75 "loop-load-elimination-scev-check-threshold", cl::init(8), cl::Hidden,
76 cl::desc("The maximum number of SCEV checks allowed for Loop "
77 "Load Elimination"));
78
79STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE");
80
81namespace {
82
83/// Represent a store-to-forwarding candidate.
84struct StoreToLoadForwardingCandidate {
85 LoadInst *Load;
86 StoreInst *Store;
87
88 StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store)
89 : Load(Load), Store(Store) {}
90
91 /// Return true if the dependence from the store to the load has an
92 /// absolute distance of one.
93 /// E.g. A[i+1] = A[i] (or A[i-1] = A[i] for descending loop)
94 bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE,
95 Loop *L) const {
96 Value *LoadPtr = Load->getPointerOperand();
97 Value *StorePtr = Store->getPointerOperand();
98 Type *LoadType = getLoadStoreType(Load);
99 auto &DL = Load->getDataLayout();
100
101 assert(LoadPtr->getType()->getPointerAddressSpace() ==
102 StorePtr->getType()->getPointerAddressSpace() &&
103 DL.getTypeSizeInBits(LoadType) ==
104 DL.getTypeSizeInBits(getLoadStoreType(Store)) &&
105 "Should be a known dependence");
106
107 int64_t StrideLoad = getPtrStride(PSE, LoadType, LoadPtr, L).value_or(0);
108 int64_t StrideStore = getPtrStride(PSE, LoadType, StorePtr, L).value_or(0);
109 if (!StrideLoad || !StrideStore || StrideLoad != StrideStore)
110 return false;
111
112 // TODO: This check for stride values other than 1 and -1 can be eliminated.
113 // However, doing so may cause the LoopAccessAnalysis to overcompensate,
114 // generating numerous non-wrap runtime checks that may undermine the
115 // benefits of load elimination. To safely implement support for non-unit
116 // strides, we would need to ensure either that the processed case does not
117 // require these additional checks, or improve the LAA to handle them more
118 // efficiently, or potentially both.
119 if (std::abs(StrideLoad) != 1)
120 return false;
121
122 unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType));
123
124 auto *LoadPtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(LoadPtr));
125 auto *StorePtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(StorePtr));
126
127 // We don't need to check non-wrapping here because forward/backward
128 // dependence wouldn't be valid if these weren't monotonic accesses.
129 auto *Dist = dyn_cast<SCEVConstant>(
130 PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV));
131 if (!Dist)
132 return false;
133 const APInt &Val = Dist->getAPInt();
134 return Val == TypeByteSize * StrideLoad;
135 }
136
137 Value *getLoadPtr() const { return Load->getPointerOperand(); }
138
139#ifndef NDEBUG
141 const StoreToLoadForwardingCandidate &Cand) {
142 OS << *Cand.Store << " -->\n";
143 OS.indent(2) << *Cand.Load << "\n";
144 return OS;
145 }
146#endif
147};
148
149} // end anonymous namespace
150
151/// Check if the store dominates all latches, so as long as there is no
152/// intervening store this value will be loaded in the next iteration.
153static bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
154 DominatorTree *DT) {
156 L->getLoopLatches(Latches);
157 return llvm::all_of(Latches, [&](const BasicBlock *Latch) {
158 return DT->dominates(StoreBlock, Latch);
159 });
160}
161
162/// Return true if the load is not executed on all paths in the loop.
163static bool isLoadConditional(LoadInst *Load, Loop *L) {
164 return Load->getParent() != L->getHeader();
165}
166
167namespace {
168
169/// The per-loop class that does most of the work.
170class LoadEliminationForLoop {
171public:
172 LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
175 : L(L), LI(LI), LAI(LAI), DT(DT), BFI(BFI), PSI(PSI), PSE(LAI.getPSE()) {}
176
177 /// Look through the loop-carried and loop-independent dependences in
178 /// this loop and find store->load dependences.
179 ///
180 /// Note that no candidate is returned if LAA has failed to analyze the loop
181 /// (e.g. if it's not bottom-tested, contains volatile memops, etc.)
182 std::forward_list<StoreToLoadForwardingCandidate>
183 findStoreToLoadDependences(const LoopAccessInfo &LAI) {
184 std::forward_list<StoreToLoadForwardingCandidate> Candidates;
185
186 const auto &DepChecker = LAI.getDepChecker();
187 const auto *Deps = DepChecker.getDependences();
188 if (!Deps)
189 return Candidates;
190
191 // Find store->load dependences (consequently true dep). Both lexically
192 // forward and backward dependences qualify. Disqualify loads that have
193 // other unknown dependences.
194
195 SmallPtrSet<Instruction *, 4> LoadsWithUnknownDependence;
196
197 for (const auto &Dep : *Deps) {
198 Instruction *Source = Dep.getSource(DepChecker);
199 Instruction *Destination = Dep.getDestination(DepChecker);
200
203 if (isa<LoadInst>(Source))
204 LoadsWithUnknownDependence.insert(Source);
205 if (isa<LoadInst>(Destination))
206 LoadsWithUnknownDependence.insert(Destination);
207 continue;
208 }
209
210 if (Dep.isBackward())
211 // Note that the designations source and destination follow the program
212 // order, i.e. source is always first. (The direction is given by the
213 // DepType.)
214 std::swap(Source, Destination);
215 else
216 assert(Dep.isForward() && "Needs to be a forward dependence");
217
218 auto *Store = dyn_cast<StoreInst>(Source);
219 if (!Store)
220 continue;
221 auto *Load = dyn_cast<LoadInst>(Destination);
222 if (!Load)
223 continue;
224
225 // Only propagate if the stored values are bit/pointer castable.
228 Store->getDataLayout()))
229 continue;
230
231 Candidates.emplace_front(Load, Store);
232 }
233
234 if (!LoadsWithUnknownDependence.empty())
235 Candidates.remove_if([&](const StoreToLoadForwardingCandidate &C) {
236 return LoadsWithUnknownDependence.count(C.Load);
237 });
238
239 return Candidates;
240 }
241
242 /// Return the index of the instruction according to program order.
243 unsigned getInstrIndex(Instruction *Inst) {
244 auto I = InstOrder.find(Inst);
245 assert(I != InstOrder.end() && "No index for instruction");
246 return I->second;
247 }
248
249 /// If a load has multiple candidates associated (i.e. different
250 /// stores), it means that it could be forwarding from multiple stores
251 /// depending on control flow. Remove these candidates.
252 ///
253 /// Here, we rely on LAA to include the relevant loop-independent dependences.
254 /// LAA is known to omit these in the very simple case when the read and the
255 /// write within an alias set always takes place using the *same* pointer.
256 ///
257 /// However, we know that this is not the case here, i.e. we can rely on LAA
258 /// to provide us with loop-independent dependences for the cases we're
259 /// interested. Consider the case for example where a loop-independent
260 /// dependece S1->S2 invalidates the forwarding S3->S2.
261 ///
262 /// A[i] = ... (S1)
263 /// ... = A[i] (S2)
264 /// A[i+1] = ... (S3)
265 ///
266 /// LAA will perform dependence analysis here because there are two
267 /// *different* pointers involved in the same alias set (&A[i] and &A[i+1]).
268 void removeDependencesFromMultipleStores(
269 std::forward_list<StoreToLoadForwardingCandidate> &Candidates) {
270 // If Store is nullptr it means that we have multiple stores forwarding to
271 // this store.
272 using LoadToSingleCandT =
274 LoadToSingleCandT LoadToSingleCand;
275
276 for (const auto &Cand : Candidates) {
277 bool NewElt;
278 LoadToSingleCandT::iterator Iter;
279
280 std::tie(Iter, NewElt) =
281 LoadToSingleCand.insert(std::make_pair(Cand.Load, &Cand));
282 if (!NewElt) {
283 const StoreToLoadForwardingCandidate *&OtherCand = Iter->second;
284 // Already multiple stores forward to this load.
285 if (OtherCand == nullptr)
286 continue;
287
288 // Handle the very basic case when the two stores are in the same block
289 // so deciding which one forwards is easy. The later one forwards as
290 // long as they both have a dependence distance of one to the load.
291 if (Cand.Store->getParent() == OtherCand->Store->getParent() &&
292 Cand.isDependenceDistanceOfOne(PSE, L) &&
293 OtherCand->isDependenceDistanceOfOne(PSE, L)) {
294 // They are in the same block, the later one will forward to the load.
295 if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store))
296 OtherCand = &Cand;
297 } else
298 OtherCand = nullptr;
299 }
300 }
301
302 Candidates.remove_if([&](const StoreToLoadForwardingCandidate &Cand) {
303 if (LoadToSingleCand[Cand.Load] != &Cand) {
304 LLVM_DEBUG(
305 dbgs() << "Removing from candidates: \n"
306 << Cand
307 << " The load may have multiple stores forwarding to "
308 << "it\n");
309 return true;
310 }
311 return false;
312 });
313 }
314
315 /// Given two pointers operations by their RuntimePointerChecking
316 /// indices, return true if they require an alias check.
317 ///
318 /// We need a check if one is a pointer for a candidate load and the other is
319 /// a pointer for a possibly intervening store.
320 bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2,
321 const SmallPtrSetImpl<Value *> &PtrsWrittenOnFwdingPath,
322 const SmallPtrSetImpl<Value *> &CandLoadPtrs) {
323 Value *Ptr1 =
325 Value *Ptr2 =
327 return ((PtrsWrittenOnFwdingPath.count(Ptr1) && CandLoadPtrs.count(Ptr2)) ||
328 (PtrsWrittenOnFwdingPath.count(Ptr2) && CandLoadPtrs.count(Ptr1)));
329 }
330
331 /// Return pointers that are possibly written to on the path from a
332 /// forwarding store to a load.
333 ///
334 /// These pointers need to be alias-checked against the forwarding candidates.
335 SmallPtrSet<Value *, 4> findPointersWrittenOnForwardingPath(
337 // From FirstStore to LastLoad neither of the elimination candidate loads
338 // should overlap with any of the stores.
339 //
340 // E.g.:
341 //
342 // st1 C[i]
343 // ld1 B[i] <-------,
344 // ld0 A[i] <----, | * LastLoad
345 // ... | |
346 // st2 E[i] | |
347 // st3 B[i+1] -- | -' * FirstStore
348 // st0 A[i+1] ---'
349 // st4 D[i]
350 //
351 // st0 forwards to ld0 if the accesses in st4 and st1 don't overlap with
352 // ld0.
353
354 LoadInst *LastLoad =
355 llvm::max_element(Candidates,
356 [&](const StoreToLoadForwardingCandidate &A,
357 const StoreToLoadForwardingCandidate &B) {
358 return getInstrIndex(A.Load) <
359 getInstrIndex(B.Load);
360 })
361 ->Load;
362 StoreInst *FirstStore =
363 llvm::min_element(Candidates,
364 [&](const StoreToLoadForwardingCandidate &A,
365 const StoreToLoadForwardingCandidate &B) {
366 return getInstrIndex(A.Store) <
367 getInstrIndex(B.Store);
368 })
369 ->Store;
370
371 // We're looking for stores after the first forwarding store until the end
372 // of the loop, then from the beginning of the loop until the last
373 // forwarded-to load. Collect the pointer for the stores.
374 SmallPtrSet<Value *, 4> PtrsWrittenOnFwdingPath;
375
376 auto InsertStorePtr = [&](Instruction *I) {
377 if (auto *S = dyn_cast<StoreInst>(I))
378 PtrsWrittenOnFwdingPath.insert(S->getPointerOperand());
379 };
380 const auto &MemInstrs = LAI.getDepChecker().getMemoryInstructions();
381 std::for_each(MemInstrs.begin() + getInstrIndex(FirstStore) + 1,
382 MemInstrs.end(), InsertStorePtr);
383 std::for_each(MemInstrs.begin(), &MemInstrs[getInstrIndex(LastLoad)],
384 InsertStorePtr);
385
386 return PtrsWrittenOnFwdingPath;
387 }
388
389 /// Determine the pointer alias checks to prove that there are no
390 /// intervening stores.
393
394 SmallPtrSet<Value *, 4> PtrsWrittenOnFwdingPath =
395 findPointersWrittenOnForwardingPath(Candidates);
396
397 // Collect the pointers of the candidate loads.
398 SmallPtrSet<Value *, 4> CandLoadPtrs;
399 for (const auto &Candidate : Candidates)
400 CandLoadPtrs.insert(Candidate.getLoadPtr());
401
402 const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks();
404
405 copy_if(AllChecks, std::back_inserter(Checks),
406 [&](const RuntimePointerCheck &Check) {
407 for (auto PtrIdx1 : Check.first->Members)
408 for (auto PtrIdx2 : Check.second->Members)
409 if (needsChecking(PtrIdx1, PtrIdx2, PtrsWrittenOnFwdingPath,
410 CandLoadPtrs))
411 return true;
412 return false;
413 });
414
415 LLVM_DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size()
416 << "):\n");
418
419 return Checks;
420 }
421
422 /// Perform the transformation for a candidate.
423 void
424 propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand,
425 SCEVExpander &SEE) {
426 // loop:
427 // %x = load %gep_i
428 // = ... %x
429 // store %y, %gep_i_plus_1
430 //
431 // =>
432 //
433 // ph:
434 // %x.initial = load %gep_0
435 // loop:
436 // %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
437 // %x = load %gep_i <---- now dead
438 // = ... %x.storeforward
439 // store %y, %gep_i_plus_1
440
441 Value *Ptr = Cand.Load->getPointerOperand();
442 auto *PtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(Ptr));
443 auto *PH = L->getLoopPreheader();
444 assert(PH && "Preheader should exist!");
445 Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(),
446 PH->getTerminator());
447 Value *Initial =
448 new LoadInst(Cand.Load->getType(), InitialPtr, "load_initial",
449 /* isVolatile */ false, Cand.Load->getAlign(),
450 PH->getTerminator()->getIterator());
451 // We don't give any debug location to Initial, because it is inserted
452 // into the loop's preheader. A debug location inside the loop will cause
453 // a misleading stepping when debugging. The test update-debugloc-store
454 // -forwarded.ll checks this.
455
456 PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded");
457 PHI->insertBefore(L->getHeader()->begin());
458 PHI->addIncoming(Initial, PH);
459
460 Type *LoadType = Initial->getType();
461 Type *StoreType = Cand.Store->getValueOperand()->getType();
462 auto &DL = Cand.Load->getDataLayout();
463 (void)DL;
464
465 assert(DL.getTypeSizeInBits(LoadType) == DL.getTypeSizeInBits(StoreType) &&
466 "The type sizes should match!");
467
468 Value *StoreValue = Cand.Store->getValueOperand();
469 if (LoadType != StoreType) {
470 StoreValue = CastInst::CreateBitOrPointerCast(StoreValue, LoadType,
471 "store_forward_cast",
472 Cand.Store->getIterator());
473 // Because it casts the old `load` value and is used by the new `phi`
474 // which replaces the old `load`, we give the `load`'s debug location
475 // to it.
476 cast<Instruction>(StoreValue)->setDebugLoc(Cand.Load->getDebugLoc());
477 }
478
479 PHI->addIncoming(StoreValue, L->getLoopLatch());
480
481 Cand.Load->replaceAllUsesWith(PHI);
482 PHI->setDebugLoc(Cand.Load->getDebugLoc());
483 }
484
485 /// Top-level driver for each loop: find store->load forwarding
486 /// candidates, add run-time checks and perform transformation.
487 bool processLoop() {
488 LLVM_DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName()
489 << "\" checking " << *L << "\n");
490
491 // Look for store-to-load forwarding cases across the
492 // backedge. E.g.:
493 //
494 // loop:
495 // %x = load %gep_i
496 // = ... %x
497 // store %y, %gep_i_plus_1
498 //
499 // =>
500 //
501 // ph:
502 // %x.initial = load %gep_0
503 // loop:
504 // %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
505 // %x = load %gep_i <---- now dead
506 // = ... %x.storeforward
507 // store %y, %gep_i_plus_1
508
509 // First start with store->load dependences.
510 auto StoreToLoadDependences = findStoreToLoadDependences(LAI);
511 if (StoreToLoadDependences.empty())
512 return false;
513
514 // Generate an index for each load and store according to the original
515 // program order. This will be used later.
516 InstOrder = LAI.getDepChecker().generateInstructionOrderMap();
517
518 // To keep things simple for now, remove those where the load is potentially
519 // fed by multiple stores.
520 removeDependencesFromMultipleStores(StoreToLoadDependences);
521 if (StoreToLoadDependences.empty())
522 return false;
523
524 // Filter the candidates further.
526 for (const StoreToLoadForwardingCandidate &Cand : StoreToLoadDependences) {
527 LLVM_DEBUG(dbgs() << "Candidate " << Cand);
528
529 // Make sure that the stored values is available everywhere in the loop in
530 // the next iteration.
531 if (!doesStoreDominatesAllLatches(Cand.Store->getParent(), L, DT))
532 continue;
533
534 // If the load is conditional we can't hoist its 0-iteration instance to
535 // the preheader because that would make it unconditional. Thus we would
536 // access a memory location that the original loop did not access.
537 if (isLoadConditional(Cand.Load, L))
538 continue;
539
540 // Check whether the SCEV difference is the same as the induction step,
541 // thus we load the value in the next iteration.
542 if (!Cand.isDependenceDistanceOfOne(PSE, L))
543 continue;
544
545 assert(isa<SCEVAddRecExpr>(PSE.getSCEV(Cand.Load->getPointerOperand())) &&
546 "Loading from something other than indvar?");
547 assert(
548 isa<SCEVAddRecExpr>(PSE.getSCEV(Cand.Store->getPointerOperand())) &&
549 "Storing to something other than indvar?");
550
551 Candidates.push_back(Cand);
553 dbgs()
554 << Candidates.size()
555 << ". Valid store-to-load forwarding across the loop backedge\n");
556 }
557 if (Candidates.empty())
558 return false;
559
560 // Check intervening may-alias stores. These need runtime checks for alias
561 // disambiguation.
562 SmallVector<RuntimePointerCheck, 4> Checks = collectMemchecks(Candidates);
563
564 // Too many checks are likely to outweigh the benefits of forwarding.
565 if (Checks.size() > Candidates.size() * CheckPerElim) {
566 LLVM_DEBUG(dbgs() << "Too many run-time checks needed.\n");
567 return false;
568 }
569
570 if (LAI.getPSE().getPredicate().getComplexity() >
572 LLVM_DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
573 return false;
574 }
575
576 if (!L->isLoopSimplifyForm()) {
577 LLVM_DEBUG(dbgs() << "Loop is not is loop-simplify form");
578 return false;
579 }
580
581 if (!Checks.empty() || !LAI.getPSE().getPredicate().isAlwaysTrue()) {
582 if (LAI.hasConvergentOp()) {
583 LLVM_DEBUG(dbgs() << "Versioning is needed but not allowed with "
584 "convergent calls\n");
585 return false;
586 }
587
588 auto *HeaderBB = L->getHeader();
589 auto *F = HeaderBB->getParent();
590 bool OptForSize = F->hasOptSize() ||
591 llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI,
592 PGSOQueryType::IRPass);
593 if (OptForSize) {
595 dbgs() << "Versioning is needed but not allowed when optimizing "
596 "for size.\n");
597 return false;
598 }
599
600 // Point of no-return, start the transformation. First, version the loop
601 // if necessary.
602
603 LoopVersioning LV(LAI, Checks, L, LI, DT, PSE.getSE());
604 LV.versionLoop();
605
606 // After versioning, some of the candidates' pointers could stop being
607 // SCEVAddRecs. We need to filter them out.
608 auto NoLongerGoodCandidate = [this](
609 const StoreToLoadForwardingCandidate &Cand) {
610 return !isa<SCEVAddRecExpr>(
611 PSE.getSCEV(Cand.Load->getPointerOperand())) ||
612 !isa<SCEVAddRecExpr>(
613 PSE.getSCEV(Cand.Store->getPointerOperand()));
614 };
615 llvm::erase_if(Candidates, NoLongerGoodCandidate);
616 }
617
618 // Next, propagate the value stored by the store to the users of the load.
619 // Also for the first iteration, generate the initial value of the load.
620 SCEVExpander SEE(*PSE.getSE(), L->getHeader()->getDataLayout(),
621 "storeforward");
622 for (const auto &Cand : Candidates)
623 propagateStoredValueToLoadUsers(Cand, SEE);
624 NumLoopLoadEliminted += Candidates.size();
625
626 return true;
627 }
628
629private:
630 Loop *L;
631
632 /// Maps the load/store instructions to their index according to
633 /// program order.
635
636 // Analyses used.
637 LoopInfo *LI;
638 const LoopAccessInfo &LAI;
639 DominatorTree *DT;
643};
644
645} // end anonymous namespace
646
648 DominatorTree &DT,
652 LoopAccessInfoManager &LAIs) {
653 // Build up a worklist of inner-loops to transform to avoid iterator
654 // invalidation.
655 // FIXME: This logic comes from other passes that actually change the loop
656 // nest structure. It isn't clear this is necessary (or useful) for a pass
657 // which merely optimizes the use of loads in a loop.
658 SmallVector<Loop *, 8> Worklist;
659
660 bool Changed = false;
661
662 for (Loop *TopLevelLoop : LI)
663 for (Loop *L : depth_first(TopLevelLoop)) {
664 Changed |= simplifyLoop(L, &DT, &LI, SE, AC, /*MSSAU*/ nullptr, false);
665 // We only handle inner-most loops.
666 if (L->isInnermost())
667 Worklist.push_back(L);
668 }
669
670 // Now walk the identified inner loops.
671 for (Loop *L : Worklist) {
672 // Match historical behavior
673 if (!L->isRotatedForm() || !L->getExitingBlock())
674 continue;
675 // The actual work is performed by LoadEliminationForLoop.
676 LoadEliminationForLoop LEL(L, &LI, LAIs.getInfo(*L), &DT, BFI, PSI);
677 Changed |= LEL.processLoop();
678 if (Changed)
679 LAIs.clear();
680 }
681 return Changed;
682}
683
686 auto &LI = AM.getResult<LoopAnalysis>(F);
687 // There are no loops in the function. Return before computing other expensive
688 // analyses.
689 if (LI.empty())
690 return PreservedAnalyses::all();
691 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
692 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
693 auto &AC = AM.getResult<AssumptionAnalysis>(F);
694 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
695 auto *PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
696 auto *BFI = (PSI && PSI->hasProfileSummary()) ?
697 &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
699
700 bool Changed = eliminateLoadsAcrossLoops(F, LI, DT, BFI, PSI, &SE, &AC, LAIs);
701
702 if (!Changed)
703 return PreservedAnalyses::all();
704
708 return PA;
709}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
Rewrite undef for PHI
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file builds on the ADT/GraphTraits.h file to build generic depth first graph iterator.
#define Check(C,...)
This is the interface for a simple mod/ref and alias analysis over globals.
This header provides classes for managing per-loop analyses.
static bool eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, ScalarEvolution *SE, AssumptionCache *AC, LoopAccessInfoManager &LAIs)
static cl::opt< unsigned > LoadElimSCEVCheckThreshold("loop-load-elimination-scev-check-threshold", cl::init(8), cl::Hidden, cl::desc("The maximum number of SCEV checks allowed for Loop " "Load Elimination"))
static bool isLoadConditional(LoadInst *Load, Loop *L)
Return true if the load is not executed on all paths in the loop.
static bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L, DominatorTree *DT)
Check if the store dominates all latches, so as long as there is no intervening store this value will...
static cl::opt< unsigned > CheckPerElim("runtime-check-per-loop-load-elim", cl::Hidden, cl::desc("Max number of memchecks allowed per eliminated load on average"), cl::init(1))
This header defines the LoopLoadEliminationPass object.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Module.h This file contains the declarations for the Module class.
if(PassOpts->AAPipeline)
This header defines various interfaces for pass management in LLVM.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:77
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:405
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
static bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
static CastInst * CreateBitOrPointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, a PtrToInt, or an IntToPTr cast instruction.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
An instruction for reading from memory.
Definition: Instructions.h:174
This analysis provides dependence information for the memory accesses of a loop.
const LoopAccessInfo & getInfo(Loop &L)
Drive the analysis of memory accesses in the loop.
const MemoryDepChecker & getDepChecker() const
the Memory Dependence Checker which can determine the loop-independent and loop-carried dependences b...
const RuntimePointerChecking * getRuntimePointerChecking() const
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
bool hasConvergentOp() const
Return true if there is a convergent operation in the loop.
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
This class emits a version of the loop where run-time checks ensure that may-alias pointers can't ove...
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
const SmallVectorImpl< Instruction * > & getMemoryInstructions() const
The vector of memory access instructions.
const SmallVectorImpl< Dependence > * getDependences() const
Returns the memory dependences.
DenseMap< Instruction *, unsigned > generateInstructionOrderMap() const
Generate a mapping between the memory instructions and their indices according to program order.
An analysis over an "inner" IR unit that provides access to an analysis manager over a "outer" IR uni...
Definition: PassManager.h:688
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserve()
Mark an analysis as preserved.
Definition: Analysis.h:131
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
void printChecks(raw_ostream &OS, const SmallVectorImpl< RuntimePointerCheck > &Checks, unsigned Depth=0) const
Print Checks.
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
const PointerInfo & getPointerInfo(unsigned PtrIdx) const
Return PointerInfo for pointer at index PtrIdx.
This class uses information about analyze scalars to rewrite expressions in canonical form.
virtual unsigned getComplexity() const
Returns the estimated complexity of this predicate.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:346
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:435
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:367
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:502
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:290
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1987
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
std::pair< const RuntimeCheckingPtrGroup *, const RuntimeCheckingPtrGroup * > RuntimePointerCheck
A memcheck which made up of a pair of grouped pointers.
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1768
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1997
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition: APFixedPoint.h:292
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2082
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
iterator_range< df_iterator< T > > depth_first(const T &G)
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define SEE(c)
Definition: regcomp.c:254
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
TrackingVH< Value > PointerValue
Holds the pointer value that we need to check.