Go to the documentation of this file.
63 #include <forward_list>
69 #define LLE_OPTION "loop-load-elim"
70 #define DEBUG_TYPE LLE_OPTION
73 "runtime-check-per-loop-load-elim",
cl::Hidden,
74 cl::desc(
"Max number of memchecks allowed per eliminated load on average"),
79 cl::desc(
"The maximum number of SCEV checks allowed for Loop "
82 STATISTIC(NumLoopLoadEliminted,
"Number of loads eliminated by LLE");
87 struct StoreToLoadForwardingCandidate {
98 Value *LoadPtr =
Load->getPointerOperand();
105 "Should be a known dependence");
114 auto &
DL =
Load->getParent()->getModule()->getDataLayout();
115 unsigned TypeByteSize =
DL.getTypeAllocSize(
const_cast<Type *
>(LoadType));
117 auto *LoadPtrSCEV = cast<SCEVAddRecExpr>(PSE.
getSCEV(LoadPtr));
118 auto *StorePtrSCEV = cast<SCEVAddRecExpr>(PSE.
getSCEV(StorePtr));
122 auto *Dist = cast<SCEVConstant>(
124 const APInt &Val = Dist->getAPInt();
125 return Val == TypeByteSize;
128 Value *getLoadPtr()
const {
return Load->getPointerOperand(); }
132 const StoreToLoadForwardingCandidate &Cand) {
133 OS << *Cand.Store <<
" -->\n";
134 OS.
indent(2) << *Cand.Load <<
"\n";
161 class LoadEliminationForLoop {
166 : L(L), LI(LI), LAI(LAI), DT(DT),
BFI(
BFI), PSI(PSI), PSE(LAI.getPSE()) {}
173 std::forward_list<StoreToLoadForwardingCandidate>
175 std::forward_list<StoreToLoadForwardingCandidate> Candidates;
187 for (
const auto &Dep : *Deps) {
189 Instruction *Destination = Dep.getDestination(LAI);
192 if (isa<LoadInst>(
Source))
194 if (isa<LoadInst>(Destination))
195 LoadsWithUnknownDepedence.
insert(Destination);
199 if (Dep.isBackward())
205 assert(Dep.isForward() &&
"Needs to be a forward dependence");
210 auto *
Load = dyn_cast<LoadInst>(Destination);
215 if (
Store->getPointerOperandType() !=
Load->getPointerOperandType() ||
222 if (!LoadsWithUnknownDepedence.
empty())
223 Candidates.remove_if([&](
const StoreToLoadForwardingCandidate &
C) {
224 return LoadsWithUnknownDepedence.
count(
C.Load);
232 auto I = InstOrder.find(Inst);
233 assert(
I != InstOrder.end() &&
"No index for instruction");
256 void removeDependencesFromMultipleStores(
257 std::forward_list<StoreToLoadForwardingCandidate> &Candidates) {
260 using LoadToSingleCandT =
262 LoadToSingleCandT LoadToSingleCand;
264 for (
const auto &Cand : Candidates) {
266 LoadToSingleCandT::iterator Iter;
268 std::tie(Iter, NewElt) =
269 LoadToSingleCand.
insert(std::make_pair(Cand.Load, &Cand));
271 const StoreToLoadForwardingCandidate *&OtherCand = Iter->second;
273 if (OtherCand ==
nullptr)
279 if (Cand.Store->getParent() == OtherCand->Store->getParent() &&
280 Cand.isDependenceDistanceOfOne(PSE, L) &&
281 OtherCand->isDependenceDistanceOfOne(PSE, L)) {
283 if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store))
290 Candidates.remove_if([&](
const StoreToLoadForwardingCandidate &Cand) {
291 if (LoadToSingleCand[Cand.Load] != &Cand) {
293 dbgs() <<
"Removing from candidates: \n"
295 <<
" The load may have multiple stores forwarding to "
308 bool needsChecking(
unsigned PtrIdx1,
unsigned PtrIdx2,
315 return ((PtrsWrittenOnFwdingPath.
count(Ptr1) && CandLoadPtrs.
count(Ptr2)) ||
316 (PtrsWrittenOnFwdingPath.
count(Ptr2) && CandLoadPtrs.
count(Ptr1)));
343 std::max_element(Candidates.begin(), Candidates.end(),
344 [&](
const StoreToLoadForwardingCandidate &A,
345 const StoreToLoadForwardingCandidate &
B) {
346 return getInstrIndex(A.Load) < getInstrIndex(B.Load);
350 std::min_element(Candidates.begin(), Candidates.end(),
351 [&](
const StoreToLoadForwardingCandidate &A,
352 const StoreToLoadForwardingCandidate &
B) {
353 return getInstrIndex(A.Store) <
354 getInstrIndex(B.Store);
364 if (
auto *
S = dyn_cast<StoreInst>(
I))
365 PtrsWrittenOnFwdingPath.
insert(
S->getPointerOperand());
368 std::for_each(MemInstrs.begin() + getInstrIndex(FirstStore) + 1,
369 MemInstrs.end(), InsertStorePtr);
370 std::for_each(MemInstrs.begin(), &MemInstrs[getInstrIndex(LastLoad)],
373 return PtrsWrittenOnFwdingPath;
382 findPointersWrittenOnForwardingPath(Candidates);
386 for (
const auto &Candidate : Candidates)
387 CandLoadPtrs.
insert(Candidate.getLoadPtr());
392 copy_if(AllChecks, std::back_inserter(Checks),
394 for (
auto PtrIdx1 :
Check.first->Members)
395 for (
auto PtrIdx2 :
Check.second->Members)
396 if (needsChecking(PtrIdx1, PtrIdx2, PtrsWrittenOnFwdingPath,
411 propagateStoredValueToLoadUsers(
const StoreToLoadForwardingCandidate &Cand,
428 Value *Ptr = Cand.Load->getPointerOperand();
429 auto *PtrSCEV = cast<SCEVAddRecExpr>(PSE.
getSCEV(Ptr));
431 assert(PH &&
"Preheader should exist!");
432 Value *InitialPtr =
SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->
getType(),
433 PH->getTerminator());
435 Cand.Load->getType(), InitialPtr,
"load_initial",
436 false, Cand.Load->getAlign(), PH->getTerminator());
443 Cand.Load->replaceAllUsesWith(PHI);
450 <<
"\" checking " << *L <<
"\n");
471 auto StoreToLoadDependences = findStoreToLoadDependences(LAI);
472 if (StoreToLoadDependences.empty())
481 removeDependencesFromMultipleStores(StoreToLoadDependences);
482 if (StoreToLoadDependences.empty())
487 for (
const StoreToLoadForwardingCandidate &Cand : StoreToLoadDependences) {
503 if (!Cand.isDependenceDistanceOfOne(PSE, L))
506 assert(isa<SCEVAddRecExpr>(PSE.
getSCEV(Cand.Load->getPointerOperand())) &&
507 "Loading from something other than indvar?");
509 isa<SCEVAddRecExpr>(PSE.
getSCEV(Cand.Store->getPointerOperand())) &&
510 "Storing to something other than indvar?");
512 Candidates.push_back(Cand);
516 <<
". Valid store-to-load forwarding across the loop backedge\n");
518 if (Candidates.empty())
545 "convergent calls\n");
550 auto *
F = HeaderBB->getParent();
551 bool OptForSize =
F->hasOptSize() ||
556 dbgs() <<
"Versioning is needed but not allowed when optimizing "
569 auto NoLongerGoodCandidate = [
this](
570 const StoreToLoadForwardingCandidate &Cand) {
571 return !isa<SCEVAddRecExpr>(
572 PSE.
getSCEV(Cand.Load->getPointerOperand())) ||
573 !isa<SCEVAddRecExpr>(
574 PSE.
getSCEV(Cand.Store->getPointerOperand()));
583 for (
const auto &Cand : Candidates)
584 propagateStoredValueToLoadUsers(Cand,
SEE);
585 NumLoopLoadEliminted += Candidates.size();
620 bool Changed =
false;
622 for (
Loop *TopLevelLoop : LI)
624 Changed |=
simplifyLoop(L, &DT, &LI, SE, AC,
nullptr,
false);
627 Worklist.push_back(L);
631 for (
Loop *L : Worklist) {
636 LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT,
BFI, PSI);
637 Changed |= LEL.processLoop();
658 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
659 auto &LAA = getAnalysis<LoopAccessLegacyAnalysis>();
660 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
661 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
663 &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
665 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
669 F, LI, DT,
BFI, PSI, SE,
nullptr,
691 static const char LLE_name[] =
"Loop Load Elimination";
704 return new LoopLoadElimination();
722 auto *
BFI = (PSI && PSI->hasProfileSummary()) ?
729 TLI,
TTI,
nullptr,
nullptr,
nullptr};
FunctionPass * createLoopLoadEliminationPass()
A set of analyses that are preserved following a run of a transformation pass.
An analysis over an "inner" IR unit that provides access to an analysis manager over a "outer" IR uni...
A manager for alias analyses.
Analysis pass providing the TargetTransformInfo.
Analysis pass that exposes the ScalarEvolution for a function.
This analysis provides dependence information for the memory accesses of a loop.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
This is an optimization pass for GlobalISel generic memory operations.
std::pair< const RuntimeCheckingPtrGroup *, const RuntimeCheckingPtrGroup * > RuntimePointerCheck
A memcheck which made up of a pair of grouped pointers.
const SCEVPredicate & getPredicate() const
const Function * getParent() const
Return the enclosing method, or null if none.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represents a single loop in the control flow graph.
This class uses information about analyze scalars to rewrite expressions in canonical form.
bool hasProfileSummary() const
Returns true if profile summary is available.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
The main scalar evolution driver.
DenseMap< Instruction *, unsigned > generateInstructionOrderMap() const
Generate a mapping between the memory instructions and their indices according to program order.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
static const char LLE_name[]
This analysis provides dependence information for the memory accesses of a loop.
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
static void getLazyBFIAnalysisUsage(AnalysisUsage &AU)
Helper for client passes to set up the analysis usage on behalf of this pass.
The instances of the Type class are immutable: once they are created, they are never changed.
int64_t getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const ValueToValueMap &StridesMap=ValueToValueMap(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
The legacy pass manager's analysis pass to compute loop information.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
This is an alternative analysis pass to BlockFrequencyInfoWrapperPass.
LLVM Basic Block Representation.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
void getLoopLatches(SmallVectorImpl< BlockT * > &LoopLatches) const
Return all loop latch blocks of this loop.
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
(vector float) vec_cmpeq(*A, *B) C
Represent the analysis usage information of a pass.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Legacy analysis pass which computes a DominatorTree.
STATISTIC(NumFunctions, "Total number of functions")
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
const SmallVectorImpl< Dependence > * getDependences() const
Returns the memory dependences.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
Analysis pass which computes BlockFrequencyInfo.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
bool hasConvergentOp() const
Return true if there is a convergent operation in the loop.
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Analysis providing profile information.
An efficient, type-erasing, non-owning reference to a callable.
bool isLoopSimplifyForm() const
Return true if the Loop is in the form that the LoopSimplify form transforms loops to,...
An instruction for storing to memory.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
A function analysis which provides an AssumptionCache.
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
This class emits a version of the loop where run-time checks ensure that may-alias pointers can't ove...
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
const RuntimePointerChecking * getRuntimePointerChecking() const
initializer< Ty > init(const Ty &Val)
Drive the analysis of memory accesses in the loop.
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
const SmallVectorImpl< Instruction * > & getMemoryInstructions() const
The vector of memory access instructions.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
static bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L, DominatorTree *DT)
Check if the store dominates all latches, so as long as there is no intervening store this value will...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Class for arbitrary precision integers.
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
const MemoryDepChecker & getDepChecker() const
the Memory Dependence Checker which can determine the loop-independent and loop-carried dependences b...
virtual unsigned getComplexity() const
Returns the estimated complexity of this predicate.
A cache of @llvm.assume calls within a function.
Type * getType() const
All values are typed, get the type of this value.
if(llvm_vc STREQUAL "") set(fake_version_inc "$
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isLoadConditional(LoadInst *Load, Loop *L)
Return true if the load is not executed on all paths in the loop.
add sub stmia L5 ldr r0 bl L_printf $stub Instead of a and a wouldn t it be better to do three moves *Return an aggregate type is even return S
static cl::opt< unsigned > LoadElimSCEVCheckThreshold("loop-load-elimination-scev-check-threshold", cl::init(8), cl::Hidden, cl::desc("The maximum number of SCEV checks allowed for Loop " "Load Elimination"))
StringRef getName() const
Return a constant reference to the value's name.
this could be done in SelectionDAGISel along with other special for
An instruction for reading from memory.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
const Instruction & front() const
iterator_range< df_iterator< T > > depth_first(const T &G)
static bool runOnFunction(Function &F, bool PostInlining)
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
static cl::opt< unsigned > CheckPerElim("runtime-check-per-loop-load-elim", cl::Hidden, cl::desc("Max number of memchecks allowed per eliminated load on average"), cl::init(1))
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
TrackingVH< Value > PointerValue
Holds the pointer value that we need to check.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
void printChecks(raw_ostream &OS, const SmallVectorImpl< RuntimePointerCheck > &Checks, unsigned Depth=0) const
Print Checks.
BlockT * getHeader() const
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
Analysis pass which computes a DominatorTree.
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
LLVM_NODISCARD bool empty() const
Legacy wrapper pass to provide the GlobalsAAResult object.
const PointerInfo & getPointerInfo(unsigned PtrIdx) const
Return PointerInfo for pointer at index PtrIdx.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
A container for analyses that lazily runs them and caches their results.
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
FunctionPass class - This class is used to implement most global optimizations.
AnalysisUsage & addRequiredID(const void *ID)
AnalysisUsage & addRequired()
bool isRotatedForm() const
Return true if the loop is in rotated form.
static bool eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, ScalarEvolution *SE, AssumptionCache *AC, function_ref< const LoopAccessInfo &(Loop &)> GetLAI)
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
void initializeLoopLoadEliminationPass(PassRegistry &)
LLVM Value Representation.
Analysis pass providing the TargetLibraryInfo.
Analysis pass that exposes the LoopInfo for a function.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.