LLVM 20.0.0git
LoopDataPrefetch.cpp
Go to the documentation of this file.
1//===-------- LoopDataPrefetch.cpp - Loop Data Prefetching Pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a Loop Data Prefetching Pass.
10//
11//===----------------------------------------------------------------------===//
12
15
17#include "llvm/ADT/Statistic.h"
25#include "llvm/IR/Dominators.h"
26#include "llvm/IR/Function.h"
28#include "llvm/Support/Debug.h"
32
33#define DEBUG_TYPE "loop-data-prefetch"
34
35using namespace llvm;
36
37// By default, we limit this to creating 16 PHIs (which is a little over half
38// of the allocatable register set).
39static cl::opt<bool>
40PrefetchWrites("loop-prefetch-writes", cl::Hidden, cl::init(false),
41 cl::desc("Prefetch write addresses"));
42
44 PrefetchDistance("prefetch-distance",
45 cl::desc("Number of instructions to prefetch ahead"),
47
49 MinPrefetchStride("min-prefetch-stride",
50 cl::desc("Min stride to add prefetches"), cl::Hidden);
51
53 "max-prefetch-iters-ahead",
54 cl::desc("Max number of iterations to prefetch ahead"), cl::Hidden);
55
56STATISTIC(NumPrefetches, "Number of prefetches inserted");
57
58namespace {
59
60/// Loop prefetch implementation class.
61class LoopDataPrefetch {
62public:
63 LoopDataPrefetch(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI,
66 : AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}
67
68 bool run();
69
70private:
71 bool runOnLoop(Loop *L);
72
73 /// Check if the stride of the accesses is large enough to
74 /// warrant a prefetch.
75 bool isStrideLargeEnough(const SCEVAddRecExpr *AR, unsigned TargetMinStride);
76
77 unsigned getMinPrefetchStride(unsigned NumMemAccesses,
78 unsigned NumStridedMemAccesses,
79 unsigned NumPrefetches,
80 bool HasCall) {
81 if (MinPrefetchStride.getNumOccurrences() > 0)
82 return MinPrefetchStride;
83 return TTI->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
84 NumPrefetches, HasCall);
85 }
86
87 unsigned getPrefetchDistance() {
88 if (PrefetchDistance.getNumOccurrences() > 0)
89 return PrefetchDistance;
90 return TTI->getPrefetchDistance();
91 }
92
93 unsigned getMaxPrefetchIterationsAhead() {
94 if (MaxPrefetchIterationsAhead.getNumOccurrences() > 0)
97 }
98
99 bool doPrefetchWrites() {
100 if (PrefetchWrites.getNumOccurrences() > 0)
101 return PrefetchWrites;
102 return TTI->enableWritePrefetching();
103 }
104
105 AssumptionCache *AC;
106 DominatorTree *DT;
107 LoopInfo *LI;
108 ScalarEvolution *SE;
111};
112
113/// Legacy class for inserting loop data prefetches.
114class LoopDataPrefetchLegacyPass : public FunctionPass {
115public:
116 static char ID; // Pass ID, replacement for typeid
117 LoopDataPrefetchLegacyPass() : FunctionPass(ID) {
119 }
120
121 void getAnalysisUsage(AnalysisUsage &AU) const override {
133 }
134
135 bool runOnFunction(Function &F) override;
136 };
137}
138
139char LoopDataPrefetchLegacyPass::ID = 0;
140INITIALIZE_PASS_BEGIN(LoopDataPrefetchLegacyPass, "loop-data-prefetch",
141 "Loop Data Prefetch", false, false)
145INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
148INITIALIZE_PASS_END(LoopDataPrefetchLegacyPass, "loop-data-prefetch",
150
152 return new LoopDataPrefetchLegacyPass();
153}
154
155bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR,
156 unsigned TargetMinStride) {
157 // No need to check if any stride goes.
158 if (TargetMinStride <= 1)
159 return true;
160
161 const auto *ConstStride = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
162 // If MinStride is set, don't prefetch unless we can ensure that stride is
163 // larger.
164 if (!ConstStride)
165 return false;
166
167 unsigned AbsStride = std::abs(ConstStride->getAPInt().getSExtValue());
168 return TargetMinStride <= AbsStride;
169}
170
174 LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
180
181 LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
182 bool Changed = LDP.run();
183
184 if (Changed) {
188 return PA;
189 }
190
191 return PreservedAnalyses::all();
192}
193
194bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) {
195 if (skipFunction(F))
196 return false;
197
198 DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
199 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
200 ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
201 AssumptionCache *AC =
202 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
204 &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
205 const TargetTransformInfo *TTI =
206 &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
207
208 LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
209 return LDP.run();
210}
211
212bool LoopDataPrefetch::run() {
213 // If PrefetchDistance is not set, don't run the pass. This gives an
214 // opportunity for targets to run this pass for selected subtargets only
215 // (whose TTI sets PrefetchDistance and CacheLineSize).
216 if (getPrefetchDistance() == 0 || TTI->getCacheLineSize() == 0) {
217 LLVM_DEBUG(dbgs() << "Please set both PrefetchDistance and CacheLineSize "
218 "for loop data prefetch.\n");
219 return false;
220 }
221
222 bool MadeChange = false;
223
224 for (Loop *I : *LI)
225 for (Loop *L : depth_first(I))
226 MadeChange |= runOnLoop(L);
227
228 return MadeChange;
229}
230
231/// A record for a potential prefetch made during the initial scan of the
232/// loop. This is used to let a single prefetch target multiple memory accesses.
233struct Prefetch {
234 /// The address formula for this prefetch as returned by ScalarEvolution.
236 /// The point of insertion for the prefetch instruction.
237 Instruction *InsertPt = nullptr;
238 /// True if targeting a write memory access.
239 bool Writes = false;
240 /// The (first seen) prefetched instruction.
241 Instruction *MemI = nullptr;
242
243 /// Constructor to create a new Prefetch for \p I.
244 Prefetch(const SCEVAddRecExpr *L, Instruction *I) : LSCEVAddRec(L) {
245 addInstruction(I);
246 };
247
248 /// Add the instruction \param I to this prefetch. If it's not the first
249 /// one, 'InsertPt' and 'Writes' will be updated as required.
250 /// \param PtrDiff the known constant address difference to the first added
251 /// instruction.
253 int64_t PtrDiff = 0) {
254 if (!InsertPt) {
255 MemI = I;
256 InsertPt = I;
257 Writes = isa<StoreInst>(I);
258 } else {
259 BasicBlock *PrefBB = InsertPt->getParent();
260 BasicBlock *InsBB = I->getParent();
261 if (PrefBB != InsBB) {
262 BasicBlock *DomBB = DT->findNearestCommonDominator(PrefBB, InsBB);
263 if (DomBB != PrefBB)
264 InsertPt = DomBB->getTerminator();
265 }
266
267 if (isa<StoreInst>(I) && PtrDiff == 0)
268 Writes = true;
269 }
270 }
271};
272
273bool LoopDataPrefetch::runOnLoop(Loop *L) {
274 bool MadeChange = false;
275
276 // Only prefetch in the inner-most loop
277 if (!L->isInnermost())
278 return MadeChange;
279
281 CodeMetrics::collectEphemeralValues(L, AC, EphValues);
282
283 // Calculate the number of iterations ahead to prefetch
285 bool HasCall = false;
286 for (const auto BB : L->blocks()) {
287 // If the loop already has prefetches, then assume that the user knows
288 // what they are doing and don't add any more.
289 for (auto &I : *BB) {
290 if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
291 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
292 if (F->getIntrinsicID() == Intrinsic::prefetch)
293 return MadeChange;
294 if (TTI->isLoweredToCall(F))
295 HasCall = true;
296 } else { // indirect call.
297 HasCall = true;
298 }
299 }
300 }
301 Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
302 }
303
304 if (!Metrics.NumInsts.isValid())
305 return MadeChange;
306
307 unsigned LoopSize = *Metrics.NumInsts.getValue();
308 if (!LoopSize)
309 LoopSize = 1;
310
311 unsigned ItersAhead = getPrefetchDistance() / LoopSize;
312 if (!ItersAhead)
313 ItersAhead = 1;
314
315 if (ItersAhead > getMaxPrefetchIterationsAhead())
316 return MadeChange;
317
318 unsigned ConstantMaxTripCount = SE->getSmallConstantMaxTripCount(L);
319 if (ConstantMaxTripCount && ConstantMaxTripCount < ItersAhead + 1)
320 return MadeChange;
321
322 unsigned NumMemAccesses = 0;
323 unsigned NumStridedMemAccesses = 0;
324 SmallVector<Prefetch, 16> Prefetches;
325 for (const auto BB : L->blocks())
326 for (auto &I : *BB) {
327 Value *PtrValue;
328 Instruction *MemI;
329
330 if (LoadInst *LMemI = dyn_cast<LoadInst>(&I)) {
331 MemI = LMemI;
332 PtrValue = LMemI->getPointerOperand();
333 } else if (StoreInst *SMemI = dyn_cast<StoreInst>(&I)) {
334 if (!doPrefetchWrites()) continue;
335 MemI = SMemI;
336 PtrValue = SMemI->getPointerOperand();
337 } else continue;
338
339 unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
340 if (!TTI->shouldPrefetchAddressSpace(PtrAddrSpace))
341 continue;
342 NumMemAccesses++;
343 if (L->isLoopInvariant(PtrValue))
344 continue;
345
346 const SCEV *LSCEV = SE->getSCEV(PtrValue);
347 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
348 if (!LSCEVAddRec)
349 continue;
350 NumStridedMemAccesses++;
351
352 // We don't want to double prefetch individual cache lines. If this
353 // access is known to be within one cache line of some other one that
354 // has already been prefetched, then don't prefetch this one as well.
355 bool DupPref = false;
356 for (auto &Pref : Prefetches) {
357 const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.LSCEVAddRec);
358 if (const SCEVConstant *ConstPtrDiff =
359 dyn_cast<SCEVConstant>(PtrDiff)) {
360 int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue());
361 if (PD < (int64_t) TTI->getCacheLineSize()) {
362 Pref.addInstruction(MemI, DT, PD);
363 DupPref = true;
364 break;
365 }
366 }
367 }
368 if (!DupPref)
369 Prefetches.push_back(Prefetch(LSCEVAddRec, MemI));
370 }
371
372 unsigned TargetMinStride =
373 getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
374 Prefetches.size(), HasCall);
375
376 LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead
377 << " iterations ahead (loop size: " << LoopSize << ") in "
378 << L->getHeader()->getParent()->getName() << ": " << *L);
379 LLVM_DEBUG(dbgs() << "Loop has: "
380 << NumMemAccesses << " memory accesses, "
381 << NumStridedMemAccesses << " strided memory accesses, "
382 << Prefetches.size() << " potential prefetch(es), "
383 << "a minimum stride of " << TargetMinStride << ", "
384 << (HasCall ? "calls" : "no calls") << ".\n");
385
386 for (auto &P : Prefetches) {
387 // Check if the stride of the accesses is large enough to warrant a
388 // prefetch.
389 if (!isStrideLargeEnough(P.LSCEVAddRec, TargetMinStride))
390 continue;
391
392 BasicBlock *BB = P.InsertPt->getParent();
393 SCEVExpander SCEVE(*SE, BB->getDataLayout(), "prefaddr");
394 const SCEV *NextLSCEV = SE->getAddExpr(P.LSCEVAddRec, SE->getMulExpr(
395 SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead),
396 P.LSCEVAddRec->getStepRecurrence(*SE)));
397 if (!SCEVE.isSafeToExpand(NextLSCEV))
398 continue;
399
400 unsigned PtrAddrSpace = NextLSCEV->getType()->getPointerAddressSpace();
401 Type *I8Ptr = PointerType::get(BB->getContext(), PtrAddrSpace);
402 Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, P.InsertPt);
403
404 IRBuilder<> Builder(P.InsertPt);
406 Builder.CreateIntrinsic(Intrinsic::prefetch, PrefPtrValue->getType(),
407 {PrefPtrValue, ConstantInt::get(I32, P.Writes),
408 ConstantInt::get(I32, 3),
409 ConstantInt::get(I32, 1)});
410 ++NumPrefetches;
411 LLVM_DEBUG(dbgs() << " Access: "
412 << *P.MemI->getOperand(isa<LoadInst>(P.MemI) ? 0 : 1)
413 << ", SCEV: " << *P.LSCEVAddRec << "\n");
414 ORE->emit([&]() {
415 return OptimizationRemark(DEBUG_TYPE, "Prefetched", P.MemI)
416 << "prefetched memory access";
417 });
418
419 MadeChange = true;
420 }
421
422 return MadeChange;
423}
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file builds on the ADT/GraphTraits.h file to build generic depth first graph iterator.
SmallVector< uint32_t, 0 > Writes
Definition: ELF_riscv.cpp:497
#define DEBUG_TYPE
static cl::opt< bool > PrefetchWrites("loop-prefetch-writes", cl::Hidden, cl::init(false), cl::desc("Prefetch write addresses"))
static cl::opt< unsigned > MinPrefetchStride("min-prefetch-stride", cl::desc("Min stride to add prefetches"), cl::Hidden)
static cl::opt< unsigned > PrefetchDistance("prefetch-distance", cl::desc("Number of instructions to prefetch ahead"), cl::Hidden)
static cl::opt< unsigned > MaxPrefetchIterationsAhead("max-prefetch-iters-ahead", cl::desc("Max number of iterations to prefetch ahead"), cl::Hidden)
loop data prefetch
This file provides the interface for LLVM's Loop Data Prefetching Pass.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Machine Trace Metrics
static const Function * getCalledFunction(const Value *V)
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
This pass exposes codegen information to IR-level passes.
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
Represent the analysis usage information of a pass.
AnalysisUsage & addRequiredID(const void *ID)
Definition: Pass.cpp:270
AnalysisUsage & addPreservedID(const void *ID)
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
Definition: BasicBlock.cpp:296
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:317
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
Definition: Dominators.cpp:344
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:310
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overriden by subclasses to do the per-function processing of the pass.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2697
An instruction for reading from memory.
Definition: Instructions.h:176
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Run the pass over the function.
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:593
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
OptimizationRemarkEmitter legacy analysis pass.
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:98
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserve()
Mark an analysis as preserved.
Definition: Analysis.h:131
This node represents a polynomial recurrence on the trip count of the specified loop.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
This class represents a constant integer value.
This class uses information about analyze scalars to rewrite expressions in canonical form.
This class represents an analyzed expression in the program.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
Analysis pass providing the TargetTransformInfo.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
unsigned getMaxPrefetchIterationsAhead() const
unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Some HW prefetchers can handle accesses up to a certain constant stride.
bool shouldPrefetchAddressSpace(unsigned AS) const
bool isLoweredToCall(const Function *F) const
Test whether calls to a function lower to actual program function calls.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static IntegerType * getInt32Ty(LLVMContext &C)
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
const ParentTy * getParent() const
Definition: ilist_node.h:32
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ PD
PD - Prefix code for packed double precision vector floating point operations performed in the SSE re...
Definition: X86BaseInfo.h:721
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
char & LoopSimplifyID
void initializeLoopDataPrefetchLegacyPassPass(PassRegistry &)
FunctionPass * createLoopDataPrefetchPass()
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
iterator_range< df_iterator< T > > depth_first(const T &G)
A record for a potential prefetch made during the initial scan of the loop.
void addInstruction(Instruction *I, DominatorTree *DT=nullptr, int64_t PtrDiff=0)
Add the instruction.
const SCEVAddRecExpr * LSCEVAddRec
The address formula for this prefetch as returned by ScalarEvolution.
Prefetch(const SCEVAddRecExpr *L, Instruction *I)
Constructor to create a new Prefetch for I.
Utility to calculate the size and a few similar metrics for a set of basic blocks.
Definition: CodeMetrics.h:33
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71