LLVM 20.0.0git
LoopCacheAnalysis.cpp
Go to the documentation of this file.
1//===- LoopCacheAnalysis.cpp - Loop Cache Analysis -------------------------==//
2//
3// The LLVM Compiler Infrastructure
4//
5// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
6// See https://llvm.org/LICENSE.txt for license information.
7// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8//
9//===----------------------------------------------------------------------===//
10///
11/// \file
12/// This file defines the implementation for the loop cache analysis.
13/// The implementation is largely based on the following paper:
14///
15/// Compiler Optimizations for Improving Data Locality
16/// By: Steve Carr, Katherine S. McKinley, Chau-Wen Tseng
17/// http://www.cs.utexas.edu/users/mckinley/papers/asplos-1994.pdf
18///
19/// The general approach taken to estimate the number of cache lines used by the
20/// memory references in an inner loop is:
21/// 1. Partition memory references that exhibit temporal or spacial reuse
22/// into reference groups.
23/// 2. For each loop L in the a loop nest LN:
24/// a. Compute the cost of the reference group
25/// b. Compute the loop cost by summing up the reference groups costs
26//===----------------------------------------------------------------------===//
27
30#include "llvm/ADT/Sequence.h"
39#include "llvm/Support/Debug.h"
40
41using namespace llvm;
42
43#define DEBUG_TYPE "loop-cache-cost"
44
46 "default-trip-count", cl::init(100), cl::Hidden,
47 cl::desc("Use this to specify the default trip count of a loop"));
48
49// In this analysis two array references are considered to exhibit temporal
50// reuse if they access either the same memory location, or a memory location
51// with distance smaller than a configurable threshold.
53 "temporal-reuse-threshold", cl::init(2), cl::Hidden,
54 cl::desc("Use this to specify the max. distance between array elements "
55 "accessed in a loop so that the elements are classified to have "
56 "temporal reuse"));
57
58/// Retrieve the innermost loop in the given loop nest \p Loops. It returns a
59/// nullptr if any loops in the loop vector supplied has more than one sibling.
60/// The loop vector is expected to contain loops collected in breadth-first
61/// order.
63 assert(!Loops.empty() && "Expecting a non-empy loop vector");
64
65 Loop *LastLoop = Loops.back();
66 Loop *ParentLoop = LastLoop->getParentLoop();
67
68 if (ParentLoop == nullptr) {
69 assert(Loops.size() == 1 && "Expecting a single loop");
70 return LastLoop;
71 }
72
73 return (llvm::is_sorted(Loops,
74 [](const Loop *L1, const Loop *L2) {
75 return L1->getLoopDepth() < L2->getLoopDepth();
76 }))
77 ? LastLoop
78 : nullptr;
79}
80
81static bool isOneDimensionalArray(const SCEV &AccessFn, const SCEV &ElemSize,
82 const Loop &L, ScalarEvolution &SE) {
83 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(&AccessFn);
84 if (!AR || !AR->isAffine())
85 return false;
86
87 assert(AR->getLoop() && "AR should have a loop");
88
89 // Check that start and increment are not add recurrences.
90 const SCEV *Start = AR->getStart();
91 const SCEV *Step = AR->getStepRecurrence(SE);
92 if (isa<SCEVAddRecExpr>(Start) || isa<SCEVAddRecExpr>(Step))
93 return false;
94
95 // Check that start and increment are both invariant in the loop.
96 if (!SE.isLoopInvariant(Start, &L) || !SE.isLoopInvariant(Step, &L))
97 return false;
98
99 const SCEV *StepRec = AR->getStepRecurrence(SE);
100 if (StepRec && SE.isKnownNegative(StepRec))
101 StepRec = SE.getNegativeSCEV(StepRec);
102
103 return StepRec == &ElemSize;
104}
105
106/// Compute the trip count for the given loop \p L or assume a default value if
107/// it is not a compile time constant. Return the SCEV expression for the trip
108/// count.
109static const SCEV *computeTripCount(const Loop &L, const SCEV &ElemSize,
110 ScalarEvolution &SE) {
111 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(&L);
112 const SCEV *TripCount = (!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
113 isa<SCEVConstant>(BackedgeTakenCount))
114 ? SE.getTripCountFromExitCount(BackedgeTakenCount)
115 : nullptr;
116
117 if (!TripCount) {
118 LLVM_DEBUG(dbgs() << "Trip count of loop " << L.getName()
119 << " could not be computed, using DefaultTripCount\n");
120 TripCount = SE.getConstant(ElemSize.getType(), DefaultTripCount);
121 }
122
123 return TripCount;
124}
125
126//===----------------------------------------------------------------------===//
127// IndexedReference implementation
128//
130 if (!R.IsValid) {
131 OS << R.StoreOrLoadInst;
132 OS << ", IsValid=false.";
133 return OS;
134 }
135
136 OS << *R.BasePointer;
137 for (const SCEV *Subscript : R.Subscripts)
138 OS << "[" << *Subscript << "]";
139
140 OS << ", Sizes: ";
141 for (const SCEV *Size : R.Sizes)
142 OS << "[" << *Size << "]";
143
144 return OS;
145}
146
148 const LoopInfo &LI, ScalarEvolution &SE)
149 : StoreOrLoadInst(StoreOrLoadInst), SE(SE) {
150 assert((isa<StoreInst>(StoreOrLoadInst) || isa<LoadInst>(StoreOrLoadInst)) &&
151 "Expecting a load or store instruction");
152
153 IsValid = delinearize(LI);
154 if (IsValid)
155 LLVM_DEBUG(dbgs().indent(2) << "Succesfully delinearized: " << *this
156 << "\n");
157}
158
159std::optional<bool>
161 AAResults &AA) const {
162 assert(IsValid && "Expecting a valid reference");
163
164 if (BasePointer != Other.getBasePointer() && !isAliased(Other, AA)) {
165 LLVM_DEBUG(dbgs().indent(2)
166 << "No spacial reuse: different base pointers\n");
167 return false;
168 }
169
170 unsigned NumSubscripts = getNumSubscripts();
171 if (NumSubscripts != Other.getNumSubscripts()) {
172 LLVM_DEBUG(dbgs().indent(2)
173 << "No spacial reuse: different number of subscripts\n");
174 return false;
175 }
176
177 // all subscripts must be equal, except the leftmost one (the last one).
178 for (auto SubNum : seq<unsigned>(0, NumSubscripts - 1)) {
179 if (getSubscript(SubNum) != Other.getSubscript(SubNum)) {
180 LLVM_DEBUG(dbgs().indent(2) << "No spacial reuse, different subscripts: "
181 << "\n\t" << *getSubscript(SubNum) << "\n\t"
182 << *Other.getSubscript(SubNum) << "\n");
183 return false;
184 }
185 }
186
187 // the difference between the last subscripts must be less than the cache line
188 // size.
189 const SCEV *LastSubscript = getLastSubscript();
190 const SCEV *OtherLastSubscript = Other.getLastSubscript();
191 const SCEVConstant *Diff = dyn_cast<SCEVConstant>(
192 SE.getMinusSCEV(LastSubscript, OtherLastSubscript));
193
194 if (Diff == nullptr) {
195 LLVM_DEBUG(dbgs().indent(2)
196 << "No spacial reuse, difference between subscript:\n\t"
197 << *LastSubscript << "\n\t" << OtherLastSubscript
198 << "\nis not constant.\n");
199 return std::nullopt;
200 }
201
202 bool InSameCacheLine = (Diff->getValue()->getSExtValue() < CLS);
203
204 LLVM_DEBUG({
205 if (InSameCacheLine)
206 dbgs().indent(2) << "Found spacial reuse.\n";
207 else
208 dbgs().indent(2) << "No spacial reuse.\n";
209 });
210
211 return InSameCacheLine;
212}
213
214std::optional<bool>
216 unsigned MaxDistance, const Loop &L,
217 DependenceInfo &DI, AAResults &AA) const {
218 assert(IsValid && "Expecting a valid reference");
219
220 if (BasePointer != Other.getBasePointer() && !isAliased(Other, AA)) {
221 LLVM_DEBUG(dbgs().indent(2)
222 << "No temporal reuse: different base pointer\n");
223 return false;
224 }
225
226 std::unique_ptr<Dependence> D =
227 DI.depends(&StoreOrLoadInst, &Other.StoreOrLoadInst, true);
228
229 if (D == nullptr) {
230 LLVM_DEBUG(dbgs().indent(2) << "No temporal reuse: no dependence\n");
231 return false;
232 }
233
234 if (D->isLoopIndependent()) {
235 LLVM_DEBUG(dbgs().indent(2) << "Found temporal reuse\n");
236 return true;
237 }
238
239 // Check the dependence distance at every loop level. There is temporal reuse
240 // if the distance at the given loop's depth is small (|d| <= MaxDistance) and
241 // it is zero at every other loop level.
242 int LoopDepth = L.getLoopDepth();
243 int Levels = D->getLevels();
244 for (int Level = 1; Level <= Levels; ++Level) {
245 const SCEV *Distance = D->getDistance(Level);
246 const SCEVConstant *SCEVConst = dyn_cast_or_null<SCEVConstant>(Distance);
247
248 if (SCEVConst == nullptr) {
249 LLVM_DEBUG(dbgs().indent(2) << "No temporal reuse: distance unknown\n");
250 return std::nullopt;
251 }
252
253 const ConstantInt &CI = *SCEVConst->getValue();
254 if (Level != LoopDepth && !CI.isZero()) {
255 LLVM_DEBUG(dbgs().indent(2)
256 << "No temporal reuse: distance is not zero at depth=" << Level
257 << "\n");
258 return false;
259 } else if (Level == LoopDepth && CI.getSExtValue() > MaxDistance) {
261 dbgs().indent(2)
262 << "No temporal reuse: distance is greater than MaxDistance at depth="
263 << Level << "\n");
264 return false;
265 }
266 }
267
268 LLVM_DEBUG(dbgs().indent(2) << "Found temporal reuse\n");
269 return true;
270}
271
273 unsigned CLS) const {
274 assert(IsValid && "Expecting a valid reference");
275 LLVM_DEBUG({
276 dbgs().indent(2) << "Computing cache cost for:\n";
277 dbgs().indent(4) << *this << "\n";
278 });
279
280 // If the indexed reference is loop invariant the cost is one.
281 if (isLoopInvariant(L)) {
282 LLVM_DEBUG(dbgs().indent(4) << "Reference is loop invariant: RefCost=1\n");
283 return 1;
284 }
285
286 const SCEV *TripCount = computeTripCount(L, *Sizes.back(), SE);
287 assert(TripCount && "Expecting valid TripCount");
288 LLVM_DEBUG(dbgs() << "TripCount=" << *TripCount << "\n");
289
290 const SCEV *RefCost = nullptr;
291 const SCEV *Stride = nullptr;
292 if (isConsecutive(L, Stride, CLS)) {
293 // If the indexed reference is 'consecutive' the cost is
294 // (TripCount*Stride)/CLS.
295 assert(Stride != nullptr &&
296 "Stride should not be null for consecutive access!");
297 Type *WiderType = SE.getWiderType(Stride->getType(), TripCount->getType());
298 const SCEV *CacheLineSize = SE.getConstant(WiderType, CLS);
299 Stride = SE.getNoopOrAnyExtend(Stride, WiderType);
300 TripCount = SE.getNoopOrZeroExtend(TripCount, WiderType);
301 const SCEV *Numerator = SE.getMulExpr(Stride, TripCount);
302 // Round the fractional cost up to the nearest integer number.
303 // The impact is the most significant when cost is calculated
304 // to be a number less than one, because it makes more sense
305 // to say one cache line is used rather than zero cache line
306 // is used.
307 RefCost = SE.getUDivCeilSCEV(Numerator, CacheLineSize);
308
309 LLVM_DEBUG(dbgs().indent(4)
310 << "Access is consecutive: RefCost=(TripCount*Stride)/CLS="
311 << *RefCost << "\n");
312 } else {
313 // If the indexed reference is not 'consecutive' the cost is proportional to
314 // the trip count and the depth of the dimension which the subject loop
315 // subscript is accessing. We try to estimate this by multiplying the cost
316 // by the trip counts of loops corresponding to the inner dimensions. For
317 // example, given the indexed reference 'A[i][j][k]', and assuming the
318 // i-loop is in the innermost position, the cost would be equal to the
319 // iterations of the i-loop multiplied by iterations of the j-loop.
320 RefCost = TripCount;
321
322 int Index = getSubscriptIndex(L);
323 assert(Index >= 0 && "Could not locate a valid Index");
324
325 for (unsigned I = Index + 1; I < getNumSubscripts() - 1; ++I) {
326 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(getSubscript(I));
327 assert(AR && AR->getLoop() && "Expecting valid loop");
328 const SCEV *TripCount =
329 computeTripCount(*AR->getLoop(), *Sizes.back(), SE);
330 Type *WiderType = SE.getWiderType(RefCost->getType(), TripCount->getType());
331 RefCost = SE.getMulExpr(SE.getNoopOrZeroExtend(RefCost, WiderType),
332 SE.getNoopOrZeroExtend(TripCount, WiderType));
333 }
334
335 LLVM_DEBUG(dbgs().indent(4)
336 << "Access is not consecutive: RefCost=" << *RefCost << "\n");
337 }
338 assert(RefCost && "Expecting a valid RefCost");
339
340 // Attempt to fold RefCost into a constant.
341 if (auto ConstantCost = dyn_cast<SCEVConstant>(RefCost))
342 return ConstantCost->getValue()->getZExtValue();
343
344 LLVM_DEBUG(dbgs().indent(4)
345 << "RefCost is not a constant! Setting to RefCost=InvalidCost "
346 "(invalid value).\n");
347
349}
350
351bool IndexedReference::tryDelinearizeFixedSize(
352 const SCEV *AccessFn, SmallVectorImpl<const SCEV *> &Subscripts) {
353 SmallVector<int, 4> ArraySizes;
354 if (!tryDelinearizeFixedSizeImpl(&SE, &StoreOrLoadInst, AccessFn, Subscripts,
355 ArraySizes))
356 return false;
357
358 // Populate Sizes with scev expressions to be used in calculations later.
359 for (auto Idx : seq<unsigned>(1, Subscripts.size()))
360 Sizes.push_back(
361 SE.getConstant(Subscripts[Idx]->getType(), ArraySizes[Idx - 1]));
362
363 LLVM_DEBUG({
364 dbgs() << "Delinearized subscripts of fixed-size array\n"
365 << "GEP:" << *getLoadStorePointerOperand(&StoreOrLoadInst)
366 << "\n";
367 });
368 return true;
369}
370
371bool IndexedReference::delinearize(const LoopInfo &LI) {
372 assert(Subscripts.empty() && "Subscripts should be empty");
373 assert(Sizes.empty() && "Sizes should be empty");
374 assert(!IsValid && "Should be called once from the constructor");
375 LLVM_DEBUG(dbgs() << "Delinearizing: " << StoreOrLoadInst << "\n");
376
377 const SCEV *ElemSize = SE.getElementSize(&StoreOrLoadInst);
378 const BasicBlock *BB = StoreOrLoadInst.getParent();
379
380 if (Loop *L = LI.getLoopFor(BB)) {
381 const SCEV *AccessFn =
382 SE.getSCEVAtScope(getPointerOperand(&StoreOrLoadInst), L);
383
384 BasePointer = dyn_cast<SCEVUnknown>(SE.getPointerBase(AccessFn));
385 if (BasePointer == nullptr) {
387 dbgs().indent(2)
388 << "ERROR: failed to delinearize, can't identify base pointer\n");
389 return false;
390 }
391
392 bool IsFixedSize = false;
393 // Try to delinearize fixed-size arrays.
394 if (tryDelinearizeFixedSize(AccessFn, Subscripts)) {
395 IsFixedSize = true;
396 // The last element of Sizes is the element size.
397 Sizes.push_back(ElemSize);
398 LLVM_DEBUG(dbgs().indent(2) << "In Loop '" << L->getName()
399 << "', AccessFn: " << *AccessFn << "\n");
400 }
401
402 AccessFn = SE.getMinusSCEV(AccessFn, BasePointer);
403
404 // Try to delinearize parametric-size arrays.
405 if (!IsFixedSize) {
406 LLVM_DEBUG(dbgs().indent(2) << "In Loop '" << L->getName()
407 << "', AccessFn: " << *AccessFn << "\n");
408 llvm::delinearize(SE, AccessFn, Subscripts, Sizes,
409 SE.getElementSize(&StoreOrLoadInst));
410 }
411
412 if (Subscripts.empty() || Sizes.empty() ||
413 Subscripts.size() != Sizes.size()) {
414 // Attempt to determine whether we have a single dimensional array access.
415 // before giving up.
416 if (!isOneDimensionalArray(*AccessFn, *ElemSize, *L, SE)) {
417 LLVM_DEBUG(dbgs().indent(2)
418 << "ERROR: failed to delinearize reference\n");
419 Subscripts.clear();
420 Sizes.clear();
421 return false;
422 }
423
424 // The array may be accessed in reverse, for example:
425 // for (i = N; i > 0; i--)
426 // A[i] = 0;
427 // In this case, reconstruct the access function using the absolute value
428 // of the step recurrence.
429 const SCEVAddRecExpr *AccessFnAR = dyn_cast<SCEVAddRecExpr>(AccessFn);
430 const SCEV *StepRec = AccessFnAR ? AccessFnAR->getStepRecurrence(SE) : nullptr;
431
432 if (StepRec && SE.isKnownNegative(StepRec))
433 AccessFn = SE.getAddRecExpr(AccessFnAR->getStart(),
434 SE.getNegativeSCEV(StepRec),
435 AccessFnAR->getLoop(),
436 AccessFnAR->getNoWrapFlags());
437 const SCEV *Div = SE.getUDivExactExpr(AccessFn, ElemSize);
438 Subscripts.push_back(Div);
439 Sizes.push_back(ElemSize);
440 }
441
442 return all_of(Subscripts, [&](const SCEV *Subscript) {
443 return isSimpleAddRecurrence(*Subscript, *L);
444 });
445 }
446
447 return false;
448}
449
450bool IndexedReference::isLoopInvariant(const Loop &L) const {
451 Value *Addr = getPointerOperand(&StoreOrLoadInst);
452 assert(Addr != nullptr && "Expecting either a load or a store instruction");
453 assert(SE.isSCEVable(Addr->getType()) && "Addr should be SCEVable");
454
455 if (SE.isLoopInvariant(SE.getSCEV(Addr), &L))
456 return true;
457
458 // The indexed reference is loop invariant if none of the coefficients use
459 // the loop induction variable.
460 bool allCoeffForLoopAreZero = all_of(Subscripts, [&](const SCEV *Subscript) {
461 return isCoeffForLoopZeroOrInvariant(*Subscript, L);
462 });
463
464 return allCoeffForLoopAreZero;
465}
466
467bool IndexedReference::isConsecutive(const Loop &L, const SCEV *&Stride,
468 unsigned CLS) const {
469 // The indexed reference is 'consecutive' if the only coefficient that uses
470 // the loop induction variable is the last one...
471 const SCEV *LastSubscript = Subscripts.back();
472 for (const SCEV *Subscript : Subscripts) {
473 if (Subscript == LastSubscript)
474 continue;
475 if (!isCoeffForLoopZeroOrInvariant(*Subscript, L))
476 return false;
477 }
478
479 // ...and the access stride is less than the cache line size.
480 const SCEV *Coeff = getLastCoefficient();
481 const SCEV *ElemSize = Sizes.back();
482 Type *WiderType = SE.getWiderType(Coeff->getType(), ElemSize->getType());
483 // FIXME: This assumes that all values are signed integers which may
484 // be incorrect in unusual codes and incorrectly use sext instead of zext.
485 // for (uint32_t i = 0; i < 512; ++i) {
486 // uint8_t trunc = i;
487 // A[trunc] = 42;
488 // }
489 // This consecutively iterates twice over A. If `trunc` is sign-extended,
490 // we would conclude that this may iterate backwards over the array.
491 // However, LoopCacheAnalysis is heuristic anyway and transformations must
492 // not result in wrong optimizations if the heuristic was incorrect.
493 Stride = SE.getMulExpr(SE.getNoopOrSignExtend(Coeff, WiderType),
494 SE.getNoopOrSignExtend(ElemSize, WiderType));
495 const SCEV *CacheLineSize = SE.getConstant(Stride->getType(), CLS);
496
497 Stride = SE.isKnownNegative(Stride) ? SE.getNegativeSCEV(Stride) : Stride;
499}
500
501int IndexedReference::getSubscriptIndex(const Loop &L) const {
502 for (auto Idx : seq<int>(0, getNumSubscripts())) {
503 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(getSubscript(Idx));
504 if (AR && AR->getLoop() == &L) {
505 return Idx;
506 }
507 }
508 return -1;
509}
510
511const SCEV *IndexedReference::getLastCoefficient() const {
512 const SCEV *LastSubscript = getLastSubscript();
513 auto *AR = cast<SCEVAddRecExpr>(LastSubscript);
514 return AR->getStepRecurrence(SE);
515}
516
517bool IndexedReference::isCoeffForLoopZeroOrInvariant(const SCEV &Subscript,
518 const Loop &L) const {
519 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(&Subscript);
520 return (AR != nullptr) ? AR->getLoop() != &L
521 : SE.isLoopInvariant(&Subscript, &L);
522}
523
524bool IndexedReference::isSimpleAddRecurrence(const SCEV &Subscript,
525 const Loop &L) const {
526 if (!isa<SCEVAddRecExpr>(Subscript))
527 return false;
528
529 const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(&Subscript);
530 assert(AR->getLoop() && "AR should have a loop");
531
532 if (!AR->isAffine())
533 return false;
534
535 const SCEV *Start = AR->getStart();
536 const SCEV *Step = AR->getStepRecurrence(SE);
537
538 if (!SE.isLoopInvariant(Start, &L) || !SE.isLoopInvariant(Step, &L))
539 return false;
540
541 return true;
542}
543
544bool IndexedReference::isAliased(const IndexedReference &Other,
545 AAResults &AA) const {
546 const auto &Loc1 = MemoryLocation::get(&StoreOrLoadInst);
547 const auto &Loc2 = MemoryLocation::get(&Other.StoreOrLoadInst);
548 return AA.isMustAlias(Loc1, Loc2);
549}
550
551//===----------------------------------------------------------------------===//
552// CacheCost implementation
553//
555 for (const auto &LC : CC.LoopCosts) {
556 const Loop *L = LC.first;
557 OS << "Loop '" << L->getName() << "' has cost = " << LC.second << "\n";
558 }
559 return OS;
560}
561
564 AAResults &AA, DependenceInfo &DI,
565 std::optional<unsigned> TRT)
566 : Loops(Loops), TRT(TRT.value_or(TemporalReuseThreshold)), LI(LI), SE(SE),
567 TTI(TTI), AA(AA), DI(DI) {
568 assert(!Loops.empty() && "Expecting a non-empty loop vector.");
569
570 for (const Loop *L : Loops) {
571 unsigned TripCount = SE.getSmallConstantTripCount(L);
572 TripCount = (TripCount == 0) ? DefaultTripCount : TripCount;
573 TripCounts.push_back({L, TripCount});
574 }
575
576 calculateCacheFootprint();
577}
578
579std::unique_ptr<CacheCost>
581 DependenceInfo &DI, std::optional<unsigned> TRT) {
582 if (!Root.isOutermost()) {
583 LLVM_DEBUG(dbgs() << "Expecting the outermost loop in a loop nest\n");
584 return nullptr;
585 }
586
587 LoopVectorTy Loops;
589
590 if (!getInnerMostLoop(Loops)) {
591 LLVM_DEBUG(dbgs() << "Cannot compute cache cost of loop nest with more "
592 "than one innermost loop\n");
593 return nullptr;
594 }
595
596 return std::make_unique<CacheCost>(Loops, AR.LI, AR.SE, AR.TTI, AR.AA, DI, TRT);
597}
598
599void CacheCost::calculateCacheFootprint() {
600 LLVM_DEBUG(dbgs() << "POPULATING REFERENCE GROUPS\n");
601 ReferenceGroupsTy RefGroups;
602 if (!populateReferenceGroups(RefGroups))
603 return;
604
605 LLVM_DEBUG(dbgs() << "COMPUTING LOOP CACHE COSTS\n");
606 for (const Loop *L : Loops) {
608 LoopCosts,
609 [L](const LoopCacheCostTy &LCC) { return LCC.first == L; }) &&
610 "Should not add duplicate element");
611 CacheCostTy LoopCost = computeLoopCacheCost(*L, RefGroups);
612 LoopCosts.push_back(std::make_pair(L, LoopCost));
613 }
614
615 sortLoopCosts();
616 RefGroups.clear();
617}
618
619bool CacheCost::populateReferenceGroups(ReferenceGroupsTy &RefGroups) const {
620 assert(RefGroups.empty() && "Reference groups should be empty");
621
622 unsigned CLS = TTI.getCacheLineSize();
623 Loop *InnerMostLoop = getInnerMostLoop(Loops);
624 assert(InnerMostLoop != nullptr && "Expecting a valid innermost loop");
625
626 for (BasicBlock *BB : InnerMostLoop->getBlocks()) {
627 for (Instruction &I : *BB) {
628 if (!isa<StoreInst>(I) && !isa<LoadInst>(I))
629 continue;
630
631 std::unique_ptr<IndexedReference> R(new IndexedReference(I, LI, SE));
632 if (!R->isValid())
633 continue;
634
635 bool Added = false;
636 for (ReferenceGroupTy &RefGroup : RefGroups) {
637 const IndexedReference &Representative = *RefGroup.front();
638 LLVM_DEBUG({
639 dbgs() << "References:\n";
640 dbgs().indent(2) << *R << "\n";
641 dbgs().indent(2) << Representative << "\n";
642 });
643
644
645 // FIXME: Both positive and negative access functions will be placed
646 // into the same reference group, resulting in a bi-directional array
647 // access such as:
648 // for (i = N; i > 0; i--)
649 // A[i] = A[N - i];
650 // having the same cost calculation as a single dimention access pattern
651 // for (i = 0; i < N; i++)
652 // A[i] = A[i];
653 // when in actuality, depending on the array size, the first example
654 // should have a cost closer to 2x the second due to the two cache
655 // access per iteration from opposite ends of the array
656 std::optional<bool> HasTemporalReuse =
657 R->hasTemporalReuse(Representative, *TRT, *InnerMostLoop, DI, AA);
658 std::optional<bool> HasSpacialReuse =
659 R->hasSpacialReuse(Representative, CLS, AA);
660
661 if ((HasTemporalReuse && *HasTemporalReuse) ||
662 (HasSpacialReuse && *HasSpacialReuse)) {
663 RefGroup.push_back(std::move(R));
664 Added = true;
665 break;
666 }
667 }
668
669 if (!Added) {
671 RG.push_back(std::move(R));
672 RefGroups.push_back(std::move(RG));
673 }
674 }
675 }
676
677 if (RefGroups.empty())
678 return false;
679
680 LLVM_DEBUG({
681 dbgs() << "\nIDENTIFIED REFERENCE GROUPS:\n";
682 int n = 1;
683 for (const ReferenceGroupTy &RG : RefGroups) {
684 dbgs().indent(2) << "RefGroup " << n << ":\n";
685 for (const auto &IR : RG)
686 dbgs().indent(4) << *IR << "\n";
687 n++;
688 }
689 dbgs() << "\n";
690 });
691
692 return true;
693}
694
696CacheCost::computeLoopCacheCost(const Loop &L,
697 const ReferenceGroupsTy &RefGroups) const {
698 if (!L.isLoopSimplifyForm())
699 return InvalidCost;
700
701 LLVM_DEBUG(dbgs() << "Considering loop '" << L.getName()
702 << "' as innermost loop.\n");
703
704 // Compute the product of the trip counts of each other loop in the nest.
705 CacheCostTy TripCountsProduct = 1;
706 for (const auto &TC : TripCounts) {
707 if (TC.first == &L)
708 continue;
709 TripCountsProduct *= TC.second;
710 }
711
712 CacheCostTy LoopCost = 0;
713 for (const ReferenceGroupTy &RG : RefGroups) {
714 CacheCostTy RefGroupCost = computeRefGroupCacheCost(RG, L);
715 LoopCost += RefGroupCost * TripCountsProduct;
716 }
717
718 LLVM_DEBUG(dbgs().indent(2) << "Loop '" << L.getName()
719 << "' has cost=" << LoopCost << "\n");
720
721 return LoopCost;
722}
723
724CacheCostTy CacheCost::computeRefGroupCacheCost(const ReferenceGroupTy &RG,
725 const Loop &L) const {
726 assert(!RG.empty() && "Reference group should have at least one member.");
727
728 const IndexedReference *Representative = RG.front().get();
729 return Representative->computeRefCost(L, TTI.getCacheLineSize());
730}
731
732//===----------------------------------------------------------------------===//
733// LoopCachePrinterPass implementation
734//
737 LPMUpdater &U) {
738 Function *F = L.getHeader()->getParent();
739 DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI);
740
741 if (auto CC = CacheCost::getCacheCost(L, AR, DI))
742 OS << *CC;
743
744 return PreservedAnalyses::all();
745}
This file builds on the ADT/GraphTraits.h file to build a generic breadth first graph iterator.
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
uint64_t Size
Hexagon Hardware Loops
Legalize the Machine IR a function s Machine IR
Definition: Legalizer.cpp:81
static bool isOneDimensionalArray(const SCEV &AccessFn, const SCEV &ElemSize, const Loop &L, ScalarEvolution &SE)
static cl::opt< unsigned > TemporalReuseThreshold("temporal-reuse-threshold", cl::init(2), cl::Hidden, cl::desc("Use this to specify the max. distance between array elements " "accessed in a loop so that the elements are classified to have " "temporal reuse"))
static const SCEV * computeTripCount(const Loop &L, const SCEV &ElemSize, ScalarEvolution &SE)
Compute the trip count for the given loop L or assume a default value if it is not a compile time con...
static Loop * getInnerMostLoop(const LoopVectorTy &Loops)
Retrieve the innermost loop in the given loop nest Loops.
static cl::opt< unsigned > DefaultTripCount("default-trip-count", cl::init(100), cl::Hidden, cl::desc("Use this to specify the default trip count of a loop"))
This file defines the interface for the loop cache analysis.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
Provides some synthesis utilities to produce sequences of values.
This file defines the SmallVector class.
static cl::opt< unsigned > CacheLineSize("cache-line-size", cl::init(0), cl::Hidden, cl::desc("Use this to override the target cache line size when " "specified by the user."))
This pass exposes codegen information to IR-level passes.
bool isMustAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A trivial helper function to check to see if the specified pointers are must-alias.
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
CacheCost represents the estimated cost of a inner loop as the number of cache lines used by the memo...
CacheCost(const LoopVectorTy &Loops, const LoopInfo &LI, ScalarEvolution &SE, TargetTransformInfo &TTI, AAResults &AA, DependenceInfo &DI, std::optional< unsigned > TRT=std::nullopt)
Construct a CacheCost object for the loop nest described by Loops.
static std::unique_ptr< CacheCost > getCacheCost(Loop &Root, LoopStandardAnalysisResults &AR, DependenceInfo &DI, std::optional< unsigned > TRT=std::nullopt)
Create a CacheCost for the loop nest rooted by Root.
static CacheCostTy constexpr InvalidCost
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:782
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:206
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:161
DependenceInfo - This class is the main dependence-analysis driver.
std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool PossiblyLoopIndependent)
depends - Tests for a dependence between the Src and Dst instructions.
Represents a memory reference as a base pointer and a set of indexing operations.
CacheCostTy computeRefCost(const Loop &L, unsigned CLS) const
Compute the cost of the reference w.r.t.
const SCEV * getSubscript(unsigned SubNum) const
std::optional< bool > hasSpacialReuse(const IndexedReference &Other, unsigned CLS, AAResults &AA) const
Return true/false if the current object and the indexed reference Other are/aren't in the same cache ...
std::optional< bool > hasTemporalReuse(const IndexedReference &Other, unsigned MaxDistance, const Loop &L, DependenceInfo &DI, AAResults &AA) const
Return true if the current object and the indexed reference Other have distance smaller than MaxDista...
IndexedReference(Instruction &StoreOrLoadInst, const LoopInfo &LI, ScalarEvolution &SE)
Construct an indexed reference given a StoreOrLoadInst instruction.
const SCEV * getLastSubscript() const
size_t getNumSubscripts() const
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
bool isOutermost() const
Return true if the loop does not have a parent (natural) loop.
unsigned getLoopDepth() const
Return the nesting level of this loop.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
This node represents a polynomial recurrence on the trip count of the specified loop.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents a constant integer value.
ConstantInt * getValue() const
NoWrapFlags getNoWrapFlags(NoWrapFlags Mask=NoWrapMask) const
This class represents an analyzed expression in the program.
Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
const SCEV * getUDivCeilSCEV(const SCEV *N, const SCEV *D)
Compute ceil(N / D).
Type * getWiderType(Type *Ty1, Type *Ty2) const
bool isKnownNegative(const SCEV *S)
Test if the given expression is known to be negative.
const SCEV * getSCEVAtScope(const SCEV *S, const Loop *L)
Return a SCEV expression for the specified value at the specified scope in the program.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
const SCEV * getAddRecExpr(const SCEV *Start, const SCEV *Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
const SCEV * getNoopOrAnyExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getNoopOrZeroExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getElementSize(Instruction *Inst)
Return the size of an element read or written by Inst.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
LLVM Value Representation.
Definition: Value.h:74
const ParentTy * getParent() const
Definition: ilist_node.h:32
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2098
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition: STLExtras.h:1909
iterator_range< bf_iterator< T > > breadth_first(const T &G)
@ Other
Any other memory.
bool tryDelinearizeFixedSizeImpl(ScalarEvolution *SE, Instruction *Inst, const SCEV *AccessFn, SmallVectorImpl< const SCEV * > &Subscripts, SmallVectorImpl< int > &Sizes)
Implementation of fixed size array delinearization.
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition: APFixedPoint.h:292
void delinearize(ScalarEvolution &SE, const SCEV *Expr, SmallVectorImpl< const SCEV * > &Subscripts, SmallVectorImpl< const SCEV * > &Sizes, const SCEV *ElementSize)
Split this SCEVAddRecExpr into two vectors of SCEVs representing the subscripts and sizes of an array...
int64_t CacheCostTy
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...