LoopAccessAnalysis.cpp
1//===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation --------==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// The implementation of the loop memory dependence analysis that was
10// originally developed for the loop vectorizer.
11//
12//===----------------------------------------------------------------------===//
13
15#include "llvm/ADT/APInt.h"
16#include "llvm/ADT/DenseMap.h"
19#include "llvm/ADT/STLExtras.h"
20#include "llvm/ADT/SetVector.h"
22#include "llvm/ADT/SmallSet.h"
36#include "llvm/IR/BasicBlock.h"
37#include "llvm/IR/Constants.h"
38#include "llvm/IR/DataLayout.h"
39#include "llvm/IR/DebugLoc.h"
42#include "llvm/IR/Dominators.h"
43#include "llvm/IR/Function.h"
45#include "llvm/IR/InstrTypes.h"
46#include "llvm/IR/Instruction.h"
48#include "llvm/IR/Operator.h"
49#include "llvm/IR/PassManager.h"
51#include "llvm/IR/Type.h"
52#include "llvm/IR/Value.h"
53#include "llvm/IR/ValueHandle.h"
56#include "llvm/Support/Debug.h"
59#include <algorithm>
60#include <cassert>
61#include <cstdint>
62#include <iterator>
63#include <utility>
64#include <variant>
65#include <vector>
66
67using namespace llvm;
68using namespace llvm::PatternMatch;
69
70#define DEBUG_TYPE "loop-accesses"
71
72static cl::opt<unsigned, true>
73VectorizationFactor("force-vector-width", cl::Hidden,
74 cl::desc("Sets the SIMD width. Zero is autoselect."),
75 cl::location(VectorizerParams::VectorizationFactor));
76unsigned VectorizerParams::VectorizationFactor;
77
78static cl::opt<unsigned, true>
79VectorizationInterleave("force-vector-interleave", cl::Hidden,
80 cl::desc("Sets the vectorization interleave count. "
81 "Zero is autoselect."),
82 cl::location(
83 VectorizerParams::VectorizationInterleave));
84unsigned VectorizerParams::VectorizationInterleave;
85
86static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold(
87 "runtime-memory-check-threshold", cl::Hidden,
88 cl::desc("When performing memory disambiguation checks at runtime do not "
89 "generate more than this number of comparisons (default = 8)."),
90 cl::location(VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(8));
91unsigned VectorizerParams::RuntimeMemoryCheckThreshold;
92
93/// The maximum iterations used to merge memory checks
94static cl::opt<unsigned> MemoryCheckMergeThreshold(
95 "memory-check-merge-threshold", cl::Hidden,
96 cl::desc("Maximum number of comparisons done when trying to merge "
97 "runtime memory checks. (default = 100)"),
98 cl::init(100));
99
100/// Maximum SIMD width.
101const unsigned VectorizerParams::MaxVectorWidth = 64;
102
103/// We collect dependences up to this threshold.
104static cl::opt<unsigned>
105 MaxDependences("max-dependences", cl::Hidden,
106 cl::desc("Maximum number of dependences collected by "
107 "loop-access analysis (default = 100)"),
108 cl::init(100));
109
110/// This enables versioning on the strides of symbolically striding memory
111/// accesses in code like the following.
112/// for (i = 0; i < N; ++i)
113/// A[i * Stride1] += B[i * Stride2] ...
114///
115/// Will be roughly translated to
116/// if (Stride1 == 1 && Stride2 == 1) {
117/// for (i = 0; i < N; i+=4)
118/// A[i:i+3] += ...
119/// } else
120/// ...
122 "enable-mem-access-versioning", cl::init(true), cl::Hidden,
123 cl::desc("Enable symbolic stride memory access versioning"));
124
125/// Enable store-to-load forwarding conflict detection. This option can
126/// be disabled for correctness testing.
128 "store-to-load-forwarding-conflict-detection", cl::Hidden,
129 cl::desc("Enable conflict detection in loop-access analysis"),
130 cl::init(true));
131
133 "max-forked-scev-depth", cl::Hidden,
134 cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"),
135 cl::init(5));
136
138 "laa-speculate-unit-stride", cl::Hidden,
139 cl::desc("Speculate that non-constant strides are unit in LAA"),
140 cl::init(true));
141
143 "hoist-runtime-checks", cl::Hidden,
144 cl::desc(
145 "Hoist inner loop runtime memory checks to outer loop if possible"),
146 cl::location(VectorizerParams::HoistRuntimeChecks), cl::init(true));
147bool VectorizerParams::HoistRuntimeChecks;
148
149bool VectorizerParams::isInterleaveForced() {
150 return ::VectorizationInterleave.getNumOccurrences() > 0;
151}
152
153const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
154 const DenseMap<Value *, const SCEV *> &PtrToStride,
155 Value *Ptr) {
156 const SCEV *OrigSCEV = PSE.getSCEV(Ptr);
157
158 // If there is an entry in the map return the SCEV of the pointer with the
159 // symbolic stride replaced by one.
160 DenseMap<Value *, const SCEV *>::const_iterator SI = PtrToStride.find(Ptr);
161 if (SI == PtrToStride.end())
162 // For a non-symbolic stride, just return the original expression.
163 return OrigSCEV;
164
165 const SCEV *StrideSCEV = SI->second;
166 // Note: This assert is both overly strong and overly weak. The actual
167 // invariant here is that StrideSCEV should be loop invariant. The only
168 // such invariant strides we happen to speculate right now are unknowns
169 // and thus this is a reasonable proxy of the actual invariant.
170 assert(isa<SCEVUnknown>(StrideSCEV) && "shouldn't be in map");
171
172 ScalarEvolution *SE = PSE.getSE();
173 const auto *CT = SE->getOne(StrideSCEV->getType());
174 PSE.addPredicate(*SE->getEqualPredicate(StrideSCEV, CT));
175 auto *Expr = PSE.getSCEV(Ptr);
176
177 LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
178 << " by: " << *Expr << "\n");
179 return Expr;
180}
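// Illustrative example (editor's sketch, values assumed, not from the original
// source): for a loop accessing A[i * Stride] on i32 elements where %Stride is
// loop invariant but symbolic, the original pointer SCEV is roughly
// {%A,+,(4 * %Stride)}<%loop>. Once the predicate "%Stride == 1" has been
// added above, the rewritten SCEV becomes {%A,+,4}<%loop>, i.e. a unit-stride
// access the vectorizer can reason about.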
181
182RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
183 unsigned Index, RuntimePointerChecking &RtCheck)
184 : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start),
185 AddressSpace(RtCheck.Pointers[Index]
186 .PointerValue->getType()
187 ->getPointerAddressSpace()),
188 NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) {
189 Members.push_back(Index);
190}
191
192/// Calculate Start and End points of memory access.
193/// Let's assume A is the first access and B is a memory access on the N-th
194/// loop iteration. Then B is calculated as:
195/// B = A + Step*N .
196/// Step value may be positive or negative.
197/// N is a calculated back-edge taken count:
198/// N = (TripCount > 0) ? RoundDown(TripCount - 1, VF) : 0
199/// Start and End points are calculated in the following way:
200/// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
201/// where SizeOfElt is the size of a single memory access in bytes.
202///
203/// There is no conflict when the intervals are disjoint:
204/// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
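/// Illustrative example (values assumed for exposition): for an access A[i]
/// on i32 elements with A = %base and Step = 4, a TripCount of 101 and VF = 4
/// give N = RoundDown(100, 4) = 100, so B = %base + 400. The interval is then
/// Start = UMIN(A, B) = %base and End = UMAX(A, B) + 4 = %base + 404.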
205void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
206 Type *AccessTy, bool WritePtr,
207 unsigned DepSetId, unsigned ASId,
208 PredicatedScalarEvolution &PSE,
209 bool NeedsFreeze) {
210 ScalarEvolution *SE = PSE.getSE();
211
212 const SCEV *ScStart;
213 const SCEV *ScEnd;
214
215 if (SE->isLoopInvariant(PtrExpr, Lp)) {
216 ScStart = ScEnd = PtrExpr;
217 } else {
218 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr);
219 assert(AR && "Invalid addrec expression");
220 const SCEV *Ex = PSE.getBackedgeTakenCount();
221
222 ScStart = AR->getStart();
223 ScEnd = AR->evaluateAtIteration(Ex, *SE);
224 const SCEV *Step = AR->getStepRecurrence(*SE);
225
226 // For expressions with negative step, the upper bound is ScStart and the
227 // lower bound is ScEnd.
228 if (const auto *CStep = dyn_cast<SCEVConstant>(Step)) {
229 if (CStep->getValue()->isNegative())
230 std::swap(ScStart, ScEnd);
231 } else {
232 // Fallback case: the step is not constant, but we can still
233 // get the upper and lower bounds of the interval by using min/max
234 // expressions.
235 ScStart = SE->getUMinExpr(ScStart, ScEnd);
236 ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd);
237 }
238 }
239 assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
240 assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");
241
242 // Add the size of the pointed element to ScEnd.
243 auto &DL = Lp->getHeader()->getModule()->getDataLayout();
244 Type *IdxTy = DL.getIndexType(Ptr->getType());
245 const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy);
246 ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);
247
248 Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, PtrExpr,
249 NeedsFreeze);
250}
251
252void RuntimePointerChecking::tryToCreateDiffCheck(
253 const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) {
254 if (!CanUseDiffCheck)
255 return;
256
257 // If either group contains multiple different pointers, bail out.
258 // TODO: Support multiple pointers by using the minimum or maximum pointer,
259 // depending on src & sink.
260 if (CGI.Members.size() != 1 || CGJ.Members.size() != 1) {
261 CanUseDiffCheck = false;
262 return;
263 }
264
265 PointerInfo *Src = &Pointers[CGI.Members[0]];
266 PointerInfo *Sink = &Pointers[CGJ.Members[0]];
267
268 // If either pointer is read and written, multiple checks may be needed. Bail
269 // out.
270 if (!DC.getOrderForAccess(Src->PointerValue, !Src->IsWritePtr).empty() ||
271 !DC.getOrderForAccess(Sink->PointerValue, !Sink->IsWritePtr).empty()) {
272 CanUseDiffCheck = false;
273 return;
274 }
275
276 ArrayRef<unsigned> AccSrc =
277 DC.getOrderForAccess(Src->PointerValue, Src->IsWritePtr);
278 ArrayRef<unsigned> AccSink =
279 DC.getOrderForAccess(Sink->PointerValue, Sink->IsWritePtr);
280 // If either pointer is accessed multiple times, there may not be a clear
281 // src/sink relation. Bail out for now.
282 if (AccSrc.size() != 1 || AccSink.size() != 1) {
283 CanUseDiffCheck = false;
284 return;
285 }
286 // If the sink is accessed before src, swap src/sink.
287 if (AccSink[0] < AccSrc[0])
288 std::swap(Src, Sink);
289
290 auto *SrcAR = dyn_cast<SCEVAddRecExpr>(Src->Expr);
291 auto *SinkAR = dyn_cast<SCEVAddRecExpr>(Sink->Expr);
292 if (!SrcAR || !SinkAR || SrcAR->getLoop() != DC.getInnermostLoop() ||
293 SinkAR->getLoop() != DC.getInnermostLoop()) {
294 CanUseDiffCheck = false;
295 return;
296 }
297
298 SmallVector<Instruction *, 4> SrcInsts =
299 DC.getInstructionsForAccess(Src->PointerValue, Src->IsWritePtr);
300 SmallVector<Instruction *, 4> SinkInsts =
301 DC.getInstructionsForAccess(Sink->PointerValue, Sink->IsWritePtr);
302 Type *SrcTy = getLoadStoreType(SrcInsts[0]);
303 Type *DstTy = getLoadStoreType(SinkInsts[0]);
304 if (isa<ScalableVectorType>(SrcTy) || isa<ScalableVectorType>(DstTy)) {
305 CanUseDiffCheck = false;
306 return;
307 }
308 const DataLayout &DL =
309 SinkAR->getLoop()->getHeader()->getModule()->getDataLayout();
310 unsigned AllocSize =
311 std::max(DL.getTypeAllocSize(SrcTy), DL.getTypeAllocSize(DstTy));
312
313 // Only matching constant steps matching the AllocSize are supported at the
314 // moment. This simplifies the difference computation. Can be extended in the
315 // future.
316 auto *Step = dyn_cast<SCEVConstant>(SinkAR->getStepRecurrence(*SE));
317 if (!Step || Step != SrcAR->getStepRecurrence(*SE) ||
318 Step->getAPInt().abs() != AllocSize) {
319 CanUseDiffCheck = false;
320 return;
321 }
322
323 IntegerType *IntTy =
324 IntegerType::get(Src->PointerValue->getContext(),
325 DL.getPointerSizeInBits(CGI.AddressSpace));
326
327 // When counting down, the dependence distance needs to be swapped.
328 if (Step->getValue()->isNegative())
329 std::swap(SinkAR, SrcAR);
330
331 const SCEV *SinkStartInt = SE->getPtrToIntExpr(SinkAR->getStart(), IntTy);
332 const SCEV *SrcStartInt = SE->getPtrToIntExpr(SrcAR->getStart(), IntTy);
333 if (isa<SCEVCouldNotCompute>(SinkStartInt) ||
334 isa<SCEVCouldNotCompute>(SrcStartInt)) {
335 CanUseDiffCheck = false;
336 return;
337 }
338
339 const Loop *InnerLoop = SrcAR->getLoop();
340 // If the start values for both Src and Sink also vary according to an outer
341 // loop, then it's probably better to avoid creating diff checks because
342 // they may not be hoisted. We should instead let llvm::addRuntimeChecks
343 // do the expanded full range overlap checks, which can be hoisted.
344 if (HoistRuntimeChecks && InnerLoop->getParentLoop() &&
345 isa<SCEVAddRecExpr>(SinkStartInt) && isa<SCEVAddRecExpr>(SrcStartInt)) {
346 auto *SrcStartAR = cast<SCEVAddRecExpr>(SrcStartInt);
347 auto *SinkStartAR = cast<SCEVAddRecExpr>(SinkStartInt);
348 const Loop *StartARLoop = SrcStartAR->getLoop();
349 if (StartARLoop == SinkStartAR->getLoop() &&
350 StartARLoop == InnerLoop->getParentLoop() &&
351 // If the diff check would already be loop invariant (due to the
352 // recurrences being the same), then we prefer to keep the diff checks
353 // because they are cheaper.
354 SrcStartAR->getStepRecurrence(*SE) !=
355 SinkStartAR->getStepRecurrence(*SE)) {
356 LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these "
357 "cannot be hoisted out of the outer loop\n");
358 CanUseDiffCheck = false;
359 return;
360 }
361 }
362
363 LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
364 << "SrcStart: " << *SrcStartInt << '\n'
365 << "SinkStartInt: " << *SinkStartInt << '\n');
366 DiffChecks.emplace_back(SrcStartInt, SinkStartInt, AllocSize,
367 Src->NeedsFreeze || Sink->NeedsFreeze);
368}
369
370SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() {
371 SmallVector<RuntimePointerCheck, 4> Checks;
372
373 for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
374 for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
375 const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I];
376 const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J];
377
378 if (needsChecking(CGI, CGJ)) {
379 tryToCreateDiffCheck(CGI, CGJ);
380 Checks.push_back(std::make_pair(&CGI, &CGJ));
381 }
382 }
383 }
384 return Checks;
385}
386
387void RuntimePointerChecking::generateChecks(
388 MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
389 assert(Checks.empty() && "Checks is not empty");
390 groupChecks(DepCands, UseDependencies);
391 Checks = generateChecks();
392}
393
394bool RuntimePointerChecking::needsChecking(
395 const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const {
396 for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I)
397 for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J)
398 if (needsChecking(M.Members[I], N.Members[J]))
399 return true;
400 return false;
401}
402
403/// Compare \p I and \p J and return the minimum.
404/// Return nullptr in case we couldn't find an answer.
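/// For example (illustrative): with I = (%a + 16) and J = %a, Diff = J - I is
/// the constant -16, which is negative, so J is returned as the minimum. If
/// the difference is not a compile-time constant, nullptr is returned.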
405static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
406 ScalarEvolution *SE) {
407 const SCEV *Diff = SE->getMinusSCEV(J, I);
408 const SCEVConstant *C = dyn_cast<const SCEVConstant>(Diff);
409
410 if (!C)
411 return nullptr;
412 if (C->getValue()->isNegative())
413 return J;
414 return I;
415}
416
417bool RuntimeCheckingPtrGroup::addPointer(unsigned Index,
418 RuntimePointerChecking &RtCheck) {
419 return addPointer(
420 Index, RtCheck.Pointers[Index].Start, RtCheck.Pointers[Index].End,
421 RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(),
422 RtCheck.Pointers[Index].NeedsFreeze, *RtCheck.SE);
423}
424
425bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start,
426 const SCEV *End, unsigned AS,
427 bool NeedsFreeze,
428 ScalarEvolution &SE) {
429 assert(AddressSpace == AS &&
430 "all pointers in a checking group must be in the same address space");
431
432 // Compare the starts and ends with the known minimum and maximum
433 // of this set. We need to know how we compare against the min/max
434 // of the set in order to be able to emit memchecks.
435 const SCEV *Min0 = getMinFromExprs(Start, Low, &SE);
436 if (!Min0)
437 return false;
438
439 const SCEV *Min1 = getMinFromExprs(End, High, &SE);
440 if (!Min1)
441 return false;
442
443 // Update the low bound expression if we've found a new min value.
444 if (Min0 == Start)
445 Low = Start;
446
447 // Update the high bound expression if we've found a new max value.
448 if (Min1 != End)
449 High = End;
450
451 Members.push_back(Index);
452 this->NeedsFreeze |= NeedsFreeze;
453 return true;
454}
455
456void RuntimePointerChecking::groupChecks(
457 MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
458 // We build the groups from dependency candidates equivalence classes
459 // because:
460 // - We know that pointers in the same equivalence class share
461 // the same underlying object and therefore there is a chance
462 // that we can compare pointers
463 // - We wouldn't be able to merge two pointers for which we need
464 // to emit a memcheck. The classes in DepCands are already
465 // conveniently built such that no two pointers in the same
466 // class need checking against each other.
467
468 // We use the following (greedy) algorithm to construct the groups
469 // For every pointer in the equivalence class:
470 // For each existing group:
471 // - if the difference between this pointer and the min/max bounds
472 // of the group is a constant, then make the pointer part of the
473 // group and update the min/max bounds of that group as required.
474
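 // Illustrative example (offsets assumed for exposition): if a group currently
 // covers [%p, %p + 8) and we add a pointer with bounds [%p + 4, %p + 12), the
 // difference to the existing bounds is the constant 4, so the pointer joins
 // the group and the bounds widen to [%p, %p + 12). A pointer whose distance
 // to the bounds is not a constant starts a group of its own.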
475 CheckingGroups.clear();
476
477 // If we need to check two pointers to the same underlying object
478 // with a non-constant difference, we shouldn't perform any pointer
479 // grouping with those pointers. This is because we can easily get
480 // into cases where the resulting check would return false, even when
481 // the accesses are safe.
482 //
483 // The following example shows this:
484 // for (i = 0; i < 1000; ++i)
485 // a[5000 + i * m] = a[i] + a[i + 9000]
486 //
487 // Here grouping gives a check of (5000, 5000 + 1000 * m) against
488 // (0, 10000) which is always false. However, if m is 1, there is no
489 // dependence. Not grouping the checks for a[i] and a[i + 9000] allows
490 // us to perform an accurate check in this case.
491 //
492 // The above case requires that we have an UnknownDependence between
493 // accesses to the same underlying object. This cannot happen unless
494 // FoundNonConstantDistanceDependence is set, and therefore UseDependencies
495 // is also false. In this case we will use the fallback path and create
496 // separate checking groups for all pointers.
497
498 // If we don't have the dependency partitions, construct a new
499 // checking pointer group for each pointer. This is also required
500 // for correctness, because in this case we can have checking between
501 // pointers to the same underlying object.
502 if (!UseDependencies) {
503 for (unsigned I = 0; I < Pointers.size(); ++I)
504 CheckingGroups.push_back(RuntimeCheckingPtrGroup(I, *this));
505 return;
506 }
507
508 unsigned TotalComparisons = 0;
509
510 DenseMap<Value *, SmallVector<unsigned>> PositionMap;
511 for (unsigned Index = 0; Index < Pointers.size(); ++Index) {
512 auto Iter = PositionMap.insert({Pointers[Index].PointerValue, {}});
513 Iter.first->second.push_back(Index);
514 }
515
516 // We need to keep track of what pointers we've already seen so we
517 // don't process them twice.
518 SmallSet<unsigned, 2> Seen;
519
520 // Go through all equivalence classes, get the "pointer check groups"
521 // and add them to the overall solution. We use the order in which accesses
522 // appear in 'Pointers' to enforce determinism.
523 for (unsigned I = 0; I < Pointers.size(); ++I) {
524 // We've seen this pointer before, and therefore already processed
525 // its equivalence class.
526 if (Seen.count(I))
527 continue;
528
529 MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
530 Pointers[I].IsWritePtr);
531
532 SmallVector<RuntimeCheckingPtrGroup, 2> Groups;
533 auto LeaderI = DepCands.findValue(DepCands.getLeaderValue(Access));
534
535 // Because DepCands is constructed by visiting accesses in the order in
536 // which they appear in alias sets (which is deterministic) and the
537 // iteration order within an equivalence class member is only dependent on
538 // the order in which unions and insertions are performed on the
539 // equivalence class, the iteration order is deterministic.
540 for (auto MI = DepCands.member_begin(LeaderI), ME = DepCands.member_end();
541 MI != ME; ++MI) {
542 auto PointerI = PositionMap.find(MI->getPointer());
543 assert(PointerI != PositionMap.end() &&
544 "pointer in equivalence class not found in PositionMap");
545 for (unsigned Pointer : PointerI->second) {
546 bool Merged = false;
547 // Mark this pointer as seen.
548 Seen.insert(Pointer);
549
550 // Go through all the existing sets and see if we can find one
551 // which can include this pointer.
552 for (RuntimeCheckingPtrGroup &Group : Groups) {
553 // Don't perform more than a certain amount of comparisons.
554 // This should limit the cost of grouping the pointers to something
555 // reasonable. If we do end up hitting this threshold, the algorithm
556 // will create separate groups for all remaining pointers.
557 if (TotalComparisons > MemoryCheckMergeThreshold)
558 break;
559
560 TotalComparisons++;
561
562 if (Group.addPointer(Pointer, *this)) {
563 Merged = true;
564 break;
565 }
566 }
567
568 if (!Merged)
569 // We couldn't add this pointer to any existing set or the threshold
570 // for the number of comparisons has been reached. Create a new group
571 // to hold the current pointer.
572 Groups.push_back(RuntimeCheckingPtrGroup(Pointer, *this));
573 }
574 }
575
576 // We've computed the grouped checks for this partition.
577 // Save the results and continue with the next one.
578 llvm::copy(Groups, std::back_inserter(CheckingGroups));
579 }
580}
581
583 const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1,
584 unsigned PtrIdx2) {
585 return (PtrToPartition[PtrIdx1] != -1 &&
586 PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]);
587}
588
589bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
590 const PointerInfo &PointerI = Pointers[I];
591 const PointerInfo &PointerJ = Pointers[J];
592
593 // No need to check if two readonly pointers intersect.
594 if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
595 return false;
596
597 // Only need to check pointers between two different dependency sets.
598 if (PointerI.DependencySetId == PointerJ.DependencySetId)
599 return false;
600
601 // Only need to check pointers in the same alias set.
602 if (PointerI.AliasSetId != PointerJ.AliasSetId)
603 return false;
604
605 return true;
606}
607
608void RuntimePointerChecking::printChecks(
609 raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks,
610 unsigned Depth) const {
611 unsigned N = 0;
612 for (const auto &Check : Checks) {
613 const auto &First = Check.first->Members, &Second = Check.second->Members;
614
615 OS.indent(Depth) << "Check " << N++ << ":\n";
616
617 OS.indent(Depth + 2) << "Comparing group (" << Check.first << "):\n";
618 for (unsigned K = 0; K < First.size(); ++K)
619 OS.indent(Depth + 2) << *Pointers[First[K]].PointerValue << "\n";
620
621 OS.indent(Depth + 2) << "Against group (" << Check.second << "):\n";
622 for (unsigned K = 0; K < Second.size(); ++K)
623 OS.indent(Depth + 2) << *Pointers[Second[K]].PointerValue << "\n";
624 }
625}
626
627void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const {
628
629 OS.indent(Depth) << "Run-time memory checks:\n";
630 printChecks(OS, Checks, Depth);
631
632 OS.indent(Depth) << "Grouped accesses:\n";
633 for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
634 const auto &CG = CheckingGroups[I];
635
636 OS.indent(Depth + 2) << "Group " << &CG << ":\n";
637 OS.indent(Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High
638 << ")\n";
639 for (unsigned J = 0; J < CG.Members.size(); ++J) {
640 OS.indent(Depth + 6) << "Member: " << *Pointers[CG.Members[J]].Expr
641 << "\n";
642 }
643 }
644}
645
646namespace {
647
648/// Analyses memory accesses in a loop.
649///
650/// Checks whether run time pointer checks are needed and builds sets for data
651/// dependence checking.
652class AccessAnalysis {
653public:
654 /// Read or write access location.
655 typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
656 typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
657
658 AccessAnalysis(Loop *TheLoop, AAResults *AA, LoopInfo *LI,
659 MemoryDepChecker::DepCandidates &DA,
660 PredicatedScalarEvolution &PSE,
661 SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
662 : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE),
663 LoopAliasScopes(LoopAliasScopes) {
664 // We're analyzing dependences across loop iterations.
665 BAA.enableCrossIterationMode();
666 }
667
668 /// Register a load and whether it is only read from.
669 void addLoad(MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
670 Value *Ptr = const_cast<Value *>(Loc.Ptr);
671 AST.add(adjustLoc(Loc));
672 Accesses[MemAccessInfo(Ptr, false)].insert(AccessTy);
673 if (IsReadOnly)
674 ReadOnlyPtr.insert(Ptr);
675 }
676
677 /// Register a store.
678 void addStore(MemoryLocation &Loc, Type *AccessTy) {
679 Value *Ptr = const_cast<Value *>(Loc.Ptr);
680 AST.add(adjustLoc(Loc));
681 Accesses[MemAccessInfo(Ptr, true)].insert(AccessTy);
682 }
683
684 /// Check if we can emit a run-time no-alias check for \p Access.
685 ///
686 /// Returns true if we can emit a run-time no alias check for \p Access.
687 /// If we can check this access, this also adds it to a dependence set and
688 /// adds a run-time check for it to \p RtCheck. If \p Assume is true,
689 /// we will attempt to use additional run-time checks in order to get
690 /// the bounds of the pointer.
691 bool createCheckForAccess(RuntimePointerChecking &RtCheck,
692 MemAccessInfo Access, Type *AccessTy,
693 const DenseMap<Value *, const SCEV *> &Strides,
694 DenseMap<Value *, unsigned> &DepSetId,
695 Loop *TheLoop, unsigned &RunningDepId,
696 unsigned ASId, bool ShouldCheckStride, bool Assume);
697
698 /// Check whether we can check the pointers at runtime for
699 /// non-intersection.
700 ///
701 /// Returns true if we need no check or if we do and we can generate them
702 /// (i.e. the pointers have computable bounds).
703 bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, ScalarEvolution *SE,
704 Loop *TheLoop, const DenseMap<Value *, const SCEV *> &Strides,
705 Value *&UncomputablePtr, bool ShouldCheckWrap = false);
706
707 /// Goes over all memory accesses, checks whether a RT check is needed
708 /// and builds sets of dependent accesses.
709 void buildDependenceSets() {
710 processMemAccesses();
711 }
712
713 /// Initial processing of memory accesses determined that we need to
714 /// perform dependency checking.
715 ///
716 /// Note that this can later be cleared if we retry memcheck analysis without
717 /// dependency checking (i.e. FoundNonConstantDistanceDependence).
718 bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
719
720 /// We decided that no dependence analysis would be used. Reset the state.
721 void resetDepChecks(MemoryDepChecker &DepChecker) {
722 CheckDeps.clear();
723 DepChecker.clearDependences();
724 }
725
726 MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; }
727
728 const DenseMap<Value *, SmallVector<const Value *, 16>> &
729 getUnderlyingObjects() {
730 return UnderlyingObjects;
731 }
732
733private:
734 typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;
735
736 /// Adjust the MemoryLocation so that it represents accesses to this
737 /// location across all iterations, rather than a single one.
738 MemoryLocation adjustLoc(MemoryLocation Loc) const {
739 // The accessed location varies within the loop, but remains within the
740 // underlying object.
741 Loc.Size = LocationSize::beforeOrAfterPointer();
742 Loc.AATags.Scope = adjustAliasScopeList(Loc.AATags.Scope);
743 Loc.AATags.NoAlias = adjustAliasScopeList(Loc.AATags.NoAlias);
744 return Loc;
745 }
746
747 /// Drop alias scopes that are only valid within a single loop iteration.
748 MDNode *adjustAliasScopeList(MDNode *ScopeList) const {
749 if (!ScopeList)
750 return nullptr;
751
752 // For the sake of simplicity, drop the whole scope list if any scope is
753 // iteration-local.
754 if (any_of(ScopeList->operands(), [&](Metadata *Scope) {
755 return LoopAliasScopes.contains(cast<MDNode>(Scope));
756 }))
757 return nullptr;
758
759 return ScopeList;
760 }
761
762 /// Go over all memory accesses and check whether runtime pointer checks
763 /// are needed and build sets of dependency check candidates.
764 void processMemAccesses();
765
766 /// Map of all accesses. Values are the types used to access memory pointed to
767 /// by the pointer.
768 PtrAccessMap Accesses;
769
770 /// The loop being checked.
771 const Loop *TheLoop;
772
773 /// List of accesses that need a further dependence check.
774 MemAccessInfoList CheckDeps;
775
776 /// Set of pointers that are read only.
777 SmallPtrSet<Value*, 16> ReadOnlyPtr;
778
779 /// Batched alias analysis results.
780 BatchAAResults BAA;
781
782 /// An alias set tracker to partition the access set by underlying object and
783 /// intrinsic property (such as TBAA metadata).
784 AliasSetTracker AST;
785
786 LoopInfo *LI;
787
788 /// Sets of potentially dependent accesses - members of one set share an
789 /// underlying pointer. The set "CheckDeps" identifies which sets really need a
790 /// dependence check.
791 MemoryDepChecker::DepCandidates &DepCands;
792
793 /// Initial processing of memory accesses determined that we may need
794 /// to add memchecks. Perform the analysis to determine the necessary checks.
795 ///
796 /// Note that this is different from isDependencyCheckNeeded. When we retry
797 /// memcheck analysis without dependency checking
798 /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is
799 /// cleared while this remains set if we have potentially dependent accesses.
800 bool IsRTCheckAnalysisNeeded = false;
801
802 /// The SCEV predicate containing all the SCEV-related assumptions.
803 PredicatedScalarEvolution &PSE;
804
805 DenseMap<Value *, SmallVector<const Value *, 16>> UnderlyingObjects;
806
807 /// Alias scopes that are declared inside the loop, and as such not valid
808 /// across iterations.
809 SmallPtrSetImpl<MDNode *> &LoopAliasScopes;
810};
811
812} // end anonymous namespace
813
814/// Check whether a pointer can participate in a runtime bounds check.
815/// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr
816/// by adding run-time checks (overflow checks) if necessary.
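/// For instance (illustrative): a pointer with the affine SCEV
/// {%base,+,4}<%loop> has computable bounds, while a pointer whose SCEV is a
/// plain SCEVUnknown that is not loop invariant does not, unless \p Assume
/// lets PSE turn it into an add-recurrence by adding a predicate.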
817static bool hasComputableBounds(PredicatedScalarEvolution &PSE, Value *Ptr,
818 const SCEV *PtrScev, Loop *L, bool Assume) {
819 // The bounds for a loop-invariant pointer are trivial.
820 if (PSE.getSE()->isLoopInvariant(PtrScev, L))
821 return true;
822
823 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
824
825 if (!AR && Assume)
826 AR = PSE.getAsAddRec(Ptr);
827
828 if (!AR)
829 return false;
830
831 return AR->isAffine();
832}
833
834/// Check whether a pointer address cannot wrap.
835static bool isNoWrap(PredicatedScalarEvolution &PSE,
836 const DenseMap<Value *, const SCEV *> &Strides, Value *Ptr, Type *AccessTy,
837 Loop *L) {
838 const SCEV *PtrScev = PSE.getSCEV(Ptr);
839 if (PSE.getSE()->isLoopInvariant(PtrScev, L))
840 return true;
841
842 int64_t Stride = getPtrStride(PSE, AccessTy, Ptr, L, Strides).value_or(0);
843 if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
844 return true;
845
846 return false;
847}
848
849static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
850 function_ref<void(Value *)> AddPointer) {
851 SmallPtrSet<Value *, 8> Visited;
852 SmallVector<Value *> WorkList;
853 WorkList.push_back(StartPtr);
854
855 while (!WorkList.empty()) {
856 Value *Ptr = WorkList.pop_back_val();
857 if (!Visited.insert(Ptr).second)
858 continue;
859 auto *PN = dyn_cast<PHINode>(Ptr);
860 // SCEV does not look through non-header PHIs inside the loop. Such phis
861 // can be analyzed by adding separate accesses for each incoming pointer
862 // value.
863 if (PN && InnermostLoop.contains(PN->getParent()) &&
864 PN->getParent() != InnermostLoop.getHeader()) {
865 for (const Use &Inc : PN->incoming_values())
866 WorkList.push_back(Inc);
867 } else
868 AddPointer(Ptr);
869 }
870}
871
872// Walk back through the IR for a pointer, looking for a select like the
873// following:
874//
875// %offset = select i1 %cmp, i64 %a, i64 %b
876// %addr = getelementptr double, double* %base, i64 %offset
877// %ld = load double, double* %addr, align 8
878//
879// We won't be able to form a single SCEVAddRecExpr from this since the
880// address for each loop iteration depends on %cmp. We could potentially
881// produce multiple valid SCEVAddRecExprs, though, and check all of them for
882// memory safety/aliasing if needed.
883//
884// If we encounter some IR we don't yet handle, or something obviously fine
885// like a constant, then we just add the SCEV for that term to the list passed
886// in by the caller. If we have a node that may potentially yield a valid
887// SCEVAddRecExpr then we decompose it into parts and build the SCEV terms
888// ourselves before adding to the list.
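// For the example above (illustrative), the two SCEVs we would like to obtain
// are roughly (%base + 8 * %a) and (%base + 8 * %b), one per select arm, so
// that each candidate address can be bounds-checked independently.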
889static void findForkedSCEVs(
890 ScalarEvolution *SE, const Loop *L, Value *Ptr,
891 SmallVectorImpl<PointerIntPair<const SCEV *, 1, bool>> &ScevList,
892 unsigned Depth) {
893 // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or
894 // we've exceeded our limit on recursion, just return whatever we have
895 // regardless of whether it can be used for a forked pointer or not, along
896 // with an indication of whether it might be a poison or undef value.
897 const SCEV *Scev = SE->getSCEV(Ptr);
898 if (isa<SCEVAddRecExpr>(Scev) || L->isLoopInvariant(Ptr) ||
899 !isa<Instruction>(Ptr) || Depth == 0) {
900 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
901 return;
902 }
903
904 Depth--;
905
906 auto UndefPoisonCheck = [](PointerIntPair<const SCEV *, 1, bool> S) {
907 return get<1>(S);
908 };
909
910 auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) {
911 switch (Opcode) {
912 case Instruction::Add:
913 return SE->getAddExpr(L, R);
914 case Instruction::Sub:
915 return SE->getMinusSCEV(L, R);
916 default:
917 llvm_unreachable("Unexpected binary operator when walking ForkedPtrs");
918 }
919 };
920
921 Instruction *I = cast<Instruction>(Ptr);
922 unsigned Opcode = I->getOpcode();
923 switch (Opcode) {
924 case Instruction::GetElementPtr: {
925 GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
926 Type *SourceTy = GEP->getSourceElementType();
927 // We only handle base + single offset GEPs here for now.
928 // Not dealing with preexisting gathers yet, so no vectors.
929 if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
930 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(GEP));
931 break;
932 }
933 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> BaseScevs;
934 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> OffsetScevs;
935 findForkedSCEVs(SE, L, I->getOperand(0), BaseScevs, Depth);
936 findForkedSCEVs(SE, L, I->getOperand(1), OffsetScevs, Depth);
937
938 // See if we need to freeze our fork...
939 bool NeedsFreeze = any_of(BaseScevs, UndefPoisonCheck) ||
940 any_of(OffsetScevs, UndefPoisonCheck);
941
942 // Check that we only have a single fork, on either the base or the offset.
943 // Copy the SCEV across for the one without a fork in order to generate
944 // the full SCEV for both sides of the GEP.
945 if (OffsetScevs.size() == 2 && BaseScevs.size() == 1)
946 BaseScevs.push_back(BaseScevs[0]);
947 else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1)
948 OffsetScevs.push_back(OffsetScevs[0]);
949 else {
950 ScevList.emplace_back(Scev, NeedsFreeze);
951 break;
952 }
953
954 // Find the pointer type we need to extend to.
955 Type *IntPtrTy = SE->getEffectiveSCEVType(
956 SE->getSCEV(GEP->getPointerOperand())->getType());
957
958 // Find the size of the type being pointed to. We only have a single
959 // index term (guarded above) so we don't need to index into arrays or
960 // structures, just get the size of the scalar value.
961 const SCEV *Size = SE->getSizeOfExpr(IntPtrTy, SourceTy);
962
963 // Scale up the offsets by the size of the type, then add to the bases.
964 const SCEV *Scaled1 = SE->getMulExpr(
965 Size, SE->getTruncateOrSignExtend(get<0>(OffsetScevs[0]), IntPtrTy));
966 const SCEV *Scaled2 = SE->getMulExpr(
967 Size, SE->getTruncateOrSignExtend(get<0>(OffsetScevs[1]), IntPtrTy));
968 ScevList.emplace_back(SE->getAddExpr(get<0>(BaseScevs[0]), Scaled1),
969 NeedsFreeze);
970 ScevList.emplace_back(SE->getAddExpr(get<0>(BaseScevs[1]), Scaled2),
971 NeedsFreeze);
972 break;
973 }
974 case Instruction::Select: {
975 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
976 // A select means we've found a forked pointer, but we currently only
977 // support a single select per pointer so if there's another behind this
978 // then we just bail out and return the generic SCEV.
979 findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
980 findForkedSCEVs(SE, L, I->getOperand(2), ChildScevs, Depth);
981 if (ChildScevs.size() == 2) {
982 ScevList.push_back(ChildScevs[0]);
983 ScevList.push_back(ChildScevs[1]);
984 } else
985 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
986 break;
987 }
988 case Instruction::PHI: {
989 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
990 // A phi means we've found a forked pointer, but we currently only
991 // support a single phi per pointer so if there's another behind this
992 // then we just bail out and return the generic SCEV.
993 if (I->getNumOperands() == 2) {
994 findForkedSCEVs(SE, L, I->getOperand(0), ChildScevs, Depth);
995 findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
996 }
997 if (ChildScevs.size() == 2) {
998 ScevList.push_back(ChildScevs[0]);
999 ScevList.push_back(ChildScevs[1]);
1000 } else
1001 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
1002 break;
1003 }
1004 case Instruction::Add:
1005 case Instruction::Sub: {
1006 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> LScevs;
1007 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> RScevs;
1008 findForkedSCEVs(SE, L, I->getOperand(0), LScevs, Depth);
1009 findForkedSCEVs(SE, L, I->getOperand(1), RScevs, Depth);
1010
1011 // See if we need to freeze our fork...
1012 bool NeedsFreeze =
1013 any_of(LScevs, UndefPoisonCheck) || any_of(RScevs, UndefPoisonCheck);
1014
1015 // Check that we only have a single fork, on either the left or right side.
1016 // Copy the SCEV across for the one without a fork in order to generate
1017 // the full SCEV for both sides of the BinOp.
1018 if (LScevs.size() == 2 && RScevs.size() == 1)
1019 RScevs.push_back(RScevs[0]);
1020 else if (RScevs.size() == 2 && LScevs.size() == 1)
1021 LScevs.push_back(LScevs[0]);
1022 else {
1023 ScevList.emplace_back(Scev, NeedsFreeze);
1024 break;
1025 }
1026
1027 ScevList.emplace_back(
1028 GetBinOpExpr(Opcode, get<0>(LScevs[0]), get<0>(RScevs[0])),
1029 NeedsFreeze);
1030 ScevList.emplace_back(
1031 GetBinOpExpr(Opcode, get<0>(LScevs[1]), get<0>(RScevs[1])),
1032 NeedsFreeze);
1033 break;
1034 }
1035 default:
1036 // Just return the current SCEV if we haven't handled the instruction yet.
1037 LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
1038 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
1039 break;
1040 }
1041}
1042
1043static SmallVector<PointerIntPair<const SCEV *, 1, bool>>
1044findForkedPointer(PredicatedScalarEvolution &PSE,
1045 const DenseMap<Value *, const SCEV *> &StridesMap, Value *Ptr,
1046 const Loop *L) {
1047 ScalarEvolution *SE = PSE.getSE();
1048 assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
1049 SmallVector<PointerIntPair<const SCEV *, 1, bool>> Scevs;
1050 findForkedSCEVs(SE, L, Ptr, Scevs, MaxForkedSCEVDepth);
1051
1052 // For now, we will only accept a forked pointer with two possible SCEVs
1053 // that are either SCEVAddRecExprs or loop invariant.
1054 if (Scevs.size() == 2 &&
1055 (isa<SCEVAddRecExpr>(get<0>(Scevs[0])) ||
1056 SE->isLoopInvariant(get<0>(Scevs[0]), L)) &&
1057 (isa<SCEVAddRecExpr>(get<0>(Scevs[1])) ||
1058 SE->isLoopInvariant(get<0>(Scevs[1]), L))) {
1059 LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n");
1060 LLVM_DEBUG(dbgs() << "\t(1) " << *get<0>(Scevs[0]) << "\n");
1061 LLVM_DEBUG(dbgs() << "\t(2) " << *get<0>(Scevs[1]) << "\n");
1062 return Scevs;
1063 }
1064
1065 return {{replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false}};
1066}
1067
1068bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
1069 MemAccessInfo Access, Type *AccessTy,
1070 const DenseMap<Value *, const SCEV *> &StridesMap,
1071 DenseMap<Value *, unsigned> &DepSetId,
1072 Loop *TheLoop, unsigned &RunningDepId,
1073 unsigned ASId, bool ShouldCheckWrap,
1074 bool Assume) {
1075 Value *Ptr = Access.getPointer();
1076
1077 SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs =
1078 findForkedPointer(PSE, StridesMap, Ptr, TheLoop);
1079
1080 for (auto &P : TranslatedPtrs) {
1081 const SCEV *PtrExpr = get<0>(P);
1082 if (!hasComputableBounds(PSE, Ptr, PtrExpr, TheLoop, Assume))
1083 return false;
1084
1085 // When we run after a failing dependency check we have to make sure
1086 // we don't have wrapping pointers.
1087 if (ShouldCheckWrap) {
1088 // Skip wrap checking when translating pointers.
1089 if (TranslatedPtrs.size() > 1)
1090 return false;
1091
1092 if (!isNoWrap(PSE, StridesMap, Ptr, AccessTy, TheLoop)) {
1093 auto *Expr = PSE.getSCEV(Ptr);
1094 if (!Assume || !isa<SCEVAddRecExpr>(Expr))
1095 return false;
1096 PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
1097 }
1098 }
1099 // If there's only one option for Ptr, look it up after bounds and wrap
1100 // checking, because assumptions might have been added to PSE.
1101 if (TranslatedPtrs.size() == 1)
1102 TranslatedPtrs[0] = {replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr),
1103 false};
1104 }
1105
1106 for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) {
1107 // The id of the dependence set.
1108 unsigned DepId;
1109
1110 if (isDependencyCheckNeeded()) {
1111 Value *Leader = DepCands.getLeaderValue(Access).getPointer();
1112 unsigned &LeaderId = DepSetId[Leader];
1113 if (!LeaderId)
1114 LeaderId = RunningDepId++;
1115 DepId = LeaderId;
1116 } else
1117 // Each access has its own dependence set.
1118 DepId = RunningDepId++;
1119
1120 bool IsWrite = Access.getInt();
1121 RtCheck.insert(TheLoop, Ptr, PtrExpr, AccessTy, IsWrite, DepId, ASId, PSE,
1122 NeedsFreeze);
1123 LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
1124 }
1125
1126 return true;
1127}
1128
1129bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
1130 ScalarEvolution *SE, Loop *TheLoop,
1131 const DenseMap<Value *, const SCEV *> &StridesMap,
1132 Value *&UncomputablePtr, bool ShouldCheckWrap) {
1133 // Find pointers with computable bounds. We are going to use this information
1134 // to place a runtime bound check.
1135 bool CanDoRT = true;
1136
1137 bool MayNeedRTCheck = false;
1138 if (!IsRTCheckAnalysisNeeded) return true;
1139
1140 bool IsDepCheckNeeded = isDependencyCheckNeeded();
1141
1142 // We assign a consecutive id to accesses from different alias sets.
1143 // Accesses between different groups don't need to be checked.
1144 unsigned ASId = 0;
1145 for (auto &AS : AST) {
1146 int NumReadPtrChecks = 0;
1147 int NumWritePtrChecks = 0;
1148 bool CanDoAliasSetRT = true;
1149 ++ASId;
1150 auto ASPointers = AS.getPointers();
1151
1152 // We assign consecutive ids to accesses from different dependence sets.
1153 // Accesses within the same set don't need a runtime check.
1154 unsigned RunningDepId = 1;
1155 DenseMap<Value *, unsigned> DepSetId;
1156
1157 SmallVector<std::pair<MemAccessInfo, Type *>, 4> Retries;
1158
1159 // First, count how many write and read accesses are in the alias set. Also
1160 // collect MemAccessInfos for later.
1161 SmallVector<MemAccessInfo, 8> AccessInfos;
1162 for (const Value *Ptr_ : ASPointers) {
1163 Value *Ptr = const_cast<Value *>(Ptr_);
1164 bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
1165 if (IsWrite)
1166 ++NumWritePtrChecks;
1167 else
1168 ++NumReadPtrChecks;
1169 AccessInfos.emplace_back(Ptr, IsWrite);
1170 }
1171
1172 // We do not need runtime checks for this alias set, if there are no writes
1173 // or a single write and no reads.
1174 if (NumWritePtrChecks == 0 ||
1175 (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) {
1176 assert((ASPointers.size() <= 1 ||
1177 all_of(ASPointers,
1178 [this](const Value *Ptr) {
1179 MemAccessInfo AccessWrite(const_cast<Value *>(Ptr),
1180 true);
1181 return DepCands.findValue(AccessWrite) == DepCands.end();
1182 })) &&
1183 "Can only skip updating CanDoRT below, if all entries in AS "
1184 "are reads or there is at most 1 entry");
1185 continue;
1186 }
1187
1188 for (auto &Access : AccessInfos) {
1189 for (const auto &AccessTy : Accesses[Access]) {
1190 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1191 DepSetId, TheLoop, RunningDepId, ASId,
1192 ShouldCheckWrap, false)) {
1193 LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
1194 << *Access.getPointer() << '\n');
1195 Retries.push_back({Access, AccessTy});
1196 CanDoAliasSetRT = false;
1197 }
1198 }
1199 }
1200
1201 // Note that this function computes CanDoRT and MayNeedRTCheck
1202 // independently. For example CanDoRT=false, MayNeedRTCheck=false means that
1203 // we have a pointer for which we couldn't find the bounds but we don't
1204 // actually need to emit any checks so it does not matter.
1205 //
1206 // We need runtime checks for this alias set, if there are at least 2
1207 // dependence sets (in which case RunningDepId > 2) or if we need to re-try
1208 // any bound checks (because in that case the number of dependence sets is
1209 // incomplete).
1210 bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty();
1211
1212 // We need to perform run-time alias checks, but some pointers had bounds
1213 // that couldn't be checked.
1214 if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) {
1215 // Reset the CanDoSetRt flag and retry all accesses that have failed.
1216 // We know that we need these checks, so we can now be more aggressive
1217 // and add further checks if required (overflow checks).
1218 CanDoAliasSetRT = true;
1219 for (auto Retry : Retries) {
1220 MemAccessInfo Access = Retry.first;
1221 Type *AccessTy = Retry.second;
1222 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1223 DepSetId, TheLoop, RunningDepId, ASId,
1224 ShouldCheckWrap, /*Assume=*/true)) {
1225 CanDoAliasSetRT = false;
1226 UncomputablePtr = Access.getPointer();
1227 break;
1228 }
1229 }
1230 }
1231
1232 CanDoRT &= CanDoAliasSetRT;
1233 MayNeedRTCheck |= NeedsAliasSetRTCheck;
1234 ++ASId;
1235 }
1236
1237 // If the pointers that we would use for the bounds comparison have different
1238 // address spaces, assume the values aren't directly comparable, so we can't
1239 // use them for the runtime check. We also have to assume they could
1240 // overlap. In the future there should be metadata for whether address spaces
1241 // are disjoint.
1242 unsigned NumPointers = RtCheck.Pointers.size();
1243 for (unsigned i = 0; i < NumPointers; ++i) {
1244 for (unsigned j = i + 1; j < NumPointers; ++j) {
1245 // Only need to check pointers between two different dependency sets.
1246 if (RtCheck.Pointers[i].DependencySetId ==
1247 RtCheck.Pointers[j].DependencySetId)
1248 continue;
1249 // Only need to check pointers in the same alias set.
1250 if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId)
1251 continue;
1252
1253 Value *PtrI = RtCheck.Pointers[i].PointerValue;
1254 Value *PtrJ = RtCheck.Pointers[j].PointerValue;
1255
1256 unsigned ASi = PtrI->getType()->getPointerAddressSpace();
1257 unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
1258 if (ASi != ASj) {
1259 LLVM_DEBUG(
1260 dbgs() << "LAA: Runtime check would require comparison between"
1261 " different address spaces\n");
1262 return false;
1263 }
1264 }
1265 }
1266
1267 if (MayNeedRTCheck && CanDoRT)
1268 RtCheck.generateChecks(DepCands, IsDepCheckNeeded);
1269
1270 LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
1271 << " pointer comparisons.\n");
1272
1273 // If we can do run-time checks, but there are no checks, no runtime checks
1274 // are needed. This can happen when all pointers point to the same underlying
1275 // object for example.
1276 RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck;
1277
1278 bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT;
1279 if (!CanDoRTIfNeeded)
1280 RtCheck.reset();
1281 return CanDoRTIfNeeded;
1282}
1283
1284void AccessAnalysis::processMemAccesses() {
1285 // We process the set twice: first we process read-write pointers, last we
1286 // process read-only pointers. This allows us to skip dependence tests for
1287 // read-only pointers.
1288
1289 LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
1290 LLVM_DEBUG(dbgs() << " AST: "; AST.dump());
1291 LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
1292 LLVM_DEBUG({
1293 for (auto A : Accesses)
1294 dbgs() << "\t" << *A.first.getPointer() << " ("
1295 << (A.first.getInt()
1296 ? "write"
1297 : (ReadOnlyPtr.count(A.first.getPointer()) ? "read-only"
1298 : "read"))
1299 << ")\n";
1300 });
1301
1302 // The AliasSetTracker has nicely partitioned our pointers by metadata
1303 // compatibility and potential for underlying-object overlap. As a result, we
1304 // only need to check for potential pointer dependencies within each alias
1305 // set.
1306 for (const auto &AS : AST) {
1307 // Note that both the alias-set tracker and the alias sets themselves use
1308 // ordered collections internally and so the iteration order here is
1309 // deterministic.
1310 auto ASPointers = AS.getPointers();
1311
1312 bool SetHasWrite = false;
1313
1314 // Map of pointers to last access encountered.
1315 typedef DenseMap<const Value*, MemAccessInfo> UnderlyingObjToAccessMap;
1316 UnderlyingObjToAccessMap ObjToLastAccess;
1317
1318 // Set of access to check after all writes have been processed.
1319 PtrAccessMap DeferredAccesses;
1320
1321 // Iterate over each alias set twice, once to process read/write pointers,
1322 // and then to process read-only pointers.
1323 for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
1324 bool UseDeferred = SetIteration > 0;
1325 PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses;
1326
1327 for (const Value *Ptr_ : ASPointers) {
1328 Value *Ptr = const_cast<Value *>(Ptr_);
1329
1330 // For a single memory access in AliasSetTracker, Accesses may contain
1331 // both read and write, and they both need to be handled for CheckDeps.
1332 for (const auto &AC : S) {
1333 if (AC.first.getPointer() != Ptr)
1334 continue;
1335
1336 bool IsWrite = AC.first.getInt();
1337
1338 // If we're using the deferred access set, then it contains only
1339 // reads.
1340 bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
1341 if (UseDeferred && !IsReadOnlyPtr)
1342 continue;
1343 // Otherwise, the pointer must be in the PtrAccessSet, either as a
1344 // read or a write.
1345 assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
1346 S.count(MemAccessInfo(Ptr, false))) &&
1347 "Alias-set pointer not in the access set?");
1348
1349 MemAccessInfo Access(Ptr, IsWrite);
1350 DepCands.insert(Access);
1351
1352 // Memorize read-only pointers for later processing and skip them in
1353 // the first round (they need to be checked after we have seen all
1354 // write pointers). Note: we also mark pointers that are not
1355 // consecutive as "read-only" pointers (so that we check
1356 // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
1357 if (!UseDeferred && IsReadOnlyPtr) {
1358 // We only use the pointer keys, the types vector values don't
1359 // matter.
1360 DeferredAccesses.insert({Access, {}});
1361 continue;
1362 }
1363
1364 // If this is a write - check other reads and writes for conflicts. If
1365 // this is a read only check other writes for conflicts (but only if
1366 // there is no other write to the ptr - this is an optimization to
1367 // catch "a[i] = a[i] + " without having to do a dependence check).
1368 if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
1369 CheckDeps.push_back(Access);
1370 IsRTCheckAnalysisNeeded = true;
1371 }
1372
1373 if (IsWrite)
1374 SetHasWrite = true;
1375
1376 // Create sets of pointers connected by a shared alias set and
1377 // underlying object.
1378 typedef SmallVector<const Value *, 16> ValueVector;
1379 ValueVector TempObjects;
1380
1381 UnderlyingObjects[Ptr] = {};
1382 SmallVector<const Value *, 16> &UOs = UnderlyingObjects[Ptr];
1383 ::getUnderlyingObjects(Ptr, UOs, LI);
1385 << "Underlying objects for pointer " << *Ptr << "\n");
1386 for (const Value *UnderlyingObj : UOs) {
1387 // Null pointers never alias; don't join sets for pointers that have "null"
1388 // in their UnderlyingObjects list.
1389 if (isa<ConstantPointerNull>(UnderlyingObj) &&
1390 !NullPointerIsDefined(
1391 TheLoop->getHeader()->getParent(),
1392 UnderlyingObj->getType()->getPointerAddressSpace()))
1393 continue;
1394
1395 UnderlyingObjToAccessMap::iterator Prev =
1396 ObjToLastAccess.find(UnderlyingObj);
1397 if (Prev != ObjToLastAccess.end())
1398 DepCands.unionSets(Access, Prev->second);
1399
1400 ObjToLastAccess[UnderlyingObj] = Access;
1401 LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n");
1402 }
1403 }
1404 }
1405 }
1406 }
1407}
1408
1409/// Return true if an AddRec pointer \p Ptr is unsigned non-wrapping,
1410/// i.e. monotonically increasing/decreasing.
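/// For example (illustrative): for "%gep = getelementptr inbounds i32, ptr %A,
/// i64 %iv" where %iv is an NSW add-recurrence such as {0,+,1}, the pointer
/// recurrence {%A,+,4} can be proven non-wrapping here even when SCEV itself
/// does not attach a no-wrap flag to the pointer expression.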
1411static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
1412 PredicatedScalarEvolution &PSE, const Loop *L) {
1413
1414 // FIXME: This should probably only return true for NUW.
1415 if (AR->getNoWrapFlags(SCEV::NoWrapMask))
1416 return true;
1417
1418 if (PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
1419 return true;
1420
1421 // Scalar evolution does not propagate the non-wrapping flags to values that
1422 // are derived from a non-wrapping induction variable because non-wrapping
1423 // could be flow-sensitive.
1424 //
1425 // Look through the potentially overflowing instruction to try to prove
1426 // non-wrapping for the *specific* value of Ptr.
1427
1428 // The arithmetic implied by an inbounds GEP can't overflow.
1429 auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
1430 if (!GEP || !GEP->isInBounds())
1431 return false;
1432
1433 // Make sure there is only one non-const index and analyze that.
1434 Value *NonConstIndex = nullptr;
1435 for (Value *Index : GEP->indices())
1436 if (!isa<ConstantInt>(Index)) {
1437 if (NonConstIndex)
1438 return false;
1439 NonConstIndex = Index;
1440 }
1441 if (!NonConstIndex)
1442 // The recurrence is on the pointer, ignore for now.
1443 return false;
1444
1445 // The index in GEP is signed. It is non-wrapping if it's derived from a NSW
1446 // AddRec using a NSW operation.
1447 if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(NonConstIndex))
1448 if (OBO->hasNoSignedWrap() &&
1449 // Assume the other operand is constant so that the AddRec can be
1450 // easily found.
1451 isa<ConstantInt>(OBO->getOperand(1))) {
1452 auto *OpScev = PSE.getSCEV(OBO->getOperand(0));
1453
1454 if (auto *OpAR = dyn_cast<SCEVAddRecExpr>(OpScev))
1455 return OpAR->getLoop() == L && OpAR->getNoWrapFlags(SCEV::FlagNSW);
1456 }
1457
1458 return false;
1459}
1460
1461/// Check whether the access through \p Ptr has a constant stride.
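/// For example (illustrative): for an access A[2 * i] on i32 elements, the
/// pointer SCEV is {%A,+,8}<%loop>; with an element size of 4 bytes the
/// returned stride is 2. A negative step such as {%A,+,-4} yields stride -1.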
1462std::optional<int64_t> llvm::getPtrStride(PredicatedScalarEvolution &PSE,
1463 Type *AccessTy, Value *Ptr,
1464 const Loop *Lp,
1465 const DenseMap<Value *, const SCEV *> &StridesMap,
1466 bool Assume, bool ShouldCheckWrap) {
1467 Type *Ty = Ptr->getType();
1468 assert(Ty->isPointerTy() && "Unexpected non-ptr");
1469
1470 if (isa<ScalableVectorType>(AccessTy)) {
1471 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy
1472 << "\n");
1473 return std::nullopt;
1474 }
1475
1476 const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
1477
1478 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
1479 if (Assume && !AR)
1480 AR = PSE.getAsAddRec(Ptr);
1481
1482 if (!AR) {
1483 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr
1484 << " SCEV: " << *PtrScev << "\n");
1485 return std::nullopt;
1486 }
1487
1488 // The access function must stride over the innermost loop.
1489 if (Lp != AR->getLoop()) {
1490 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop "
1491 << *Ptr << " SCEV: " << *AR << "\n");
1492 return std::nullopt;
1493 }
1494
1495 // Check the step is constant.
1496 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
1497
1498 // Calculate the pointer stride and check if it is constant.
1499 const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
1500 if (!C) {
1501 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr
1502 << " SCEV: " << *AR << "\n");
1503 return std::nullopt;
1504 }
1505
1506 auto &DL = Lp->getHeader()->getModule()->getDataLayout();
1507 TypeSize AllocSize = DL.getTypeAllocSize(AccessTy);
1508 int64_t Size = AllocSize.getFixedValue();
1509 const APInt &APStepVal = C->getAPInt();
1510
1511 // Huge step value - give up.
1512 if (APStepVal.getBitWidth() > 64)
1513 return std::nullopt;
1514
1515 int64_t StepVal = APStepVal.getSExtValue();
1516
1517 // Strided access.
1518 int64_t Stride = StepVal / Size;
1519 int64_t Rem = StepVal % Size;
1520 if (Rem)
1521 return std::nullopt;
1522
1523 if (!ShouldCheckWrap)
1524 return Stride;
1525
1526 // The address calculation must not wrap. Otherwise, a dependence could be
1527 // inverted.
1528 if (isNoWrapAddRec(Ptr, AR, PSE, Lp))
1529 return Stride;
1530
1531 // An inbounds getelementptr that is an AddRec with a unit stride
1532 // cannot wrap per definition. If it did, the result would be poison
1533 // and any memory access dependent on it would be immediate UB
1534 // when executed.
1535 if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
1536 GEP && GEP->isInBounds() && (Stride == 1 || Stride == -1))
1537 return Stride;
1538
1539 // If the null pointer is undefined, then an access sequence which would
1540 // otherwise access it can be assumed not to unsigned wrap. Note that this
1541 // assumes the object in memory is aligned to the natural alignment.
1542 unsigned AddrSpace = Ty->getPointerAddressSpace();
1543 if (!NullPointerIsDefined(Lp->getHeader()->getParent(), AddrSpace) &&
1544 (Stride == 1 || Stride == -1))
1545 return Stride;
1546
1547 if (Assume) {
1549 LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n"
1550 << "LAA: Pointer: " << *Ptr << "\n"
1551 << "LAA: SCEV: " << *AR << "\n"
1552 << "LAA: Added an overflow assumption\n");
1553 return Stride;
1554 }
1555 LLVM_DEBUG(
1556 dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
1557 << *Ptr << " SCEV: " << *AR << "\n");
1558 return std::nullopt;
1559}
1560
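// Illustrative use of getPointersDiff (values assumed): for PtrA = &A[1] and
// PtrB = &A[4] with i32 elements, the byte offset is 12 and the returned
// element distance is 3; with StrictCheck the byte distance must be an exact
// multiple of the element size.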
1561std::optional<int> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA,
1562 Type *ElemTyB, Value *PtrB,
1563 const DataLayout &DL,
1564 ScalarEvolution &SE, bool StrictCheck,
1565 bool CheckType) {
1566 assert(PtrA && PtrB && "Expected non-nullptr pointers.");
1567
1568 // Make sure that A and B are different pointers.
1569 if (PtrA == PtrB)
1570 return 0;
1571
1572 // Make sure that the element types are the same if required.
1573 if (CheckType && ElemTyA != ElemTyB)
1574 return std::nullopt;
1575
1576 unsigned ASA = PtrA->getType()->getPointerAddressSpace();
1577 unsigned ASB = PtrB->getType()->getPointerAddressSpace();
1578
1579 // Check that the address spaces match.
1580 if (ASA != ASB)
1581 return std::nullopt;
1582 unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
1583
1584 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
1585 Value *PtrA1 = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1586 Value *PtrB1 = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1587
1588 int Val;
1589 if (PtrA1 == PtrB1) {
1590 // Retrieve the address space again as pointer stripping now tracks through
1591 // `addrspacecast`.
1592 ASA = cast<PointerType>(PtrA1->getType())->getAddressSpace();
1593 ASB = cast<PointerType>(PtrB1->getType())->getAddressSpace();
1594 // Check that the address spaces match and that the pointers are valid.
1595 if (ASA != ASB)
1596 return std::nullopt;
1597
1598 IdxWidth = DL.getIndexSizeInBits(ASA);
1599 OffsetA = OffsetA.sextOrTrunc(IdxWidth);
1600 OffsetB = OffsetB.sextOrTrunc(IdxWidth);
1601
1602 OffsetB -= OffsetA;
1603 Val = OffsetB.getSExtValue();
1604 } else {
1605 // Otherwise compute the distance with SCEV between the base pointers.
1606 const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
1607 const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
1608 const auto *Diff =
1609 dyn_cast<SCEVConstant>(SE.getMinusSCEV(PtrSCEVB, PtrSCEVA));
1610 if (!Diff)
1611 return std::nullopt;
1612 Val = Diff->getAPInt().getSExtValue();
1613 }
1614 int Size = DL.getTypeStoreSize(ElemTyA);
1615 int Dist = Val / Size;
1616
1617 // Ensure that the calculated distance matches the type-based one after all
1618 // bitcasts have been removed from the provided pointers.
1619 if (!StrictCheck || Dist * Size == Val)
1620 return Dist;
1621 return std::nullopt;
1622}
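The tail of getPointersDiff reduces to one division: a byte difference between the two pointers divided by the element store size, with StrictCheck demanding an exact multiple. A small stand-alone sketch of just that step (plain integers, hypothetical helper name):

#include <cstdint>
#include <optional>

static std::optional<int> elementDistance(int64_t ByteDiff, int ElemStoreSize,
                                          bool StrictCheck) {
  if (ElemStoreSize <= 0)
    return std::nullopt;
  int Dist = static_cast<int>(ByteDiff / ElemStoreSize);
  // Under StrictCheck the byte difference must be an exact multiple of the
  // element size, mirroring the Dist * Size == Val test above.
  if (!StrictCheck || static_cast<int64_t>(Dist) * ElemStoreSize == ByteDiff)
    return Dist;
  return std::nullopt;
}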
1623
1624bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
1625 const DataLayout &DL, ScalarEvolution &SE,
1626 SmallVectorImpl<unsigned> &SortedIndices) {
1627 assert(llvm::all_of(
1628 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
1629 "Expected list of pointer operands.");
1630 // Walk over the pointers, and map each of them to an offset relative to
1631 // the first pointer in the array.
1632 Value *Ptr0 = VL[0];
1633
1634 using DistOrdPair = std::pair<int64_t, int>;
1635 auto Compare = llvm::less_first();
1636 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
1637 Offsets.emplace(0, 0);
1638 int Cnt = 1;
1639 bool IsConsecutive = true;
1640 for (auto *Ptr : VL.drop_front()) {
1641 std::optional<int> Diff = getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, DL, SE,
1642 /*StrictCheck=*/true);
1643 if (!Diff)
1644 return false;
1645
1646 // Check if a pointer with the same offset has already been seen.
1647 int64_t Offset = *Diff;
1648 auto Res = Offsets.emplace(Offset, Cnt);
1649 if (!Res.second)
1650 return false;
1651 // Consecutive order if the inserted element is the last one.
1652 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
1653 ++Cnt;
1654 }
1655 SortedIndices.clear();
1656 if (!IsConsecutive) {
1657 // Fill SortedIndices array only if it is non-consecutive.
1658 SortedIndices.resize(VL.size());
1659 Cnt = 0;
1660 for (const std::pair<int64_t, int> &Pair : Offsets) {
1661 SortedIndices[Cnt] = Pair.second;
1662 ++Cnt;
1663 }
1664 }
1665 return true;
1666}
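sortPtrAccesses keys an ordered set by byte offset from the first pointer: inserting a duplicate offset fails, and the accesses are already consecutive exactly when every new offset lands at the end of the set. A stripped-down model of that idea, with precomputed offsets standing in for the getPointersDiff calls (names are illustrative only):

#include <cstdint>
#include <set>
#include <utility>
#include <vector>

static bool sortByOffset(const std::vector<int64_t> &Offsets,
                         std::vector<int> &SortedIndices) {
  std::set<std::pair<int64_t, int>> Ordered; // (offset, original index)
  for (int I = 0, E = static_cast<int>(Offsets.size()); I != E; ++I)
    if (!Ordered.emplace(Offsets[I], I).second)
      return false; // two pointers at the same offset
  SortedIndices.clear();
  for (const auto &Pair : Ordered)
    SortedIndices.push_back(Pair.second);
  return true;
}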
1667
1668/// Returns true if the memory operations \p A and \p B are consecutive.
1669bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
1670 ScalarEvolution &SE, bool CheckType) {
1671 Value *PtrA = getLoadStorePointerOperand(A);
1672 Value *PtrB = getLoadStorePointerOperand(B);
1673 if (!PtrA || !PtrB)
1674 return false;
1675 Type *ElemTyA = getLoadStoreType(A);
1676 Type *ElemTyB = getLoadStoreType(B);
1677 std::optional<int> Diff =
1678 getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE,
1679 /*StrictCheck=*/true, CheckType);
1680 return Diff && *Diff == 1;
1681}
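isConsecutiveAccess is the public entry point declared in LoopAccessAnalysis.h. A hypothetical caller, shown only to illustrate the expected inputs (the wrapper name is made up; any pair of memory instructions from a function analyzed by ScalarEvolution would do):

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"

static bool loadsAreAdjacent(llvm::LoadInst *L0, llvm::LoadInst *L1,
                             llvm::ScalarEvolution &SE) {
  const llvm::DataLayout &DL = L0->getModule()->getDataLayout();
  // True when L1 reads the element immediately after the one read by L0.
  return llvm::isConsecutiveAccess(L0, L1, DL, SE, /*CheckType=*/true);
}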
1682
1683void MemoryDepChecker::addAccess(StoreInst *SI) {
1684 visitPointers(SI->getPointerOperand(), *InnermostLoop,
1685 [this, SI](Value *Ptr) {
1686 Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
1687 InstMap.push_back(SI);
1688 ++AccessIdx;
1689 });
1690}
1691
1692void MemoryDepChecker::addAccess(LoadInst *LI) {
1693 visitPointers(LI->getPointerOperand(), *InnermostLoop,
1694 [this, LI](Value *Ptr) {
1695 Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
1696 InstMap.push_back(LI);
1697 ++AccessIdx;
1698 });
1699}
1700
1701MemoryDepChecker::VectorizationSafetyStatus
1702MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
1703 switch (Type) {
1704 case NoDep:
1705 case Forward:
1706 case ForwardButPreventsForwarding:
1707 return VectorizationSafetyStatus::Safe;
1708
1709 case Unknown:
1710 return VectorizationSafetyStatus::PossiblySafeWithRtChecks;
1711 case BackwardVectorizable:
1712 case Backward:
1713 case BackwardVectorizableButPreventsForwarding:
1714 case IndirectUnsafe:
1715 return VectorizationSafetyStatus::Unsafe;
1716 }
1717 llvm_unreachable("unexpected DepType!");
1718}
1719
1720bool MemoryDepChecker::Dependence::isBackward() const {
1721 switch (Type) {
1722 case NoDep:
1723 case Forward:
1724 case ForwardButPreventsForwarding:
1725 case Unknown:
1726 case IndirectUnsafe:
1727 return false;
1728
1729 case BackwardVectorizable:
1730 case Backward:
1731 case BackwardVectorizableButPreventsForwarding:
1732 return true;
1733 }
1734 llvm_unreachable("unexpected DepType!");
1735}
1736
1737bool MemoryDepChecker::Dependence::isPossiblyBackward() const {
1738 return isBackward() || Type == Unknown;
1739}
1740
1741bool MemoryDepChecker::Dependence::isForward() const {
1742 switch (Type) {
1743 case Forward:
1744 case ForwardButPreventsForwarding:
1745 return true;
1746
1747 case NoDep:
1748 case Unknown:
1749 case BackwardVectorizable:
1750 case Backward:
1751 case BackwardVectorizableButPreventsForwarding:
1752 case IndirectUnsafe:
1753 return false;
1754 }
1755 llvm_unreachable("unexpected DepType!");
1756}
1757
1758bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
1759 uint64_t TypeByteSize) {
1760 // If loads occur at a distance that is not a multiple of a feasible vector
1761 // factor, store-load forwarding does not take place.
1762 // Positive dependences might cause trouble because vectorizing them might
1763 // prevent store-load forwarding, making vectorized code run a lot slower.
1764 // a[i] = a[i-3] ^ a[i-8];
1765 // The stores to a[i:i+1] don't align with the loads from a[i-3:i-2] and
1766 // hence on your typical architecture store-load forwarding does not take
1767 // place. Vectorizing in such cases does not make sense.
1768 // Store-load forwarding distance.
1768 // Store-load forwarding distance.
1769
1770 // After this many iterations store-to-load forwarding conflicts should not
1771 // cause any slowdowns.
1772 const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
1773 // Maximum vector factor.
1774 uint64_t MaxVFWithoutSLForwardIssues = std::min(
1775 VectorizerParams::MaxVectorWidth * TypeByteSize, MinDepDistBytes);
1776
1777 // Compute the smallest VF at which the store and load would be misaligned.
1778 for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues;
1779 VF *= 2) {
1780 // If the number of vector iterations between the store and the load is
1781 // small, we could incur conflicts.
1782 if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
1783 MaxVFWithoutSLForwardIssues = (VF >> 1);
1784 break;
1785 }
1786 }
1787
1788 if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
1789 LLVM_DEBUG(
1790 dbgs() << "LAA: Distance " << Distance
1791 << " that could cause a store-load forwarding conflict\n");
1792 return true;
1793 }
1794
1795 if (MaxVFWithoutSLForwardIssues < MinDepDistBytes &&
1796 MaxVFWithoutSLForwardIssues !=
1797 VectorizerParams::MaxVectorWidth * TypeByteSize)
1798 MinDepDistBytes = MaxVFWithoutSLForwardIssues;
1799 return false;
1800}
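The search above walks power-of-two vector factors and stops at the first one where the distance is both misaligned and close enough to stall store-to-load forwarding. A constant-only model of that walk (illustrative helper, not an LLVM API):

#include <cstdint>

static uint64_t maxForwardingSafeVFBytes(uint64_t Distance,
                                         uint64_t TypeByteSize,
                                         uint64_t MaxVFBytes) {
  const uint64_t NumItersThroughMemory = 8 * TypeByteSize;
  for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFBytes; VF *= 2)
    if (Distance % VF && Distance / VF < NumItersThroughMemory)
      return VF / 2; // the first misaligned-and-close factor caps the VF
  return MaxVFBytes;
}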
1801
1802void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
1803 if (Status < S)
1804 Status = S;
1805}
1806
1807/// Given a dependence-distance \p Dist between two
1808/// memory accesses that have the same stride whose absolute value is given
1809/// in \p Stride, and that have the same type size \p TypeByteSize,
1810/// in a loop whose backedge-taken count is \p BackedgeTakenCount, check if it is
1811/// possible to prove statically that the dependence distance is larger
1812/// than the range that the accesses will travel through the execution of
1813/// the loop. If so, return true; false otherwise. This is useful for
1814/// example in loops such as the following (PR31098):
1815/// for (i = 0; i < D; ++i) {
1816/// = out[i];
1817/// out[i+D] =
1818/// }
1819static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
1820 const SCEV &BackedgeTakenCount,
1821 const SCEV &Dist, uint64_t Stride,
1822 uint64_t TypeByteSize) {
1823
1824 // If we can prove that
1825 // (**) |Dist| > BackedgeTakenCount * Step
1826 // where Step is the absolute stride of the memory accesses in bytes,
1827 // then there is no dependence.
1828 //
1829 // Rationale:
1830 // We basically want to check if the absolute distance (|Dist/Step|)
1831 // is >= the loop iteration count (or > BackedgeTakenCount).
1832 // This is equivalent to the Strong SIV Test (Practical Dependence Testing,
1833 // Section 4.2.1); Note, that for vectorization it is sufficient to prove
1834 // that the dependence distance is >= VF; This is checked elsewhere.
1835 // But in some cases we can prune dependence distances early, and
1836 // even before selecting the VF, and without a runtime test, by comparing
1837 // the distance against the loop iteration count. Since the vectorized code
1838 // will be executed only if LoopCount >= VF, proving distance >= LoopCount
1839 // also guarantees that distance >= VF.
1840 //
1841 const uint64_t ByteStride = Stride * TypeByteSize;
1842 const SCEV *Step = SE.getConstant(BackedgeTakenCount.getType(), ByteStride);
1843 const SCEV *Product = SE.getMulExpr(&BackedgeTakenCount, Step);
1844
1845 const SCEV *CastedDist = &Dist;
1846 const SCEV *CastedProduct = Product;
1847 uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Dist.getType());
1848 uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Product->getType());
1849
1850 // The dependence distance can be positive/negative, so we sign extend Dist;
1851 // The multiplication of the absolute stride in bytes and the
1852 // backedgeTakenCount is non-negative, so we zero extend Product.
1853 if (DistTypeSizeBits > ProductTypeSizeBits)
1854 CastedProduct = SE.getZeroExtendExpr(Product, Dist.getType());
1855 else
1856 CastedDist = SE.getNoopOrSignExtend(&Dist, Product->getType());
1857
1858 // Is Dist - (BackedgeTakenCount * Step) > 0 ?
1859 // (If so, then we have proven (**) because |Dist| >= Dist)
1860 const SCEV *Minus = SE.getMinusSCEV(CastedDist, CastedProduct);
1861 if (SE.isKnownPositive(Minus))
1862 return true;
1863
1864 // Second try: Is -Dist - (BackedgeTakenCount * Step) > 0 ?
1865 // (If so, then we have proven (**) because |Dist| >= -1*Dist)
1866 const SCEV *NegDist = SE.getNegativeSCEV(CastedDist);
1867 Minus = SE.getMinusSCEV(NegDist, CastedProduct);
1868 if (SE.isKnownPositive(Minus))
1869 return true;
1870
1871 return false;
1872}
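When every quantity is a known constant, the test (**) collapses to a single comparison; the SCEV manipulation above exists to prove the same inequality symbolically, with the sign/zero extensions only reconciling operand widths. A constant-only sketch (helper name invented):

#include <cstdint>
#include <cstdlib>

static bool safeDistanceConstants(int64_t DistBytes, uint64_t BackedgeTakenCount,
                                  uint64_t Stride, uint64_t TypeByteSize) {
  // |Dist| > BackedgeTakenCount * Stride * TypeByteSize proves independence.
  return static_cast<uint64_t>(std::llabs(DistBytes)) >
         BackedgeTakenCount * Stride * TypeByteSize;
}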
1873
1874/// Check the dependence for two accesses with the same stride \p Stride.
1875/// \p Distance is the positive distance and \p TypeByteSize is the type size
1876/// in bytes.
1877///
1878/// \returns true if they are independent.
1879static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
1880 uint64_t TypeByteSize) {
1881 assert(Stride > 1 && "The stride must be greater than 1");
1882 assert(TypeByteSize > 0 && "The type size in byte must be non-zero");
1883 assert(Distance > 0 && "The distance must be non-zero");
1884
1885 // Skip if the distance is not multiple of type byte size.
1886 if (Distance % TypeByteSize)
1887 return false;
1888
1889 uint64_t ScaledDist = Distance / TypeByteSize;
1890
1891 // No dependence if the scaled distance is not multiple of the stride.
1892 // E.g.
1893 // for (i = 0; i < 1024 ; i += 4)
1894 // A[i+2] = A[i] + 1;
1895 //
1896 // Two accesses in memory (scaled distance is 2, stride is 4):
1897 // | A[0] | | | | A[4] | | | |
1898 // | | | A[2] | | | | A[6] | |
1899 //
1900 // E.g.
1901 // for (i = 0; i < 1024 ; i += 3)
1902 // A[i+4] = A[i] + 1;
1903 //
1904 // Two accesses in memory (scaled distance is 4, stride is 3):
1905 // | A[0] | | | A[3] | | | A[6] | | |
1906 // | | | | | A[4] | | | A[7] | |
1907 return ScaledDist % Stride;
1908}
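Running the first example from the comment through the same arithmetic: a distance of 8 bytes with stride 4 and 4-byte elements gives a scaled distance of 2, and 2 % 4 != 0, so the two access streams interleave without ever touching the same element; a 16-byte distance would give 4 % 4 == 0 and a real dependence. A stand-alone version of the check:

#include <cstdint>

static bool stridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
                                       uint64_t TypeByteSize) {
  if (Distance % TypeByteSize) // distance not a multiple of the element size
    return false;
  return (Distance / TypeByteSize) % Stride != 0;
}
// stridedAccessesIndependent(8, 4, 4) == true
// stridedAccessesIndependent(16, 4, 4) == false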
1909
1910/// Returns true if any of the underlying objects has a loop varying address,
1911/// i.e. may change in \p L.
1912static bool
1913isLoopVariantIndirectAddress(ArrayRef<const Value *> UnderlyingObjects,
1914 ScalarEvolution &SE, const Loop *L) {
1915 return any_of(UnderlyingObjects, [&SE, L](const Value *UO) {
1916 return !SE.isLoopInvariant(SE.getSCEV(const_cast<Value *>(UO)), L);
1917 });
1918}
1919
1920namespace {
1921struct DepDistanceStrideAndSizeInfo {
1922 const SCEV *Dist;
1923 uint64_t StrideA;
1924 uint64_t StrideB;
1925 uint64_t TypeByteSize;
1926 bool AIsWrite;
1927 bool BIsWrite;
1928
1929 DepDistanceStrideAndSizeInfo(const SCEV *Dist, uint64_t StrideA,
1930 uint64_t StrideB, uint64_t TypeByteSize,
1931 bool AIsWrite, bool BIsWrite)
1932 : Dist(Dist), StrideA(StrideA), StrideB(StrideB),
1933 TypeByteSize(TypeByteSize), AIsWrite(AIsWrite), BIsWrite(BIsWrite) {}
1934};
1935} // namespace
1936
1937// Get the dependence distance, strides, type size and whether each access is a
1938// write for the dependence between A and B. Returns a DepType if we can prove
1939// there is no dependence or the analysis fails. Outlined into a helper to limit
1940// the scope of various temporary variables, like A/BPtr, StrideA/BPtr and others.
1941// Returns either the dependence result, if it could already be determined, or a
1942// struct containing (Distance, Stride, TypeSize, AIsWrite, BIsWrite).
1943static std::variant<MemoryDepChecker::Dependence::DepType,
1944 DepDistanceStrideAndSizeInfo>
1945getDependenceDistanceStrideAndSize(
1946 const AccessAnalysis::MemAccessInfo &A, Instruction *AInst,
1947 const AccessAnalysis::MemAccessInfo &B, Instruction *BInst,
1948 const DenseMap<Value *, const SCEV *> &Strides,
1949 const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
1950 PredicatedScalarEvolution &PSE, const Loop *InnermostLoop) {
1951 auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
1952 auto &SE = *PSE.getSE();
1953 auto [APtr, AIsWrite] = A;
1954 auto [BPtr, BIsWrite] = B;
1955
1956 // Two reads are independent.
1957 if (!AIsWrite && !BIsWrite)
1958 return MemoryDepChecker::Dependence::NoDep;
1959
1960 Type *ATy = getLoadStoreType(AInst);
1961 Type *BTy = getLoadStoreType(BInst);
1962
1963 // We cannot check pointers in different address spaces.
1964 if (APtr->getType()->getPointerAddressSpace() !=
1965 BPtr->getType()->getPointerAddressSpace())
1966 return MemoryDepChecker::Dependence::Unknown;
1967
1968 int64_t StrideAPtr =
1969 getPtrStride(PSE, ATy, APtr, InnermostLoop, Strides, true).value_or(0);
1970 int64_t StrideBPtr =
1971 getPtrStride(PSE, BTy, BPtr, InnermostLoop, Strides, true).value_or(0);
1972
1973 const SCEV *Src = PSE.getSCEV(APtr);
1974 const SCEV *Sink = PSE.getSCEV(BPtr);
1975
1976 // If the induction step is negative we have to invert source and sink of the
1977 // dependence when measuring the distance between them. We should not swap
1978 // AIsWrite with BIsWrite, as their uses expect them in program order.
1979 if (StrideAPtr < 0) {
1980 std::swap(Src, Sink);
1981 std::swap(AInst, BInst);
1982 }
1983
1984 const SCEV *Dist = SE.getMinusSCEV(Sink, Src);
1985
1986 LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
1987 << "(Induction step: " << StrideAPtr << ")\n");
1988 LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst
1989 << ": " << *Dist << "\n");
1990
1991 // Needs accesses where the addresses of the accessed underlying objects do
1992 // not change within the loop.
1993 if (isLoopVariantIndirectAddress(UnderlyingObjects.find(APtr)->second, SE,
1994 InnermostLoop) ||
1995 isLoopVariantIndirectAddress(UnderlyingObjects.find(BPtr)->second, SE,
1996 InnermostLoop))
1997 return MemoryDepChecker::Dependence::IndirectUnsafe;
1998
1999 // Need accesses with constant strides and the same direction. We don't want
2000 // to vectorize "A[B[i]] += ..." and similar code or pointer arithmetic that
2001 // could wrap in the address space.
2002 if (!StrideAPtr || !StrideBPtr || (StrideAPtr > 0 && StrideBPtr < 0) ||
2003 (StrideAPtr < 0 && StrideBPtr > 0)) {
2004 LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
2005 return MemoryDepChecker::Dependence::Unknown;
2006 }
2007
2008 uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
2009 bool HasSameSize =
2010 DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
2011 if (!HasSameSize)
2012 TypeByteSize = 0;
2013 return DepDistanceStrideAndSizeInfo(Dist, std::abs(StrideAPtr),
2014 std::abs(StrideBPtr), TypeByteSize,
2015 AIsWrite, BIsWrite);
2016}
2017
2018MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
2019 const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
2020 unsigned BIdx, const DenseMap<Value *, const SCEV *> &Strides,
2021 const DenseMap<Value *, SmallVector<const Value *, 16>>
2022 &UnderlyingObjects) {
2023 assert(AIdx < BIdx && "Must pass arguments in program order");
2024
2025 // Get the dependence distance, stride, type size and which access writes
2026 // for the dependence between A and B.
2027 auto Res = getDependenceDistanceStrideAndSize(
2028 A, InstMap[AIdx], B, InstMap[BIdx], Strides, UnderlyingObjects, PSE,
2029 InnermostLoop);
2030 if (std::holds_alternative<Dependence::DepType>(Res))
2031 return std::get<Dependence::DepType>(Res);
2032
2033 const auto &[Dist, StrideA, StrideB, TypeByteSize, AIsWrite, BIsWrite] =
2034 std::get<DepDistanceStrideAndSizeInfo>(Res);
2035 bool HasSameSize = TypeByteSize > 0;
2036
2037 std::optional<uint64_t> CommonStride =
2038 StrideA == StrideB ? std::make_optional(StrideA) : std::nullopt;
2039 if (isa<SCEVCouldNotCompute>(Dist)) {
2040 // TODO: Relax requirement that there is a common stride to retry with
2041 // non-constant distance dependencies.
2042 FoundNonConstantDistanceDependence |= !!CommonStride;
2043 LLVM_DEBUG(dbgs() << "LAA: Dependence because of uncomputable distance.\n");
2044 return Dependence::Unknown;
2045 }
2046
2047 ScalarEvolution &SE = *PSE.getSE();
2048 auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
2049
2050 // If the distance between the accesses is larger than their absolute stride
2051 // multiplied by the backedge taken count, the accesses are independent, i.e.
2052 // they are far enough apart that accesses won't access the same location
2053 // across all loop iterations.
2054 if (HasSameSize && CommonStride &&
2055 isSafeDependenceDistance(DL, SE, *(PSE.getBackedgeTakenCount()), *Dist,
2056 *CommonStride, TypeByteSize))
2057 return Dependence::NoDep;
2058
2059 const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
2060
2061 // Attempt to prove strided accesses independent.
2062 if (C) {
2063 const APInt &Val = C->getAPInt();
2064 int64_t Distance = Val.getSExtValue();
2065
2066 // If the distance between accesses and their strides are known constants,
2067 // check whether the accesses interlace each other.
2068 if (std::abs(Distance) > 0 && CommonStride && *CommonStride > 1 &&
2069 HasSameSize &&
2070 areStridedAccessesIndependent(std::abs(Distance), *CommonStride,
2071 TypeByteSize)) {
2072 LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
2073 return Dependence::NoDep;
2074 }
2075 }
2076
2077 // Negative distances are not plausible dependencies.
2078 if (SE.isKnownNonPositive(Dist)) {
2079 if (SE.isKnownNonNegative(Dist)) {
2080 if (HasSameSize) {
2081 // Write to the same location with the same size.
2082 return Dependence::Forward;
2083 } else {
2084 LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but "
2085 "different type sizes\n");
2086 return Dependence::Unknown;
2087 }
2088 }
2089
2090 bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
2091 // Check if the first access writes to a location that is read in a later
2092 // iteration, where the distance between them is not a multiple of a vector
2093 // factor and relatively small.
2094 //
2095 // NOTE: There is no need to update MaxSafeVectorWidthInBits after call to
2096 // couldPreventStoreLoadForward, even if it changed MinDepDistBytes, since a
2097 // forward dependency will allow vectorization using any width.
2098
2099 if (IsTrueDataDependence && EnableForwardingConflictDetection) {
2100 if (!C) {
2101 // TODO: FoundNonConstantDistanceDependence is used as a necessary
2102 // condition to consider retrying with runtime checks. Historically, we
2103 // did not set it when strides were different but there is no inherent
2104 // reason to.
2105 FoundNonConstantDistanceDependence |= CommonStride.has_value();
2106 return Dependence::Unknown;
2107 }
2108 if (!HasSameSize ||
2109 couldPreventStoreLoadForward(C->getAPInt().abs().getZExtValue(),
2110 TypeByteSize)) {
2111 LLVM_DEBUG(
2112 dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
2113 return Dependence::ForwardButPreventsForwarding;
2114 }
2115 }
2116
2117 LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n");
2118 return Dependence::Forward;
2119 }
2120
2121 if (!C) {
2122 // TODO: FoundNonConstantDistanceDependence is used as a necessary condition
2123 // to consider retrying with runtime checks. Historically, we did not set it
2124 // when strides were different but there is no inherent reason to.
2125 FoundNonConstantDistanceDependence |= CommonStride.has_value();
2126 LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
2127 return Dependence::Unknown;
2128 }
2129
2130 if (!SE.isKnownPositive(Dist))
2131 return Dependence::Unknown;
2132
2133 if (!HasSameSize) {
2134 LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
2135 "different type sizes\n");
2136 return Dependence::Unknown;
2137 }
2138
2139 // The logic below currently only supports StrideA == StrideB, i.e. there's a
2140 // common stride.
2141 if (!CommonStride)
2142 return Dependence::Unknown;
2143
2144 const APInt &Val = C->getAPInt();
2145 int64_t Distance = Val.getSExtValue();
2146
2147 // Bail out early if passed-in parameters make vectorization not feasible.
2148 unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
2149 VectorizerParams::VectorizationFactor : 1);
2150 unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
2151 VectorizerParams::VectorizationInterleave : 1);
2152 // The minimum number of iterations for a vectorized/unrolled version.
2153 unsigned MinNumIter = std::max(ForcedFactor * ForcedUnroll, 2U);
2154
2155 // It's not vectorizable if the distance is smaller than the minimum distance
2156 // needed for a vectorized/unrolled version. Vectorizing one iteration in
2157 // front needs TypeByteSize * Stride. Vectorizing the last iteration needs
2158 // TypeByteSize (no need to add the last gap distance).
2159 //
2160 // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
2161 // foo(int *A) {
2162 // int *B = (int *)((char *)A + 14);
2163 // for (i = 0 ; i < 1024 ; i += 2)
2164 // B[i] = A[i] + 1;
2165 // }
2166 //
2167 // Two accesses in memory (stride is 2):
2168 // | A[0] | | A[2] | | A[4] | | A[6] | |
2169 // | B[0] | | B[2] | | B[4] |
2170 //
2171 // Distance needed for vectorizing iterations except the last iteration:
2172 // 4 * 2 * (MinNumIter - 1). Distance needed for the last iteration: 4.
2173 // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
2174 //
2175 // If MinNumIter is 2, it is vectorizable as the minimum distance needed is
2176 // 12, which is less than distance.
2177 //
2178 // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
2179 // the minimum distance needed is 28, which is greater than distance. It is
2180 // not safe to do vectorization.
2181 uint64_t MinDistanceNeeded =
2182 TypeByteSize * (*CommonStride) * (MinNumIter - 1) + TypeByteSize;
2183 if (MinDistanceNeeded > static_cast<uint64_t>(Distance)) {
2184 LLVM_DEBUG(dbgs() << "LAA: Failure because of positive distance "
2185 << Distance << '\n');
2186 return Dependence::Backward;
2187 }
2188
2189 // Unsafe if the minimum distance needed is greater than the smallest
2190 // dependence distance.
2191 if (MinDistanceNeeded > MinDepDistBytes) {
2192 LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
2193 << MinDistanceNeeded << " size in bytes\n");
2194 return Dependence::Backward;
2195 }
2196
2197 // Positive distance bigger than max vectorization factor.
2198 // FIXME: Should use max factor instead of max distance in bytes, which could
2199 // not handle different types.
2200 // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
2201 // void foo (int *A, char *B) {
2202 // for (unsigned i = 0; i < 1024; i++) {
2203 // A[i+2] = A[i] + 1;
2204 // B[i+2] = B[i] + 1;
2205 // }
2206 // }
2207 //
2208 // This case is currently unsafe according to the max safe distance. If we
2209 // analyze the two accesses on array B, the max safe dependence distance
2210 // is 2. Then we analyze the accesses on array A, where the minimum distance
2211 // needed is 8, which is greater than 2, so vectorization is forbidden. But
2212 // actually both A and B could be vectorized with a factor of 2.
2213 MinDepDistBytes =
2214 std::min(static_cast<uint64_t>(Distance), MinDepDistBytes);
2215
2216 bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
2217 uint64_t MinDepDistBytesOld = MinDepDistBytes;
2218 if (IsTrueDataDependence && EnableForwardingConflictDetection &&
2219 couldPreventStoreLoadForward(Distance, TypeByteSize)) {
2220 // Sanity check that we didn't update MinDepDistBytes when calling
2221 // couldPreventStoreLoadForward
2222 assert(MinDepDistBytes == MinDepDistBytesOld &&
2223 "An update to MinDepDistBytes requires an update to "
2224 "MaxSafeVectorWidthInBits");
2225 (void)MinDepDistBytesOld;
2226 return Dependence::BackwardVectorizableButPreventsForwarding;
2227 }
2228
2229 // An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
2230 // since there is a backwards dependency.
2231 uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * (*CommonStride));
2232 LLVM_DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue()
2233 << " with max VF = " << MaxVF << '\n');
2234 uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
2235 MaxSafeVectorWidthInBits = std::min(MaxSafeVectorWidthInBits, MaxVFInBits);
2236 return Dependence::BackwardVectorizable;
2237}
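The worked example in the comments of isDependent, reduced to arithmetic: with 4-byte elements, a common stride of 2 and B placed 14 bytes after A, two iterations need 4 * 2 * 1 + 4 = 12 bytes (12 <= 14, safe) while a forced minimum of four iterations needs 28 bytes (28 > 14, unsafe). A sketch of just that formula (illustrative, not an LLVM helper):

#include <cstdint>

static uint64_t minDistanceNeeded(uint64_t TypeByteSize, uint64_t CommonStride,
                                  unsigned MinNumIter) {
  // TypeByteSize * Stride bytes per vectorized iteration, except the last
  // iteration, which only needs TypeByteSize.
  return TypeByteSize * CommonStride * (MinNumIter - 1) + TypeByteSize;
}
// minDistanceNeeded(4, 2, 2) == 12, minDistanceNeeded(4, 2, 4) == 28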
2238
2239bool MemoryDepChecker::areDepsSafe(
2240 DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
2241 const DenseMap<Value *, const SCEV *> &Strides,
2242 const DenseMap<Value *, SmallVector<const Value *, 16>>
2243 &UnderlyingObjects) {
2244
2245 MinDepDistBytes = -1;
2246 SmallPtrSet<MemAccessInfo, 8> Visited;
2247 for (MemAccessInfo CurAccess : CheckDeps) {
2248 if (Visited.count(CurAccess))
2249 continue;
2250
2251 // Get the relevant memory access set.
2252 EquivalenceClasses<MemAccessInfo>::iterator I =
2253 AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
2254
2255 // Check accesses within this set.
2256 EquivalenceClasses<MemAccessInfo>::member_iterator AI =
2257 AccessSets.member_begin(I);
2258 EquivalenceClasses<MemAccessInfo>::member_iterator AE =
2259 AccessSets.member_end();
2260
2261 // Check every access pair.
2262 while (AI != AE) {
2263 Visited.insert(*AI);
2264 bool AIIsWrite = AI->getInt();
2265 // Check loads only against the next equivalence class, but stores also
2266 // against other stores in the same equivalence class - to the same address.
2267 EquivalenceClasses<MemAccessInfo>::member_iterator OI =
2268 (AIIsWrite ? AI : std::next(AI));
2269 while (OI != AE) {
2270 // Check every accessing instruction pair in program order.
2271 for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
2272 I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
2273 // Scan all accesses of another equivalence class, but only the next
2274 // accesses of the same equivalence class.
2275 for (std::vector<unsigned>::iterator
2276 I2 = (OI == AI ? std::next(I1) : Accesses[*OI].begin()),
2277 I2E = (OI == AI ? I1E : Accesses[*OI].end());
2278 I2 != I2E; ++I2) {
2279 auto A = std::make_pair(&*AI, *I1);
2280 auto B = std::make_pair(&*OI, *I2);
2281
2282 assert(*I1 != *I2);
2283 if (*I1 > *I2)
2284 std::swap(A, B);
2285
2286 Dependence::DepType Type =
2287 isDependent(*A.first, A.second, *B.first, B.second, Strides,
2288 UnderlyingObjects);
2289 mergeInStatus(Dependence::isSafeForVectorization(Type));
2290
2291 // Gather dependences unless we accumulated MaxDependences
2292 // dependences. In that case return as soon as we find the first
2293 // unsafe dependence. This puts a limit on this quadratic
2294 // algorithm.
2295 if (RecordDependences) {
2296 if (Type != Dependence::NoDep)
2297 Dependences.push_back(Dependence(A.second, B.second, Type));
2298
2299 if (Dependences.size() >= MaxDependences) {
2300 RecordDependences = false;
2301 Dependences.clear();
2302 LLVM_DEBUG(dbgs()
2303 << "Too many dependences, stopped recording\n");
2304 }
2305 }
2306 if (!RecordDependences && !isSafeForVectorization())
2307 return false;
2308 }
2309 ++OI;
2310 }
2311 AI++;
2312 }
2313 }
2314
2315 LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
2316 return isSafeForVectorization();
2317}
2318
2319SmallVector<Instruction *, 4>
2320MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool isWrite) const {
2321 MemAccessInfo Access(Ptr, isWrite);
2322 auto &IndexVector = Accesses.find(Access)->second;
2323
2324 SmallVector<Instruction *, 4> Insts;
2325 transform(IndexVector,
2326 std::back_inserter(Insts),
2327 [&](unsigned Idx) { return this->InstMap[Idx]; });
2328 return Insts;
2329}
2330
2332 "NoDep",
2333 "Unknown",
2334 "IndirectUnsafe",
2335 "Forward",
2336 "ForwardButPreventsForwarding",
2337 "Backward",
2338 "BackwardVectorizable",
2339 "BackwardVectorizableButPreventsForwarding"};
2340
2341void MemoryDepChecker::Dependence::print(
2342 raw_ostream &OS, unsigned Depth,
2343 const SmallVectorImpl<Instruction *> &Instrs) const {
2344 OS.indent(Depth) << DepName[Type] << ":\n";
2345 OS.indent(Depth + 2) << *Instrs[Source] << " -> \n";
2346 OS.indent(Depth + 2) << *Instrs[Destination] << "\n";
2347}
2348
2349bool LoopAccessInfo::canAnalyzeLoop() {
2350 // We need to have a loop header.
2351 LLVM_DEBUG(dbgs() << "LAA: Found a loop in "
2352 << TheLoop->getHeader()->getParent()->getName() << ": "
2353 << TheLoop->getHeader()->getName() << '\n');
2354
2355 // We can only analyze innermost loops.
2356 if (!TheLoop->isInnermost()) {
2357 LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
2358 recordAnalysis("NotInnerMostLoop") << "loop is not the innermost loop";
2359 return false;
2360 }
2361
2362 // We must have a single backedge.
2363 if (TheLoop->getNumBackEdges() != 1) {
2364 LLVM_DEBUG(
2365 dbgs() << "LAA: loop control flow is not understood by analyzer\n");
2366 recordAnalysis("CFGNotUnderstood")
2367 << "loop control flow is not understood by analyzer";
2368 return false;
2369 }
2370
2371 // ScalarEvolution needs to be able to find the exit count.
2372 const SCEV *ExitCount = PSE->getBackedgeTakenCount();
2373 if (isa<SCEVCouldNotCompute>(ExitCount)) {
2374 recordAnalysis("CantComputeNumberOfIterations")
2375 << "could not determine number of loop iterations";
2376 LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
2377 return false;
2378 }
2379
2380 return true;
2381}
2382
2383void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
2384 const TargetLibraryInfo *TLI,
2385 DominatorTree *DT) {
2386 // Holds the Load and Store instructions.
2387 SmallVector<LoadInst *, 16> Loads;
2388 SmallVector<StoreInst *, 16> Stores;
2389 SmallPtrSet<MDNode *, 8> LoopAliasScopes;
2390
2391 // Holds all the different accesses in the loop.
2392 unsigned NumReads = 0;
2393 unsigned NumReadWrites = 0;
2394
2395 bool HasComplexMemInst = false;
2396
2397 // A runtime check is only legal to insert if there are no convergent calls.
2398 HasConvergentOp = false;
2399
2400 PtrRtChecking->Pointers.clear();
2401 PtrRtChecking->Need = false;
2402
2403 const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
2404
2405 const bool EnableMemAccessVersioningOfLoop =
2406 EnableMemAccessVersioning &&
2407 !TheLoop->getHeader()->getParent()->hasOptSize();
2408
2409 // Traverse blocks in fixed RPOT order, regardless of their storage in the
2410 // loop info, as it may be arbitrary.
2411 LoopBlocksRPO RPOT(TheLoop);
2412 RPOT.perform(LI);
2413 for (BasicBlock *BB : RPOT) {
2414 // Scan the BB and collect legal loads and stores. Also detect any
2415 // convergent instructions.
2416 for (Instruction &I : *BB) {
2417 if (auto *Call = dyn_cast<CallBase>(&I)) {
2418 if (Call->isConvergent())
2419 HasConvergentOp = true;
2420 }
2421
2422 // With both a non-vectorizable memory instruction and a convergent
2423 // operation found in this loop, there is no reason to continue the search.
2424 if (HasComplexMemInst && HasConvergentOp) {
2425 CanVecMem = false;
2426 return;
2427 }
2428
2429 // Avoid hitting recordAnalysis multiple times.
2430 if (HasComplexMemInst)
2431 continue;
2432
2433 // Record alias scopes defined inside the loop.
2434 if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
2435 for (Metadata *Op : Decl->getScopeList()->operands())
2436 LoopAliasScopes.insert(cast<MDNode>(Op));
2437
2438 // Many math library functions read the rounding mode. We will only
2439 // vectorize a loop if it contains known function calls that don't set
2440 // the flag. Therefore, it is safe to ignore this read from memory.
2441 auto *Call = dyn_cast<CallInst>(&I);
2442 if (Call && getVectorIntrinsicIDForCall(Call, TLI))
2443 continue;
2444
2445 // If this is a load, save it. If this instruction can read from memory
2446 // but is not a load, then we quit. Notice that we don't handle function
2447 // calls that read or write.
2448 if (I.mayReadFromMemory()) {
2449 // If the function has an explicit vectorized counterpart, we can safely
2450 // assume that it can be vectorized.
2451 if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
2452 !VFDatabase::getMappings(*Call).empty())
2453 continue;
2454
2455 auto *Ld = dyn_cast<LoadInst>(&I);
2456 if (!Ld) {
2457 recordAnalysis("CantVectorizeInstruction", Ld)
2458 << "instruction cannot be vectorized";
2459 HasComplexMemInst = true;
2460 continue;
2461 }
2462 if (!Ld->isSimple() && !IsAnnotatedParallel) {
2463 recordAnalysis("NonSimpleLoad", Ld)
2464 << "read with atomic ordering or volatile read";
2465 LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
2466 HasComplexMemInst = true;
2467 continue;
2468 }
2469 NumLoads++;
2470 Loads.push_back(Ld);
2471 DepChecker->addAccess(Ld);
2472 if (EnableMemAccessVersioningOfLoop)
2473 collectStridedAccess(Ld);
2474 continue;
2475 }
2476
2477 // Save 'store' instructions. Abort if other instructions write to memory.
2478 if (I.mayWriteToMemory()) {
2479 auto *St = dyn_cast<StoreInst>(&I);
2480 if (!St) {
2481 recordAnalysis("CantVectorizeInstruction", St)
2482 << "instruction cannot be vectorized";
2483 HasComplexMemInst = true;
2484 continue;
2485 }
2486 if (!St->isSimple() && !IsAnnotatedParallel) {
2487 recordAnalysis("NonSimpleStore", St)
2488 << "write with atomic ordering or volatile write";
2489 LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
2490 HasComplexMemInst = true;
2491 continue;
2492 }
2493 NumStores++;
2494 Stores.push_back(St);
2495 DepChecker->addAccess(St);
2496 if (EnableMemAccessVersioningOfLoop)
2497 collectStridedAccess(St);
2498 }
2499 } // Next instr.
2500 } // Next block.
2501
2502 if (HasComplexMemInst) {
2503 CanVecMem = false;
2504 return;
2505 }
2506
2507 // Now we have two lists that hold the loads and the stores.
2508 // Next, we find the pointers that they use.
2509
2510 // Check if we see any stores. If there are no stores, then we don't
2511 // care if the pointers are *restrict*.
2512 if (!Stores.size()) {
2513 LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
2514 CanVecMem = true;
2515 return;
2516 }
2517
2518 MemoryDepChecker::DepCandidates DependentAccesses;
2519 AccessAnalysis Accesses(TheLoop, AA, LI, DependentAccesses, *PSE,
2520 LoopAliasScopes);
2521
2522 // Holds the analyzed pointers. We don't want to call getUnderlyingObjects
2523 // multiple times on the same object. If the ptr is accessed twice, once
2524 // for read and once for write, it will only appear once (on the write
2525 // list). This is okay, since we are going to check for conflicts between
2526 // writes and between reads and writes, but not between reads and reads.
2527 SmallSet<std::pair<Value *, Type *>, 16> Seen;
2528
2529 // Record uniform store addresses to identify if we have multiple stores
2530 // to the same address.
2531 SmallPtrSet<Value *, 16> UniformStores;
2532
2533 for (StoreInst *ST : Stores) {
2534 Value *Ptr = ST->getPointerOperand();
2535
2536 if (isInvariant(Ptr)) {
2537 // Record store instructions to loop invariant addresses
2538 StoresToInvariantAddresses.push_back(ST);
2539 HasDependenceInvolvingLoopInvariantAddress |=
2540 !UniformStores.insert(Ptr).second;
2541 }
2542
2543 // If we did *not* see this pointer before, insert it to the read-write
2544 // list. At this phase it is only a 'write' list.
2545 Type *AccessTy = getLoadStoreType(ST);
2546 if (Seen.insert({Ptr, AccessTy}).second) {
2547 ++NumReadWrites;
2548
2549 MemoryLocation Loc = MemoryLocation::get(ST);
2550 // The TBAA metadata could have a control dependency on the predication
2551 // condition, so we cannot rely on it when determining whether or not we
2552 // need runtime pointer checks.
2553 if (blockNeedsPredication(ST->getParent(), TheLoop, DT))
2554 Loc.AATags.TBAA = nullptr;
2555
2556 visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
2557 [&Accesses, AccessTy, Loc](Value *Ptr) {
2558 MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
2559 Accesses.addStore(NewLoc, AccessTy);
2560 });
2561 }
2562 }
2563
2564 if (IsAnnotatedParallel) {
2565 LLVM_DEBUG(
2566 dbgs() << "LAA: A loop annotated parallel, ignore memory dependency "
2567 << "checks.\n");
2568 CanVecMem = true;
2569 return;
2570 }
2571
2572 for (LoadInst *LD : Loads) {
2573 Value *Ptr = LD->getPointerOperand();
2574 // If we did *not* see this pointer before, insert it to the
2575 // read list. If we *did* see it before, then it is already in
2576 // the read-write list. This allows us to vectorize expressions
2577 // such as A[i] += x; Because the address of A[i] is a read-write
2578 // pointer. This only works if the index of A[i] is consecutive.
2579 // If the address of i is unknown (for example A[B[i]]) then we may
2580 // read a few words, modify, and write a few words, and some of the
2581 // words may be written to the same address.
2582 bool IsReadOnlyPtr = false;
2583 Type *AccessTy = getLoadStoreType(LD);
2584 if (Seen.insert({Ptr, AccessTy}).second ||
2585 !getPtrStride(*PSE, LD->getType(), Ptr, TheLoop, SymbolicStrides).value_or(0)) {
2586 ++NumReads;
2587 IsReadOnlyPtr = true;
2588 }
2589
2590 // See if there is an unsafe dependency between a load from a uniform address
2591 // and a store to the same uniform address.
2592 if (UniformStores.count(Ptr)) {
2593 LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
2594 "load and uniform store to the same address!\n");
2595 HasDependenceInvolvingLoopInvariantAddress = true;
2596 }
2597
2598 MemoryLocation Loc = MemoryLocation::get(LD);
2599 // The TBAA metadata could have a control dependency on the predication
2600 // condition, so we cannot rely on it when determining whether or not we
2601 // need runtime pointer checks.
2602 if (blockNeedsPredication(LD->getParent(), TheLoop, DT))
2603 Loc.AATags.TBAA = nullptr;
2604
2605 visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
2606 [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) {
2607 MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
2608 Accesses.addLoad(NewLoc, AccessTy, IsReadOnlyPtr);
2609 });
2610 }
2611
2612 // If we write (or read-write) to a single destination and there are no
2613 // other reads in this loop then it is safe to vectorize.
2614 if (NumReadWrites == 1 && NumReads == 0) {
2615 LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
2616 CanVecMem = true;
2617 return;
2618 }
2619
2620 // Build dependence sets and check whether we need a runtime pointer bounds
2621 // check.
2622 Accesses.buildDependenceSets();
2623
2624 // Find pointers with computable bounds. We are going to use this information
2625 // to place a runtime bound check.
2626 Value *UncomputablePtr = nullptr;
2627 bool CanDoRTIfNeeded =
2628 Accesses.canCheckPtrAtRT(*PtrRtChecking, PSE->getSE(), TheLoop,
2629 SymbolicStrides, UncomputablePtr, false);
2630 if (!CanDoRTIfNeeded) {
2631 auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
2632 recordAnalysis("CantIdentifyArrayBounds", I)
2633 << "cannot identify array bounds";
2634 LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
2635 << "the array bounds.\n");
2636 CanVecMem = false;
2637 return;
2638 }
2639
2640 LLVM_DEBUG(
2641 dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");
2642
2643 CanVecMem = true;
2644 if (Accesses.isDependencyCheckNeeded()) {
2645 LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
2646 CanVecMem = DepChecker->areDepsSafe(
2647 DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides,
2648 Accesses.getUnderlyingObjects());
2649
2650 if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) {
2651 LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
2652
2653 // Clear the dependency checks. We assume they are not needed.
2654 Accesses.resetDepChecks(*DepChecker);
2655
2656 PtrRtChecking->reset();
2657 PtrRtChecking->Need = true;
2658
2659 auto *SE = PSE->getSE();
2660 UncomputablePtr = nullptr;
2661 CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(
2662 *PtrRtChecking, SE, TheLoop, SymbolicStrides, UncomputablePtr, true);
2663
2664 // Check that we found the bounds for the pointer.
2665 if (!CanDoRTIfNeeded) {
2666 auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
2667 recordAnalysis("CantCheckMemDepsAtRunTime", I)
2668 << "cannot check memory dependencies at runtime";
2669 LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
2670 CanVecMem = false;
2671 return;
2672 }
2673
2674 CanVecMem = true;
2675 }
2676 }
2677
2678 if (HasConvergentOp) {
2679 recordAnalysis("CantInsertRuntimeCheckWithConvergent")
2680 << "cannot add control dependency to convergent operation";
2681 LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
2682 "would be needed with a convergent operation\n");
2683 CanVecMem = false;
2684 return;
2685 }
2686
2687 if (CanVecMem)
2688 LLVM_DEBUG(
2689 dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
2690 << (PtrRtChecking->Need ? "" : " don't")
2691 << " need runtime memory checks.\n");
2692 else
2693 emitUnsafeDependenceRemark();
2694}
2695
2696void LoopAccessInfo::emitUnsafeDependenceRemark() {
2697 auto Deps = getDepChecker().getDependences();
2698 if (!Deps)
2699 return;
2700 auto Found = llvm::find_if(*Deps, [](const MemoryDepChecker::Dependence &D) {
2701 return MemoryDepChecker::Dependence::isSafeForVectorization(D.Type) !=
2702 MemoryDepChecker::VectorizationSafetyStatus::Safe;
2703 });
2704 if (Found == Deps->end())
2705 return;
2706 MemoryDepChecker::Dependence Dep = *Found;
2707
2708 LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");
2709
2710 // Emit remark for first unsafe dependence
2711 bool HasForcedDistribution = false;
2712 std::optional<const MDOperand *> Value =
2713 findStringMetadataForLoop(TheLoop, "llvm.loop.distribute.enable");
2714 if (Value) {
2715 const MDOperand *Op = *Value;
2716 assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
2717 HasForcedDistribution = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
2718 }
2719
2720 const std::string Info =
2721 HasForcedDistribution
2722 ? "unsafe dependent memory operations in loop."
2723 : "unsafe dependent memory operations in loop. Use "
2724 "#pragma clang loop distribute(enable) to allow loop distribution "
2725 "to attempt to isolate the offending operations into a separate "
2726 "loop";
2728 recordAnalysis("UnsafeDep", Dep.getDestination(*this)) << Info;
2729
2730 switch (Dep.Type) {
2731 case MemoryDepChecker::Dependence::NoDep:
2732 case MemoryDepChecker::Dependence::Forward:
2733 case MemoryDepChecker::Dependence::BackwardVectorizable:
2734 llvm_unreachable("Unexpected dependence");
2735 case MemoryDepChecker::Dependence::Backward:
2736 R << "\nBackward loop carried data dependence.";
2737 break;
2738 case MemoryDepChecker::Dependence::ForwardButPreventsForwarding:
2739 R << "\nForward loop carried data dependence that prevents "
2740 "store-to-load forwarding.";
2741 break;
2742 case MemoryDepChecker::Dependence::BackwardVectorizableButPreventsForwarding:
2743 R << "\nBackward loop carried data dependence that prevents "
2744 "store-to-load forwarding.";
2745 break;
2746 case MemoryDepChecker::Dependence::IndirectUnsafe:
2747 R << "\nUnsafe indirect dependence.";
2748 break;
2749 case MemoryDepChecker::Dependence::Unknown:
2750 R << "\nUnknown data dependence.";
2751 break;
2752 }
2753
2754 if (Instruction *I = Dep.getSource(*this)) {
2755 DebugLoc SourceLoc = I->getDebugLoc();
2756 if (auto *DD = dyn_cast_or_null<Instruction>(getPointerOperand(I)))
2757 SourceLoc = DD->getDebugLoc();
2758 if (SourceLoc)
2759 R << " Memory location is the same as accessed at "
2760 << ore::NV("Location", SourceLoc);
2761 }
2762}
2763
2764bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop,
2765 DominatorTree *DT) {
2766 assert(TheLoop->contains(BB) && "Unknown block used");
2767
2768 // Blocks that do not dominate the latch need predication.
2769 BasicBlock* Latch = TheLoop->getLoopLatch();
2770 return !DT->dominates(BB, Latch);
2771}
2772
2773OptimizationRemarkAnalysis &LoopAccessInfo::recordAnalysis(StringRef RemarkName,
2774 Instruction *I) {
2775 assert(!Report && "Multiple reports generated");
2776
2777 Value *CodeRegion = TheLoop->getHeader();
2778 DebugLoc DL = TheLoop->getStartLoc();
2779
2780 if (I) {
2781 CodeRegion = I->getParent();
2782 // If there is no debug location attached to the instruction, revert back to
2783 // using the loop's.
2784 if (I->getDebugLoc())
2785 DL = I->getDebugLoc();
2786 }
2787
2788 Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, RemarkName, DL,
2789 CodeRegion);
2790 return *Report;
2791}
2792
2793bool LoopAccessInfo::isInvariant(Value *V) const {
2794 auto *SE = PSE->getSE();
2795 // TODO: Is this really what we want? Even without FP SCEV, we may want some
2796 // trivially loop-invariant FP values to be considered invariant.
2797 if (!SE->isSCEVable(V->getType()))
2798 return false;
2799 const SCEV *S = SE->getSCEV(V);
2800 return SE->isLoopInvariant(S, TheLoop);
2801}
2802
2803/// Find the operand of the GEP that should be checked for consecutive
2804/// stores. This ignores trailing indices that have no effect on the final
2805/// pointer.
2806static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) {
2807 const DataLayout &DL = Gep->getModule()->getDataLayout();
2808 unsigned LastOperand = Gep->getNumOperands() - 1;
2809 TypeSize GEPAllocSize = DL.getTypeAllocSize(Gep->getResultElementType());
2810
2811 // Walk backwards and try to peel off zeros.
2812 while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
2813 // Find the type we're currently indexing into.
2814 gep_type_iterator GEPTI = gep_type_begin(Gep);
2815 std::advance(GEPTI, LastOperand - 2);
2816
2817 // If it's a type with the same allocation size as the result of the GEP we
2818 // can peel off the zero index.
2819 TypeSize ElemSize = GEPTI.isStruct()
2820 ? DL.getTypeAllocSize(GEPTI.getIndexedType())
2821 : GEPTI.getSequentialElementStride(DL);
2822 if (ElemSize != GEPAllocSize)
2823 break;
2824 --LastOperand;
2825 }
2826
2827 return LastOperand;
2828}
2829
2830/// If the argument is a GEP, then returns the operand identified by
2831/// getGEPInductionOperand. However, if there is some other non-loop-invariant
2832/// operand, it returns that instead.
2833static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
2834 GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
2835 if (!GEP)
2836 return Ptr;
2837
2838 unsigned InductionOperand = getGEPInductionOperand(GEP);
2839
2840 // Check that all of the gep indices are uniform except for our induction
2841 // operand.
2842 for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
2843 if (i != InductionOperand &&
2844 !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
2845 return Ptr;
2846 return GEP->getOperand(InductionOperand);
2847}
2848
2849/// If a value has only one user that is a CastInst, return it.
2850static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
2851 Value *UniqueCast = nullptr;
2852 for (User *U : Ptr->users()) {
2853 CastInst *CI = dyn_cast<CastInst>(U);
2854 if (CI && CI->getType() == Ty) {
2855 if (!UniqueCast)
2856 UniqueCast = CI;
2857 else
2858 return nullptr;
2859 }
2860 }
2861 return UniqueCast;
2862}
2863
2864/// Get the stride of a pointer access in a loop. Looks for symbolic
2865/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
2866static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
2867 auto *PtrTy = dyn_cast<PointerType>(Ptr->getType());
2868 if (!PtrTy || PtrTy->isAggregateType())
2869 return nullptr;
2870
2871 // Try to remove a gep instruction to make the pointer (actually the index at
2872 // this point) easier to analyze. If OrigPtr is equal to Ptr we are analyzing
2873 // the pointer, otherwise we are analyzing the index.
2874 Value *OrigPtr = Ptr;
2875
2876 // The size of the pointer access.
2877 int64_t PtrAccessSize = 1;
2878
2879 Ptr = stripGetElementPtr(Ptr, SE, Lp);
2880 const SCEV *V = SE->getSCEV(Ptr);
2881
2882 if (Ptr != OrigPtr)
2883 // Strip off casts.
2884 while (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(V))
2885 V = C->getOperand();
2886
2887 const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);
2888 if (!S)
2889 return nullptr;
2890
2891 // If the pointer is invariant then there is no stride and it makes no
2892 // sense to add it here.
2893 if (Lp != S->getLoop())
2894 return nullptr;
2895
2896 V = S->getStepRecurrence(*SE);
2897 if (!V)
2898 return nullptr;
2899
2900 // Strip off the size of access multiplication if we are still analyzing the
2901 // pointer.
2902 if (OrigPtr == Ptr) {
2903 if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
2904 if (M->getOperand(0)->getSCEVType() != scConstant)
2905 return nullptr;
2906
2907 const APInt &APStepVal = cast<SCEVConstant>(M->getOperand(0))->getAPInt();
2908
2909 // Huge step value - give up.
2910 if (APStepVal.getBitWidth() > 64)
2911 return nullptr;
2912
2913 int64_t StepVal = APStepVal.getSExtValue();
2914 if (PtrAccessSize != StepVal)
2915 return nullptr;
2916 V = M->getOperand(1);
2917 }
2918 }
2919
2920 // Note that the restrictions after this loop-invariant check are only
2921 // profitability restrictions.
2922 if (!SE->isLoopInvariant(V, Lp))
2923 return nullptr;
2924
2925 // Look for the loop invariant symbolic value.
2926 const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);
2927 if (!U) {
2928 const auto *C = dyn_cast<SCEVIntegralCastExpr>(V);
2929 if (!C)
2930 return nullptr;
2931 U = dyn_cast<SCEVUnknown>(C->getOperand());
2932 if (!U)
2933 return nullptr;
2934
2935 // Match legacy behavior - this is not needed for correctness
2936 if (!getUniqueCastUse(U->getValue(), Lp, V->getType()))
2937 return nullptr;
2938 }
2939
2940 return V;
2941}
2942
2943void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
2944 Value *Ptr = getLoadStorePointerOperand(MemAccess);
2945 if (!Ptr)
2946 return;
2947
2948 // Note: getStrideFromPointer is a *profitability* heuristic. We
2949 // could broaden the scope of values returned here - to anything
2950 // which happens to be loop invariant and contributes to the
2951 // computation of an interesting IV - but we chose not to as we
2952 // don't have a cost model here, and broadening the scope exposes
2953 // far too many unprofitable cases.
2954 const SCEV *StrideExpr = getStrideFromPointer(Ptr, PSE->getSE(), TheLoop);
2955 if (!StrideExpr)
2956 return;
2957
2958 LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
2959 "versioning:");
2960 LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n");
2961
2962 if (!SpeculateUnitStride) {
2963 LLVM_DEBUG(dbgs() << " Chose not to due to -laa-speculate-unit-stride\n");
2964 return;
2965 }
2966
2967 // Avoid adding the "Stride == 1" predicate when we know that
2968 // Stride >= Trip-Count. Such a predicate will effectively optimize a single
2969 // or zero iteration loop, as Trip-Count <= Stride == 1.
2970 //
2971 // TODO: We are currently not making a very informed decision on when it is
2972 // beneficial to apply stride versioning. It might make more sense that the
2973 // users of this analysis (such as the vectorizer) will trigger it, based on
2974 // their specific cost considerations; For example, in cases where stride
2975 // versioning does not help resolving memory accesses/dependences, the
2976 // vectorizer should evaluate the cost of the runtime test, and the benefit
2977 // of various possible stride specializations, considering the alternatives
2978 // of using gather/scatters (if available).
2979
2980 const SCEV *BETakenCount = PSE->getBackedgeTakenCount();
2981
2982 // Match the types so we can compare the stride and the BETakenCount.
2983 // The Stride can be positive/negative, so we sign extend Stride;
2984 // The backedgeTakenCount is non-negative, so we zero extend BETakenCount.
2985 const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
2986 uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(StrideExpr->getType());
2987 uint64_t BETypeSizeBits = DL.getTypeSizeInBits(BETakenCount->getType());
2988 const SCEV *CastedStride = StrideExpr;
2989 const SCEV *CastedBECount = BETakenCount;
2990 ScalarEvolution *SE = PSE->getSE();
2991 if (BETypeSizeBits >= StrideTypeSizeBits)
2992 CastedStride = SE->getNoopOrSignExtend(StrideExpr, BETakenCount->getType());
2993 else
2994 CastedBECount = SE->getZeroExtendExpr(BETakenCount, StrideExpr->getType());
2995 const SCEV *StrideMinusBETaken = SE->getMinusSCEV(CastedStride, CastedBECount);
2996 // Since TripCount == BackEdgeTakenCount + 1, checking:
2997 // "Stride >= TripCount" is equivalent to checking:
2998 // Stride - BETakenCount > 0
2999 if (SE->isKnownPositive(StrideMinusBETaken)) {
3000 LLVM_DEBUG(
3001 dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
3002 "Stride==1 predicate will imply that the loop executes "
3003 "at most once.\n");
3004 return;
3005 }
3006 LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n");
3007
3008 // Strip back off the integer cast, and check that our result is a
3009 // SCEVUnknown as we expect.
3010 const SCEV *StrideBase = StrideExpr;
3011 if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(StrideBase))
3012 StrideBase = C->getOperand();
3013 SymbolicStrides[Ptr] = cast<SCEVUnknown>(StrideBase);
3014}
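The final guard in collectStridedAccess compares the symbolic stride with the backedge-taken count: since TripCount == BackedgeTakenCount + 1, proving Stride - BackedgeTakenCount > 0 is the same as proving Stride >= TripCount, in which case a "Stride == 1" predicate could only ever admit a zero- or one-iteration loop. With constants in place of the SCEV expressions, the check is just (sketch, not LLVM code):

#include <cstdint>

static bool strideVersioningIsPointless(int64_t Stride,
                                        int64_t BackedgeTakenCount) {
  // Equivalent to Stride >= TripCount because TripCount = BackedgeTakenCount + 1.
  return Stride - BackedgeTakenCount > 0;
}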
3015
3016LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
3017 const TargetLibraryInfo *TLI, AAResults *AA,
3018 DominatorTree *DT, LoopInfo *LI)
3019 : PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
3020 PtrRtChecking(nullptr),
3021 DepChecker(std::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L) {
3022 PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
3023 if (canAnalyzeLoop()) {
3024 analyzeLoop(AA, LI, TLI, DT);
3025 }
3026}
3027
3028void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
3029 if (CanVecMem) {
3030 OS.indent(Depth) << "Memory dependences are safe";
3031 const MemoryDepChecker &DC = getDepChecker();
3032 if (!DC.isSafeForAnyVectorWidth())
3033 OS << " with a maximum safe vector width of "
3034 << DC.getMaxSafeVectorWidthInBits() << " bits";
3035 if (PtrRtChecking->Need)
3036 OS << " with run-time checks";
3037 OS << "\n";
3038 }
3039
3040 if (HasConvergentOp)
3041 OS.indent(Depth) << "Has convergent operation in loop\n";
3042
3043 if (Report)
3044 OS.indent(Depth) << "Report: " << Report->getMsg() << "\n";
3045
3046 if (auto *Dependences = DepChecker->getDependences()) {
3047 OS.indent(Depth) << "Dependences:\n";
3048 for (const auto &Dep : *Dependences) {
3049 Dep.print(OS, Depth + 2, DepChecker->getMemoryInstructions());
3050 OS << "\n";
3051 }
3052 } else
3053 OS.indent(Depth) << "Too many dependences, not recorded\n";
3054
3055 // List the pairs of accesses that need run-time checks to prove independence.
3056 PtrRtChecking->print(OS, Depth);
3057 OS << "\n";
3058
3059 OS.indent(Depth) << "Non vectorizable stores to invariant address were "
3060 << (HasDependenceInvolvingLoopInvariantAddress ? "" : "not ")
3061 << "found in loop.\n";
3062
3063 OS.indent(Depth) << "SCEV assumptions:\n";
3064 PSE->getPredicate().print(OS, Depth);
3065
3066 OS << "\n";
3067
3068 OS.indent(Depth) << "Expressions re-written:\n";
3069 PSE->print(OS, Depth);
3070}
3071
3072const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) {
3073 auto I = LoopAccessInfoMap.insert({&L, nullptr});
3074
3075 if (I.second)
3076 I.first->second =
3077 std::make_unique<LoopAccessInfo>(&L, &SE, TLI, &AA, &DT, &LI);
3078
3079 return *I.first->second;
3080}
3081
3082bool LoopAccessInfoManager::invalidate(
3083 Function &F, const PreservedAnalyses &PA,
3084 FunctionAnalysisManager::Invalidator &Inv) {
3085 // Check whether our analysis is preserved.
3086 auto PAC = PA.getChecker<LoopAccessAnalysis>();
3087 if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Function>>())
3088 // If not, give up now.
3089 return true;
3090
3091 // Check whether the analyses we depend on became invalid for any reason.
3092 // Skip checking TargetLibraryAnalysis as it is immutable and can't become
3093 // invalid.
3094 return Inv.invalidate<AAManager>(F, PA) ||
3095 Inv.invalidate<ScalarEvolutionAnalysis>(F, PA) ||
3096 Inv.invalidate<LoopAnalysis>(F, PA) ||
3097 Inv.invalidate<DominatorTreeAnalysis>(F, PA);
3098}
3099
3100LoopAccessInfoManager LoopAccessAnalysis::run(Function &F,
3101 FunctionAnalysisManager &FAM) {
3102 auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(F);
3103 auto &AA = FAM.getResult<AAManager>(F);
3104 auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
3105 auto &LI = FAM.getResult<LoopAnalysis>(F);
3106 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
3107 return LoopAccessInfoManager(SE, AA, DT, LI, &TLI);
3108}
3109
3110AnalysisKey LoopAccessAnalysis::Key;
This file defines the PointerIntPair class.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
static LLVM_ATTRIBUTE_ALWAYS_INLINE bool CheckType(MVT::SimpleValueType VT, SDValue N, const TargetLowering *TLI, const DataLayout &DL)
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
static const X86InstrFMA3Group Groups[]
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:76
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
This templated class represents "all analyses that operate over <a particular IR unit>" (e....
Definition: Analysis.h:47
API to communicate dependencies between analyses during invalidation.
Definition: PassManager.h:360
bool invalidate(IRUnitT &IR, const PreservedAnalyses &PA)
Trigger the invalidation of some other analysis pass if not already handled and return whether it was...
Definition: PassManager.h:378
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:321
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:473
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:204
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:289
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:601
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
A debug info location.
Definition: DebugLoc.h:33
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
iterator end()
Definition: DenseMap.h:84
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
EquivalenceClasses - This represents a collection of equivalence classes and supports three efficient...
iterator findValue(const ElemTy &V) const
findValue - Return an iterator to the specified value.
iterator insert(const ElemTy &Data)
insert - Insert a new value into the union/find set, ignoring the request if the value already exists...
member_iterator member_end() const
typename std::set< ECValue, ECValueComparator >::const_iterator iterator
iterator* - Provides a way to iterate over all values in the set.
member_iterator member_begin(iterator I) const
member_iterator unionSets(const ElemTy &V1, const ElemTy &V2)
union - Merge the two equivalence sets for the specified values, inserting them if they do not alread...
const ElemTy & getLeaderValue(const ElemTy &V) const
getLeaderValue - Return the leader for the specified value that is in the set.
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:685
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:973
Type * getResultElementType() const
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:294
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:83
Class to represent integer types.
Definition: DerivedTypes.h:40
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
An instruction for reading from memory.
Definition: Instructions.h:184
Value * getPointerOperand()
Definition: Instructions.h:280
static constexpr LocationSize beforeOrAfterPointer()
Any location before or after the base pointer (but still within the underlying object).
This analysis provides dependence information for the memory accesses of a loop.
Result run(Function &F, FunctionAnalysisManager &AM)
bool invalidate(Function &F, const PreservedAnalyses &PA, FunctionAnalysisManager::Invalidator &Inv)
const LoopAccessInfo & getInfo(Loop &L)
Drive the analysis of memory accesses in the loop.
const MemoryDepChecker & getDepChecker() const
the Memory Dependence Checker which can determine the loop-independent and loop-carried dependences b...
bool isInvariant(Value *V) const
Returns true if value V is loop invariant.
void print(raw_ostream &OS, unsigned Depth=0) const
Print the information about the memory accesses in the loop.
LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetLibraryInfo *TLI, AAResults *AA, DominatorTree *DT, LoopInfo *LI)
static bool blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, DominatorTree *DT)
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
BlockT * getHeader() const
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:172
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
bool isAnnotatedParallel() const
Returns true if the loop is annotated parallel.
Definition: LoopInfo.cpp:564
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:631
Metadata node.
Definition: Metadata.h:1067
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1426
Tracking metadata reference owned by Metadata.
Definition: Metadata.h:889
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
Checks memory dependences among accesses to the same underlying object to determine whether there vec...
ArrayRef< unsigned > getOrderForAccess(Value *Ptr, bool IsWrite) const
Return the program order indices for the access location (Ptr, IsWrite).
bool isSafeForAnyVectorWidth() const
Return true if the number of elements that are safe to operate on simultaneously is not bounded.
bool areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps, const DenseMap< Value *, const SCEV * > &Strides, const DenseMap< Value *, SmallVector< const Value *, 16 > > &UnderlyingObjects)
Check whether the dependencies between the accesses are safe.
const SmallVectorImpl< Instruction * > & getMemoryInstructions() const
The vector of memory access instructions.
const Loop * getInnermostLoop() const
uint64_t getMaxSafeVectorWidthInBits() const
Return the number of elements that are safe to operate on simultaneously, multiplied by the size of t...
bool isSafeForVectorization() const
No memory dependence was encountered that would inhibit vectorization.
const SmallVectorImpl< Dependence > * getDependences() const
Returns the memory dependences.
SmallVector< Instruction *, 4 > getInstructionsForAccess(Value *Ptr, bool isWrite) const
Find the set of instructions that read or write via Ptr.
VectorizationSafetyStatus
Type to keep track of the status of the dependence check.
bool shouldRetryWithRuntimeCheck() const
In same cases when the dependency check fails we can still vectorize the loop with a dynamic array ac...
void addAccess(StoreInst *SI)
Register the location (instructions are given increasing numbers) of a write access.
PointerIntPair< Value *, 1, bool > MemAccessInfo
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
LocationSize Size
The maximum size of the location, in address-units, or UnknownSize if the size is not known.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
const Value * Ptr
The address of the start of the location.
Root of the metadata hierarchy.
Definition: Metadata.h:62
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
Diagnostic information for optimization analysis remarks.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
void addPredicate(const SCEVPredicate &Pred)
Adds a new predicate.
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
bool hasNoOverflow(Value *V, SCEVWrapPredicate::IncrementWrapFlags Flags)
Returns true if we've proved that V doesn't wrap by means of a SCEV predicate.
void setNoOverflow(Value *V, SCEVWrapPredicate::IncrementWrapFlags Flags)
Proves that V doesn't overflow by adding SCEV predicate.
void print(raw_ostream &OS, unsigned Depth) const
Print the SCEV mappings done by the Predicated Scalar Evolution.
const SCEVAddRecExpr * getAsAddRec(Value *V)
Attempts to produce an AddRecExpr for V by adding additional SCEV predicates.
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
PreservedAnalysisChecker getChecker() const
Build a checker for this PreservedAnalyses and the specified analysis type.
Definition: Analysis.h:264
Holds information about the memory runtime legality checks to verify that a group of pointers do not ...
bool Need
This flag indicates if we need to add the runtime check.
void reset()
Reset the state of the pointer runtime information.
unsigned getNumberOfChecks() const
Returns the number of run-time checks required according to needsChecking.
void printChecks(raw_ostream &OS, const SmallVectorImpl< RuntimePointerCheck > &Checks, unsigned Depth=0) const
Print Checks.
bool needsChecking(const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const
Decide if we need to add a check between two groups of pointers, according to needsChecking.
void print(raw_ostream &OS, unsigned Depth=0) const
Print the list run-time memory checks necessary.
SmallVector< RuntimeCheckingPtrGroup, 2 > CheckingGroups
Holds a partitioning of pointers into "check groups".
void generateChecks(MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies)
Generate the checks and store it.
static bool arePointersInSamePartition(const SmallVectorImpl< int > &PtrToPartition, unsigned PtrIdx1, unsigned PtrIdx2)
Check if pointers are in the same partition.
SmallVector< PointerInfo, 2 > Pointers
Information about the pointers that may require checking.
void insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr, Type *AccessTy, bool WritePtr, unsigned DepSetId, unsigned ASId, PredicatedScalarEvolution &PSE, bool NeedsFreeze)
Insert a pointer and calculate the start and end SCEVs.
This node represents a polynomial recurrence on the trip count of the specified loop.
const SCEV * evaluateAtIteration(const SCEV *It, ScalarEvolution &SE) const
Return the value of this chain of recurrences at the specified iteration number.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents a constant integer value.
This is the base class for unary integral cast operator classes.
This node represents multiplication of some number of SCEVs.
NoWrapFlags getNoWrapFlags(NoWrapFlags Mask=NoWrapMask) const
virtual void print(raw_ostream &OS, unsigned Depth=0) const =0
Prints a textual representation of this predicate with an indentation of Depth.
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
This class represents an analyzed expression in the program.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
bool isKnownNonNegative(const SCEV *S)
Test if the given expression is known to be non-negative.
const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
bool isKnownNonPositive(const SCEV *S)
Test if the given expression is known to be non-positive.
const SCEV * getUMaxExpr(const SCEV *LHS, const SCEV *RHS)
const SCEVPredicate * getEqualPredicate(const SCEV *LHS, const SCEV *RHS)
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
const SCEV * getPtrToIntExpr(const SCEV *Op, Type *Ty)
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
const SCEV * getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
const SCEV * getUMinExpr(const SCEV *LHS, const SCEV *RHS, bool Sequential=false)
const SCEV * getStoreSizeOfExpr(Type *IntTy, Type *StoreTy)
Return an expression for the store size of StoreTy that is type IntTy.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getSizeOfExpr(Type *IntTy, TypeSize Size)
Return an expression for a TypeSize.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
const SCEV * getTruncateOrSignExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:317
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:70
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition: Value.h:736
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
An efficient, type-erasing, non-owning reference to a callable.
TypeSize getSequentialElementStride(const DataLayout &DL) const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
friend const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:236
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:573
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:470
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:456
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
unsigned getPointerAddressSpace(const Type *T)
Definition: SPIRVUtils.h:122
std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
Definition: LoopInfo.cpp:1053
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
AddressSpace
Definition: NVPTXBaseInfo.h:21
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1928
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
bool NullPointerIsDefined(const Function *F, unsigned AS=0)
Check whether null pointer dereferencing is considered undefined behavior for a given function or an ...
Definition: Function.cpp:2047
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:116
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
const SCEV * replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &PtrToStride, Value *Ptr)
Return the SCEV corresponding to a pointer with the symbolic stride replaced with constant one,...
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
bool isGuaranteedNotToBeUndefOrPoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Return true if this function can prove that V does not have undef bits and is never poison.
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1824
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
gep_type_iterator gep_type_begin(const User *GEP)
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition: Metadata.h:783
MDNode * TBAA
The tag for type-based alias analysis.
Definition: Metadata.h:777
MDNode * NoAlias
The tag specifying the noalias scope.
Definition: Metadata.h:786
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:26
Dependece between memory access instructions.
DepType Type
The type of the dependence.
bool isPossiblyBackward() const
May be a lexically backward dependence type (includes Unknown).
bool isForward() const
Lexically forward dependence.
bool isBackward() const
Lexically backward dependence.
void print(raw_ostream &OS, unsigned Depth, const SmallVectorImpl< Instruction * > &Instrs) const
Print the dependence.
Instruction * getDestination(const LoopAccessInfo &LAI) const
Return the destination instruction of the dependence.
Instruction * getSource(const LoopAccessInfo &LAI) const
Return the source instruction of the dependence.
DepType
The type of the dependence.
static const char * DepName[]
String version of the types.
static VectorizationSafetyStatus isSafeForVectorization(DepType Type)
Dependence types that don't prevent vectorization.
unsigned AddressSpace
Address space of the involved pointers.
bool addPointer(unsigned Index, RuntimePointerChecking &RtCheck)
Tries to add the pointer recorded in RtCheck at index Index to this pointer checking group.
bool NeedsFreeze
Whether the pointer needs to be frozen after expansion, e.g.
const SCEV * High
The SCEV expression which represents the upper bound of all the pointers in this group.
SmallVector< unsigned, 2 > Members
Indices of all the pointers that constitute this grouping.
RuntimeCheckingPtrGroup(unsigned Index, RuntimePointerChecking &RtCheck)
Create a new pointer checking group containing a single pointer, with index Index in RtCheck.
const SCEV * Low
The SCEV expression which represents the lower bound of all the pointers in this group.
bool IsWritePtr
Holds the information if this pointer is used for writing to memory.
unsigned DependencySetId
Holds the id of the set of pointers that could be dependent because of a shared underlying object.
unsigned AliasSetId
Holds the id of the disjoint alias set to which this pointer belongs.
static const unsigned MaxVectorWidth
Maximum SIMD width.
static unsigned VectorizationFactor
VF as overridden by the user.
static unsigned RuntimeMemoryCheckThreshold
\When performing memory disambiguation checks at runtime do not make more than this number of compari...
static bool isInterleaveForced()
True if force-vector-interleave was specified by the user.
static unsigned VectorizationInterleave
Interleave factor as overridden by the user.
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1450