LLVM 17.0.0git
LoopAccessAnalysis.cpp
Go to the documentation of this file.
1//===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation --------==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// The implementation for the loop memory dependence that was originally
10// developed for the loop vectorizer.
11//
12//===----------------------------------------------------------------------===//
13
15#include "llvm/ADT/APInt.h"
16#include "llvm/ADT/DenseMap.h"
20#include "llvm/ADT/STLExtras.h"
21#include "llvm/ADT/SetVector.h"
23#include "llvm/ADT/SmallSet.h"
38#include "llvm/IR/BasicBlock.h"
39#include "llvm/IR/Constants.h"
40#include "llvm/IR/DataLayout.h"
41#include "llvm/IR/DebugLoc.h"
44#include "llvm/IR/Dominators.h"
45#include "llvm/IR/Function.h"
46#include "llvm/IR/InstrTypes.h"
47#include "llvm/IR/Instruction.h"
49#include "llvm/IR/Operator.h"
50#include "llvm/IR/PassManager.h"
52#include "llvm/IR/Type.h"
53#include "llvm/IR/Value.h"
54#include "llvm/IR/ValueHandle.h"
56#include "llvm/Pass.h"
59#include "llvm/Support/Debug.h"
62#include <algorithm>
63#include <cassert>
64#include <cstdint>
65#include <iterator>
66#include <utility>
67#include <vector>
68
69using namespace llvm;
70using namespace llvm::PatternMatch;
71
72#define DEBUG_TYPE "loop-accesses"
73
75VectorizationFactor("force-vector-width", cl::Hidden,
76 cl::desc("Sets the SIMD width. Zero is autoselect."),
79
81VectorizationInterleave("force-vector-interleave", cl::Hidden,
82 cl::desc("Sets the vectorization interleave count. "
83 "Zero is autoselect."),
87
89 "runtime-memory-check-threshold", cl::Hidden,
90 cl::desc("When performing memory disambiguation checks at runtime do not "
91 "generate more than this number of comparisons (default = 8)."),
94
95/// The maximum iterations used to merge memory checks
97 "memory-check-merge-threshold", cl::Hidden,
98 cl::desc("Maximum number of comparisons done when trying to merge "
99 "runtime memory checks. (default = 100)"),
100 cl::init(100));
101
102/// Maximum SIMD width.
103const unsigned VectorizerParams::MaxVectorWidth = 64;
104
105/// We collect dependences up to this threshold.
107 MaxDependences("max-dependences", cl::Hidden,
108 cl::desc("Maximum number of dependences collected by "
109 "loop-access analysis (default = 100)"),
110 cl::init(100));
111
112/// This enables versioning on the strides of symbolically striding memory
113/// accesses in code like the following.
114/// for (i = 0; i < N; ++i)
115/// A[i * Stride1] += B[i * Stride2] ...
116///
117/// Will be roughly translated to
118/// if (Stride1 == 1 && Stride2 == 1) {
119/// for (i = 0; i < N; i+=4)
120/// A[i:i+3] += ...
121/// } else
122/// ...
124 "enable-mem-access-versioning", cl::init(true), cl::Hidden,
125 cl::desc("Enable symbolic stride memory access versioning"));
126
127/// Enable store-to-load forwarding conflict detection. This option can
128/// be disabled for correctness testing.
130 "store-to-load-forwarding-conflict-detection", cl::Hidden,
131 cl::desc("Enable conflict detection in loop-access analysis"),
132 cl::init(true));
133
135 "max-forked-scev-depth", cl::Hidden,
136 cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"),
137 cl::init(5));
138
140 return ::VectorizationInterleave.getNumOccurrences() > 0;
141}
142
144 if (auto *CI = dyn_cast<CastInst>(V))
145 if (CI->getOperand(0)->getType()->isIntegerTy())
146 return CI->getOperand(0);
147 return V;
148}
149
151 const ValueToValueMap &PtrToStride,
152 Value *Ptr) {
153 const SCEV *OrigSCEV = PSE.getSCEV(Ptr);
154
155 // If there is an entry in the map return the SCEV of the pointer with the
156 // symbolic stride replaced by one.
158 if (SI == PtrToStride.end())
159 // For a non-symbolic stride, just return the original expression.
160 return OrigSCEV;
161
162 Value *StrideVal = stripIntegerCast(SI->second);
163
164 ScalarEvolution *SE = PSE.getSE();
165 const auto *U = cast<SCEVUnknown>(SE->getSCEV(StrideVal));
166 const auto *CT =
167 static_cast<const SCEVConstant *>(SE->getOne(StrideVal->getType()));
168
169 PSE.addPredicate(*SE->getEqualPredicate(U, CT));
170 auto *Expr = PSE.getSCEV(Ptr);
171
172 LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
173 << " by: " << *Expr << "\n");
174 return Expr;
175}
176
178 unsigned Index, RuntimePointerChecking &RtCheck)
179 : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start),
180 AddressSpace(RtCheck.Pointers[Index]
181 .PointerValue->getType()
182 ->getPointerAddressSpace()),
183 NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) {
185}
186
187/// Calculate Start and End points of memory access.
188/// Let's assume A is the first access and B is a memory access on N-th loop
189/// iteration. Then B is calculated as:
190/// B = A + Step*N .
191/// Step value may be positive or negative.
192/// N is a calculated back-edge taken count:
193/// N = (TripCount > 0) ? RoundDown(TripCount -1 , VF) : 0
194/// Start and End points are calculated in the following way:
195/// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
196/// where SizeOfElt is the size of single memory access in bytes.
197///
198/// There is no conflict when the intervals are disjoint:
199/// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
201 Type *AccessTy, bool WritePtr,
202 unsigned DepSetId, unsigned ASId,
204 bool NeedsFreeze) {
205 ScalarEvolution *SE = PSE.getSE();
206
207 const SCEV *ScStart;
208 const SCEV *ScEnd;
209
210 if (SE->isLoopInvariant(PtrExpr, Lp)) {
211 ScStart = ScEnd = PtrExpr;
212 } else {
213 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr);
214 assert(AR && "Invalid addrec expression");
215 const SCEV *Ex = PSE.getBackedgeTakenCount();
216
217 ScStart = AR->getStart();
218 ScEnd = AR->evaluateAtIteration(Ex, *SE);
219 const SCEV *Step = AR->getStepRecurrence(*SE);
220
221 // For expressions with negative step, the upper bound is ScStart and the
222 // lower bound is ScEnd.
223 if (const auto *CStep = dyn_cast<SCEVConstant>(Step)) {
224 if (CStep->getValue()->isNegative())
225 std::swap(ScStart, ScEnd);
226 } else {
227 // Fallback case: the step is not constant, but we can still
228 // get the upper and lower bounds of the interval by using min/max
229 // expressions.
230 ScStart = SE->getUMinExpr(ScStart, ScEnd);
231 ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd);
232 }
233 }
234 // Add the size of the pointed element to ScEnd.
235 auto &DL = Lp->getHeader()->getModule()->getDataLayout();
236 Type *IdxTy = DL.getIndexType(Ptr->getType());
237 const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy);
238 ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);
239
240 Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, PtrExpr,
241 NeedsFreeze);
242}
243
244void RuntimePointerChecking::tryToCreateDiffCheck(
245 const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) {
246 if (!CanUseDiffCheck)
247 return;
248
249 // If either group contains multiple different pointers, bail out.
250 // TODO: Support multiple pointers by using the minimum or maximum pointer,
251 // depending on src & sink.
252 if (CGI.Members.size() != 1 || CGJ.Members.size() != 1) {
253 CanUseDiffCheck = false;
254 return;
255 }
256
257 PointerInfo *Src = &Pointers[CGI.Members[0]];
258 PointerInfo *Sink = &Pointers[CGJ.Members[0]];
259
260 // If either pointer is read and written, multiple checks may be needed. Bail
261 // out.
262 if (!DC.getOrderForAccess(Src->PointerValue, !Src->IsWritePtr).empty() ||
263 !DC.getOrderForAccess(Sink->PointerValue, !Sink->IsWritePtr).empty()) {
264 CanUseDiffCheck = false;
265 return;
266 }
267
268 ArrayRef<unsigned> AccSrc =
269 DC.getOrderForAccess(Src->PointerValue, Src->IsWritePtr);
270 ArrayRef<unsigned> AccSink =
271 DC.getOrderForAccess(Sink->PointerValue, Sink->IsWritePtr);
272 // If either pointer is accessed multiple times, there may not be a clear
273 // src/sink relation. Bail out for now.
274 if (AccSrc.size() != 1 || AccSink.size() != 1) {
275 CanUseDiffCheck = false;
276 return;
277 }
278 // If the sink is accessed before src, swap src/sink.
279 if (AccSink[0] < AccSrc[0])
280 std::swap(Src, Sink);
281
282 auto *SrcAR = dyn_cast<SCEVAddRecExpr>(Src->Expr);
283 auto *SinkAR = dyn_cast<SCEVAddRecExpr>(Sink->Expr);
284 if (!SrcAR || !SinkAR || SrcAR->getLoop() != DC.getInnermostLoop() ||
285 SinkAR->getLoop() != DC.getInnermostLoop()) {
286 CanUseDiffCheck = false;
287 return;
288 }
289
291 DC.getInstructionsForAccess(Src->PointerValue, Src->IsWritePtr);
293 DC.getInstructionsForAccess(Sink->PointerValue, Sink->IsWritePtr);
294 Type *SrcTy = getLoadStoreType(SrcInsts[0]);
295 Type *DstTy = getLoadStoreType(SinkInsts[0]);
296 if (isa<ScalableVectorType>(SrcTy) || isa<ScalableVectorType>(DstTy)) {
297 CanUseDiffCheck = false;
298 return;
299 }
300 const DataLayout &DL =
301 SinkAR->getLoop()->getHeader()->getModule()->getDataLayout();
302 unsigned AllocSize =
303 std::max(DL.getTypeAllocSize(SrcTy), DL.getTypeAllocSize(DstTy));
304
305 // Only matching constant steps matching the AllocSize are supported at the
306 // moment. This simplifies the difference computation. Can be extended in the
307 // future.
308 auto *Step = dyn_cast<SCEVConstant>(SinkAR->getStepRecurrence(*SE));
309 if (!Step || Step != SrcAR->getStepRecurrence(*SE) ||
310 Step->getAPInt().abs() != AllocSize) {
311 CanUseDiffCheck = false;
312 return;
313 }
314
315 IntegerType *IntTy =
316 IntegerType::get(Src->PointerValue->getContext(),
317 DL.getPointerSizeInBits(CGI.AddressSpace));
318
319 // When counting down, the dependence distance needs to be swapped.
320 if (Step->getValue()->isNegative())
321 std::swap(SinkAR, SrcAR);
322
323 const SCEV *SinkStartInt = SE->getPtrToIntExpr(SinkAR->getStart(), IntTy);
324 const SCEV *SrcStartInt = SE->getPtrToIntExpr(SrcAR->getStart(), IntTy);
325 if (isa<SCEVCouldNotCompute>(SinkStartInt) ||
326 isa<SCEVCouldNotCompute>(SrcStartInt)) {
327 CanUseDiffCheck = false;
328 return;
329 }
330 DiffChecks.emplace_back(SrcStartInt, SinkStartInt, AllocSize,
331 Src->NeedsFreeze || Sink->NeedsFreeze);
332}
333
334SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() {
336
337 for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
338 for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
341
342 if (needsChecking(CGI, CGJ)) {
343 tryToCreateDiffCheck(CGI, CGJ);
344 Checks.push_back(std::make_pair(&CGI, &CGJ));
345 }
346 }
347 }
348 return Checks;
349}
350
351void RuntimePointerChecking::generateChecks(
352 MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
353 assert(Checks.empty() && "Checks is not empty");
354 groupChecks(DepCands, UseDependencies);
355 Checks = generateChecks();
356}
357
359 const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const {
360 for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I)
361 for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J)
362 if (needsChecking(M.Members[I], N.Members[J]))
363 return true;
364 return false;
365}
366
367/// Compare \p I and \p J and return the minimum.
368/// Return nullptr in case we couldn't find an answer.
369static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
370 ScalarEvolution *SE) {
371 const SCEV *Diff = SE->getMinusSCEV(J, I);
372 const SCEVConstant *C = dyn_cast<const SCEVConstant>(Diff);
373
374 if (!C)
375 return nullptr;
376 if (C->getValue()->isNegative())
377 return J;
378 return I;
379}
380
382 RuntimePointerChecking &RtCheck) {
383 return addPointer(
384 Index, RtCheck.Pointers[Index].Start, RtCheck.Pointers[Index].End,
385 RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(),
386 RtCheck.Pointers[Index].NeedsFreeze, *RtCheck.SE);
387}
388
390 const SCEV *End, unsigned AS,
391 bool NeedsFreeze,
392 ScalarEvolution &SE) {
393 assert(AddressSpace == AS &&
394 "all pointers in a checking group must be in the same address space");
395
396 // Compare the starts and ends with the known minimum and maximum
397 // of this set. We need to know how we compare against the min/max
398 // of the set in order to be able to emit memchecks.
399 const SCEV *Min0 = getMinFromExprs(Start, Low, &SE);
400 if (!Min0)
401 return false;
402
403 const SCEV *Min1 = getMinFromExprs(End, High, &SE);
404 if (!Min1)
405 return false;
406
407 // Update the low bound expression if we've found a new min value.
408 if (Min0 == Start)
409 Low = Start;
410
411 // Update the high bound expression if we've found a new max value.
412 if (Min1 != End)
413 High = End;
414
416 this->NeedsFreeze |= NeedsFreeze;
417 return true;
418}
419
420void RuntimePointerChecking::groupChecks(
421 MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
422 // We build the groups from dependency candidates equivalence classes
423 // because:
424 // - We know that pointers in the same equivalence class share
425 // the same underlying object and therefore there is a chance
426 // that we can compare pointers
427 // - We wouldn't be able to merge two pointers for which we need
428 // to emit a memcheck. The classes in DepCands are already
429 // conveniently built such that no two pointers in the same
430 // class need checking against each other.
431
432 // We use the following (greedy) algorithm to construct the groups
433 // For every pointer in the equivalence class:
434 // For each existing group:
435 // - if the difference between this pointer and the min/max bounds
436 // of the group is a constant, then make the pointer part of the
437 // group and update the min/max bounds of that group as required.
438
439 CheckingGroups.clear();
440
441 // If we need to check two pointers to the same underlying object
442 // with a non-constant difference, we shouldn't perform any pointer
443 // grouping with those pointers. This is because we can easily get
444 // into cases where the resulting check would return false, even when
445 // the accesses are safe.
446 //
447 // The following example shows this:
448 // for (i = 0; i < 1000; ++i)
449 // a[5000 + i * m] = a[i] + a[i + 9000]
450 //
451 // Here grouping gives a check of (5000, 5000 + 1000 * m) against
452 // (0, 10000) which is always false. However, if m is 1, there is no
453 // dependence. Not grouping the checks for a[i] and a[i + 9000] allows
454 // us to perform an accurate check in this case.
455 //
456 // The above case requires that we have an UnknownDependence between
457 // accesses to the same underlying object. This cannot happen unless
458 // FoundNonConstantDistanceDependence is set, and therefore UseDependencies
459 // is also false. In this case we will use the fallback path and create
460 // separate checking groups for all pointers.
461
462 // If we don't have the dependency partitions, construct a new
463 // checking pointer group for each pointer. This is also required
464 // for correctness, because in this case we can have checking between
465 // pointers to the same underlying object.
466 if (!UseDependencies) {
467 for (unsigned I = 0; I < Pointers.size(); ++I)
468 CheckingGroups.push_back(RuntimeCheckingPtrGroup(I, *this));
469 return;
470 }
471
472 unsigned TotalComparisons = 0;
473
475 for (unsigned Index = 0; Index < Pointers.size(); ++Index) {
476 auto Iter = PositionMap.insert({Pointers[Index].PointerValue, {}});
477 Iter.first->second.push_back(Index);
478 }
479
480 // We need to keep track of what pointers we've already seen so we
481 // don't process them twice.
483
484 // Go through all equivalence classes, get the "pointer check groups"
485 // and add them to the overall solution. We use the order in which accesses
486 // appear in 'Pointers' to enforce determinism.
487 for (unsigned I = 0; I < Pointers.size(); ++I) {
488 // We've seen this pointer before, and therefore already processed
489 // its equivalence class.
490 if (Seen.count(I))
491 continue;
492
493 MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
494 Pointers[I].IsWritePtr);
495
497 auto LeaderI = DepCands.findValue(DepCands.getLeaderValue(Access));
498
499 // Because DepCands is constructed by visiting accesses in the order in
500 // which they appear in alias sets (which is deterministic) and the
501 // iteration order within an equivalence class member is only dependent on
502 // the order in which unions and insertions are performed on the
503 // equivalence class, the iteration order is deterministic.
504 for (auto MI = DepCands.member_begin(LeaderI), ME = DepCands.member_end();
505 MI != ME; ++MI) {
506 auto PointerI = PositionMap.find(MI->getPointer());
507 assert(PointerI != PositionMap.end() &&
508 "pointer in equivalence class not found in PositionMap");
509 for (unsigned Pointer : PointerI->second) {
510 bool Merged = false;
511 // Mark this pointer as seen.
512 Seen.insert(Pointer);
513
514 // Go through all the existing sets and see if we can find one
515 // which can include this pointer.
516 for (RuntimeCheckingPtrGroup &Group : Groups) {
517 // Don't perform more than a certain amount of comparisons.
518 // This should limit the cost of grouping the pointers to something
519 // reasonable. If we do end up hitting this threshold, the algorithm
520 // will create separate groups for all remaining pointers.
521 if (TotalComparisons > MemoryCheckMergeThreshold)
522 break;
523
524 TotalComparisons++;
525
526 if (Group.addPointer(Pointer, *this)) {
527 Merged = true;
528 break;
529 }
530 }
531
532 if (!Merged)
533 // We couldn't add this pointer to any existing set or the threshold
534 // for the number of comparisons has been reached. Create a new group
535 // to hold the current pointer.
536 Groups.push_back(RuntimeCheckingPtrGroup(Pointer, *this));
537 }
538 }
539
540 // We've computed the grouped checks for this partition.
541 // Save the results and continue with the next one.
542 llvm::copy(Groups, std::back_inserter(CheckingGroups));
543 }
544}
545
547 const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1,
548 unsigned PtrIdx2) {
549 return (PtrToPartition[PtrIdx1] != -1 &&
550 PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]);
551}
552
553bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
554 const PointerInfo &PointerI = Pointers[I];
555 const PointerInfo &PointerJ = Pointers[J];
556
557 // No need to check if two readonly pointers intersect.
558 if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
559 return false;
560
561 // Only need to check pointers between two different dependency sets.
562 if (PointerI.DependencySetId == PointerJ.DependencySetId)
563 return false;
564
565 // Only need to check pointers in the same alias set.
566 if (PointerI.AliasSetId != PointerJ.AliasSetId)
567 return false;
568
569 return true;
570}
571
574 unsigned Depth) const {
575 unsigned N = 0;
576 for (const auto &Check : Checks) {
577 const auto &First = Check.first->Members, &Second = Check.second->Members;
578
579 OS.indent(Depth) << "Check " << N++ << ":\n";
580
581 OS.indent(Depth + 2) << "Comparing group (" << Check.first << "):\n";
582 for (unsigned K = 0; K < First.size(); ++K)
583 OS.indent(Depth + 2) << *Pointers[First[K]].PointerValue << "\n";
584
585 OS.indent(Depth + 2) << "Against group (" << Check.second << "):\n";
586 for (unsigned K = 0; K < Second.size(); ++K)
587 OS.indent(Depth + 2) << *Pointers[Second[K]].PointerValue << "\n";
588 }
589}
590
592
593 OS.indent(Depth) << "Run-time memory checks:\n";
594 printChecks(OS, Checks, Depth);
595
596 OS.indent(Depth) << "Grouped accesses:\n";
597 for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
598 const auto &CG = CheckingGroups[I];
599
600 OS.indent(Depth + 2) << "Group " << &CG << ":\n";
601 OS.indent(Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High
602 << ")\n";
603 for (unsigned J = 0; J < CG.Members.size(); ++J) {
604 OS.indent(Depth + 6) << "Member: " << *Pointers[CG.Members[J]].Expr
605 << "\n";
606 }
607 }
608}
609
610namespace {
611
612/// Analyses memory accesses in a loop.
613///
614/// Checks whether run time pointer checks are needed and builds sets for data
615/// dependence checking.
616class AccessAnalysis {
617public:
618 /// Read or write access location.
619 typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
620 typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
621
622 AccessAnalysis(Loop *TheLoop, AAResults *AA, LoopInfo *LI,
625 : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE) {
626 // We're analyzing dependences across loop iterations.
627 BAA.enableCrossIterationMode();
628 }
629
630 /// Register a load and whether it is only read from.
631 void addLoad(MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
632 Value *Ptr = const_cast<Value*>(Loc.Ptr);
634 Accesses[MemAccessInfo(Ptr, false)].insert(AccessTy);
635 if (IsReadOnly)
636 ReadOnlyPtr.insert(Ptr);
637 }
638
639 /// Register a store.
640 void addStore(MemoryLocation &Loc, Type *AccessTy) {
641 Value *Ptr = const_cast<Value*>(Loc.Ptr);
643 Accesses[MemAccessInfo(Ptr, true)].insert(AccessTy);
644 }
645
646 /// Check if we can emit a run-time no-alias check for \p Access.
647 ///
648 /// Returns true if we can emit a run-time no alias check for \p Access.
649 /// If we can check this access, this also adds it to a dependence set and
650 /// adds a run-time to check for it to \p RtCheck. If \p Assume is true,
651 /// we will attempt to use additional run-time checks in order to get
652 /// the bounds of the pointer.
653 bool createCheckForAccess(RuntimePointerChecking &RtCheck,
654 MemAccessInfo Access, Type *AccessTy,
655 const ValueToValueMap &Strides,
657 Loop *TheLoop, unsigned &RunningDepId,
658 unsigned ASId, bool ShouldCheckStride, bool Assume);
659
660 /// Check whether we can check the pointers at runtime for
661 /// non-intersection.
662 ///
663 /// Returns true if we need no check or if we do and we can generate them
664 /// (i.e. the pointers have computable bounds).
665 bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, ScalarEvolution *SE,
666 Loop *TheLoop, const ValueToValueMap &Strides,
667 Value *&UncomputablePtr, bool ShouldCheckWrap = false);
668
669 /// Goes over all memory accesses, checks whether a RT check is needed
670 /// and builds sets of dependent accesses.
671 void buildDependenceSets() {
672 processMemAccesses();
673 }
674
675 /// Initial processing of memory accesses determined that we need to
676 /// perform dependency checking.
677 ///
678 /// Note that this can later be cleared if we retry memcheck analysis without
679 /// dependency checking (i.e. FoundNonConstantDistanceDependence).
680 bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
681
682 /// We decided that no dependence analysis would be used. Reset the state.
683 void resetDepChecks(MemoryDepChecker &DepChecker) {
684 CheckDeps.clear();
685 DepChecker.clearDependences();
686 }
687
688 MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; }
689
690private:
692
693 /// Go over all memory access and check whether runtime pointer checks
694 /// are needed and build sets of dependency check candidates.
695 void processMemAccesses();
696
697 /// Map of all accesses. Values are the types used to access memory pointed to
698 /// by the pointer.
699 PtrAccessMap Accesses;
700
701 /// The loop being checked.
702 const Loop *TheLoop;
703
704 /// List of accesses that need a further dependence check.
705 MemAccessInfoList CheckDeps;
706
707 /// Set of pointers that are read only.
708 SmallPtrSet<Value*, 16> ReadOnlyPtr;
709
710 /// Batched alias analysis results.
711 BatchAAResults BAA;
712
713 /// An alias set tracker to partition the access set by underlying object and
714 //intrinsic property (such as TBAA metadata).
715 AliasSetTracker AST;
716
717 LoopInfo *LI;
718
719 /// Sets of potentially dependent accesses - members of one set share an
720 /// underlying pointer. The set "CheckDeps" identfies which sets really need a
721 /// dependence check.
723
724 /// Initial processing of memory accesses determined that we may need
725 /// to add memchecks. Perform the analysis to determine the necessary checks.
726 ///
727 /// Note that, this is different from isDependencyCheckNeeded. When we retry
728 /// memcheck analysis without dependency checking
729 /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is
730 /// cleared while this remains set if we have potentially dependent accesses.
731 bool IsRTCheckAnalysisNeeded = false;
732
733 /// The SCEV predicate containing all the SCEV-related assumptions.
735};
736
737} // end anonymous namespace
738
739/// Check whether a pointer can participate in a runtime bounds check.
740/// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr
741/// by adding run-time checks (overflow checks) if necessary.
743 const SCEV *PtrScev, Loop *L, bool Assume) {
744 // The bounds for loop-invariant pointer is trivial.
745 if (PSE.getSE()->isLoopInvariant(PtrScev, L))
746 return true;
747
748 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
749
750 if (!AR && Assume)
751 AR = PSE.getAsAddRec(Ptr);
752
753 if (!AR)
754 return false;
755
756 return AR->isAffine();
757}
758
759/// Check whether a pointer address cannot wrap.
761 const ValueToValueMap &Strides, Value *Ptr, Type *AccessTy,
762 Loop *L) {
763 const SCEV *PtrScev = PSE.getSCEV(Ptr);
764 if (PSE.getSE()->isLoopInvariant(PtrScev, L))
765 return true;
766
767 int64_t Stride = getPtrStride(PSE, AccessTy, Ptr, L, Strides).value_or(0);
768 if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
769 return true;
770
771 return false;
772}
773
774static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
775 function_ref<void(Value *)> AddPointer) {
777 SmallVector<Value *> WorkList;
778 WorkList.push_back(StartPtr);
779
780 while (!WorkList.empty()) {
781 Value *Ptr = WorkList.pop_back_val();
782 if (!Visited.insert(Ptr).second)
783 continue;
784 auto *PN = dyn_cast<PHINode>(Ptr);
785 // SCEV does not look through non-header PHIs inside the loop. Such phis
786 // can be analyzed by adding separate accesses for each incoming pointer
787 // value.
788 if (PN && InnermostLoop.contains(PN->getParent()) &&
789 PN->getParent() != InnermostLoop.getHeader()) {
790 for (const Use &Inc : PN->incoming_values())
791 WorkList.push_back(Inc);
792 } else
793 AddPointer(Ptr);
794 }
795}
796
797// Walk back through the IR for a pointer, looking for a select like the
798// following:
799//
800// %offset = select i1 %cmp, i64 %a, i64 %b
801// %addr = getelementptr double, double* %base, i64 %offset
802// %ld = load double, double* %addr, align 8
803//
804// We won't be able to form a single SCEVAddRecExpr from this since the
805// address for each loop iteration depends on %cmp. We could potentially
806// produce multiple valid SCEVAddRecExprs, though, and check all of them for
807// memory safety/aliasing if needed.
808//
809// If we encounter some IR we don't yet handle, or something obviously fine
810// like a constant, then we just add the SCEV for that term to the list passed
811// in by the caller. If we have a node that may potentially yield a valid
812// SCEVAddRecExpr then we decompose it into parts and build the SCEV terms
813// ourselves before adding to the list.
814static void findForkedSCEVs(
815 ScalarEvolution *SE, const Loop *L, Value *Ptr,
817 unsigned Depth) {
818 // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or
819 // we've exceeded our limit on recursion, just return whatever we have
820 // regardless of whether it can be used for a forked pointer or not, along
821 // with an indication of whether it might be a poison or undef value.
822 const SCEV *Scev = SE->getSCEV(Ptr);
823 if (isa<SCEVAddRecExpr>(Scev) || L->isLoopInvariant(Ptr) ||
824 !isa<Instruction>(Ptr) || Depth == 0) {
825 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
826 return;
827 }
828
829 Depth--;
830
831 auto UndefPoisonCheck = [](PointerIntPair<const SCEV *, 1, bool> S) {
832 return get<1>(S);
833 };
834
835 auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) {
836 switch (Opcode) {
837 case Instruction::Add:
838 return SE->getAddExpr(L, R);
839 case Instruction::Sub:
840 return SE->getMinusSCEV(L, R);
841 default:
842 llvm_unreachable("Unexpected binary operator when walking ForkedPtrs");
843 }
844 };
845
846 Instruction *I = cast<Instruction>(Ptr);
847 unsigned Opcode = I->getOpcode();
848 switch (Opcode) {
849 case Instruction::GetElementPtr: {
850 GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
851 Type *SourceTy = GEP->getSourceElementType();
852 // We only handle base + single offset GEPs here for now.
853 // Not dealing with preexisting gathers yet, so no vectors.
854 if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
855 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(GEP));
856 break;
857 }
860 findForkedSCEVs(SE, L, I->getOperand(0), BaseScevs, Depth);
861 findForkedSCEVs(SE, L, I->getOperand(1), OffsetScevs, Depth);
862
863 // See if we need to freeze our fork...
864 bool NeedsFreeze = any_of(BaseScevs, UndefPoisonCheck) ||
865 any_of(OffsetScevs, UndefPoisonCheck);
866
867 // Check that we only have a single fork, on either the base or the offset.
868 // Copy the SCEV across for the one without a fork in order to generate
869 // the full SCEV for both sides of the GEP.
870 if (OffsetScevs.size() == 2 && BaseScevs.size() == 1)
871 BaseScevs.push_back(BaseScevs[0]);
872 else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1)
873 OffsetScevs.push_back(OffsetScevs[0]);
874 else {
875 ScevList.emplace_back(Scev, NeedsFreeze);
876 break;
877 }
878
879 // Find the pointer type we need to extend to.
880 Type *IntPtrTy = SE->getEffectiveSCEVType(
881 SE->getSCEV(GEP->getPointerOperand())->getType());
882
883 // Find the size of the type being pointed to. We only have a single
884 // index term (guarded above) so we don't need to index into arrays or
885 // structures, just get the size of the scalar value.
886 const SCEV *Size = SE->getSizeOfExpr(IntPtrTy, SourceTy);
887
888 // Scale up the offsets by the size of the type, then add to the bases.
889 const SCEV *Scaled1 = SE->getMulExpr(
890 Size, SE->getTruncateOrSignExtend(get<0>(OffsetScevs[0]), IntPtrTy));
891 const SCEV *Scaled2 = SE->getMulExpr(
892 Size, SE->getTruncateOrSignExtend(get<0>(OffsetScevs[1]), IntPtrTy));
893 ScevList.emplace_back(SE->getAddExpr(get<0>(BaseScevs[0]), Scaled1),
894 NeedsFreeze);
895 ScevList.emplace_back(SE->getAddExpr(get<0>(BaseScevs[1]), Scaled2),
896 NeedsFreeze);
897 break;
898 }
899 case Instruction::Select: {
901 // A select means we've found a forked pointer, but we currently only
902 // support a single select per pointer so if there's another behind this
903 // then we just bail out and return the generic SCEV.
904 findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
905 findForkedSCEVs(SE, L, I->getOperand(2), ChildScevs, Depth);
906 if (ChildScevs.size() == 2) {
907 ScevList.push_back(ChildScevs[0]);
908 ScevList.push_back(ChildScevs[1]);
909 } else
910 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
911 break;
912 }
913 case Instruction::Add:
914 case Instruction::Sub: {
917 findForkedSCEVs(SE, L, I->getOperand(0), LScevs, Depth);
918 findForkedSCEVs(SE, L, I->getOperand(1), RScevs, Depth);
919
920 // See if we need to freeze our fork...
921 bool NeedsFreeze =
922 any_of(LScevs, UndefPoisonCheck) || any_of(RScevs, UndefPoisonCheck);
923
924 // Check that we only have a single fork, on either the left or right side.
925 // Copy the SCEV across for the one without a fork in order to generate
926 // the full SCEV for both sides of the BinOp.
927 if (LScevs.size() == 2 && RScevs.size() == 1)
928 RScevs.push_back(RScevs[0]);
929 else if (RScevs.size() == 2 && LScevs.size() == 1)
930 LScevs.push_back(LScevs[0]);
931 else {
932 ScevList.emplace_back(Scev, NeedsFreeze);
933 break;
934 }
935
936 ScevList.emplace_back(
937 GetBinOpExpr(Opcode, get<0>(LScevs[0]), get<0>(RScevs[0])),
938 NeedsFreeze);
939 ScevList.emplace_back(
940 GetBinOpExpr(Opcode, get<0>(LScevs[1]), get<0>(RScevs[1])),
941 NeedsFreeze);
942 break;
943 }
944 default:
945 // Just return the current SCEV if we haven't handled the instruction yet.
946 LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
947 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
948 break;
949 }
950}
951
954 const ValueToValueMap &StridesMap, Value *Ptr,
955 const Loop *L) {
956 ScalarEvolution *SE = PSE.getSE();
957 assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
959 findForkedSCEVs(SE, L, Ptr, Scevs, MaxForkedSCEVDepth);
960
961 // For now, we will only accept a forked pointer with two possible SCEVs
962 // that are either SCEVAddRecExprs or loop invariant.
963 if (Scevs.size() == 2 &&
964 (isa<SCEVAddRecExpr>(get<0>(Scevs[0])) ||
965 SE->isLoopInvariant(get<0>(Scevs[0]), L)) &&
966 (isa<SCEVAddRecExpr>(get<0>(Scevs[1])) ||
967 SE->isLoopInvariant(get<0>(Scevs[1]), L))) {
968 LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n");
969 LLVM_DEBUG(dbgs() << "\t(1) " << *get<0>(Scevs[0]) << "\n");
970 LLVM_DEBUG(dbgs() << "\t(2) " << *get<0>(Scevs[1]) << "\n");
971 return Scevs;
972 }
973
974 return {{replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false}};
975}
976
977bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
978 MemAccessInfo Access, Type *AccessTy,
979 const ValueToValueMap &StridesMap,
981 Loop *TheLoop, unsigned &RunningDepId,
982 unsigned ASId, bool ShouldCheckWrap,
983 bool Assume) {
984 Value *Ptr = Access.getPointer();
985
987 findForkedPointer(PSE, StridesMap, Ptr, TheLoop);
988
989 for (auto &P : TranslatedPtrs) {
990 const SCEV *PtrExpr = get<0>(P);
991 if (!hasComputableBounds(PSE, Ptr, PtrExpr, TheLoop, Assume))
992 return false;
993
994 // When we run after a failing dependency check we have to make sure
995 // we don't have wrapping pointers.
996 if (ShouldCheckWrap) {
997 // Skip wrap checking when translating pointers.
998 if (TranslatedPtrs.size() > 1)
999 return false;
1000
1001 if (!isNoWrap(PSE, StridesMap, Ptr, AccessTy, TheLoop)) {
1002 auto *Expr = PSE.getSCEV(Ptr);
1003 if (!Assume || !isa<SCEVAddRecExpr>(Expr))
1004 return false;
1006 }
1007 }
1008 // If there's only one option for Ptr, look it up after bounds and wrap
1009 // checking, because assumptions might have been added to PSE.
1010 if (TranslatedPtrs.size() == 1)
1011 TranslatedPtrs[0] = {replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr),
1012 false};
1013 }
1014
1015 for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) {
1016 // The id of the dependence set.
1017 unsigned DepId;
1018
1019 if (isDependencyCheckNeeded()) {
1020 Value *Leader = DepCands.getLeaderValue(Access).getPointer();
1021 unsigned &LeaderId = DepSetId[Leader];
1022 if (!LeaderId)
1023 LeaderId = RunningDepId++;
1024 DepId = LeaderId;
1025 } else
1026 // Each access has its own dependence set.
1027 DepId = RunningDepId++;
1028
1029 bool IsWrite = Access.getInt();
1030 RtCheck.insert(TheLoop, Ptr, PtrExpr, AccessTy, IsWrite, DepId, ASId, PSE,
1031 NeedsFreeze);
1032 LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
1033 }
1034
1035 return true;
1036}
1037
1038bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
1039 ScalarEvolution *SE, Loop *TheLoop,
1040 const ValueToValueMap &StridesMap,
1041 Value *&UncomputablePtr, bool ShouldCheckWrap) {
1042 // Find pointers with computable bounds. We are going to use this information
1043 // to place a runtime bound check.
1044 bool CanDoRT = true;
1045
1046 bool MayNeedRTCheck = false;
1047 if (!IsRTCheckAnalysisNeeded) return true;
1048
1049 bool IsDepCheckNeeded = isDependencyCheckNeeded();
1050
1051 // We assign a consecutive id to access from different alias sets.
1052 // Accesses between different groups doesn't need to be checked.
1053 unsigned ASId = 0;
1054 for (auto &AS : AST) {
1055 int NumReadPtrChecks = 0;
1056 int NumWritePtrChecks = 0;
1057 bool CanDoAliasSetRT = true;
1058 ++ASId;
1059
1060 // We assign consecutive id to access from different dependence sets.
1061 // Accesses within the same set don't need a runtime check.
1062 unsigned RunningDepId = 1;
1064
1066
1067 // First, count how many write and read accesses are in the alias set. Also
1068 // collect MemAccessInfos for later.
1070 for (const auto &A : AS) {
1071 Value *Ptr = A.getValue();
1072 bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
1073
1074 if (IsWrite)
1075 ++NumWritePtrChecks;
1076 else
1077 ++NumReadPtrChecks;
1078 AccessInfos.emplace_back(Ptr, IsWrite);
1079 }
1080
1081 // We do not need runtime checks for this alias set, if there are no writes
1082 // or a single write and no reads.
1083 if (NumWritePtrChecks == 0 ||
1084 (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) {
1085 assert((AS.size() <= 1 ||
1086 all_of(AS,
1087 [this](auto AC) {
1088 MemAccessInfo AccessWrite(AC.getValue(), true);
1089 return DepCands.findValue(AccessWrite) == DepCands.end();
1090 })) &&
1091 "Can only skip updating CanDoRT below, if all entries in AS "
1092 "are reads or there is at most 1 entry");
1093 continue;
1094 }
1095
1096 for (auto &Access : AccessInfos) {
1097 for (const auto &AccessTy : Accesses[Access]) {
1098 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1099 DepSetId, TheLoop, RunningDepId, ASId,
1100 ShouldCheckWrap, false)) {
1101 LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
1102 << *Access.getPointer() << '\n');
1103 Retries.push_back({Access, AccessTy});
1104 CanDoAliasSetRT = false;
1105 }
1106 }
1107 }
1108
1109 // Note that this function computes CanDoRT and MayNeedRTCheck
1110 // independently. For example CanDoRT=false, MayNeedRTCheck=false means that
1111 // we have a pointer for which we couldn't find the bounds but we don't
1112 // actually need to emit any checks so it does not matter.
1113 //
1114 // We need runtime checks for this alias set, if there are at least 2
1115 // dependence sets (in which case RunningDepId > 2) or if we need to re-try
1116 // any bound checks (because in that case the number of dependence sets is
1117 // incomplete).
1118 bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty();
1119
1120 // We need to perform run-time alias checks, but some pointers had bounds
1121 // that couldn't be checked.
1122 if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) {
1123 // Reset the CanDoSetRt flag and retry all accesses that have failed.
1124 // We know that we need these checks, so we can now be more aggressive
1125 // and add further checks if required (overflow checks).
1126 CanDoAliasSetRT = true;
1127 for (auto Retry : Retries) {
1128 MemAccessInfo Access = Retry.first;
1129 Type *AccessTy = Retry.second;
1130 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1131 DepSetId, TheLoop, RunningDepId, ASId,
1132 ShouldCheckWrap, /*Assume=*/true)) {
1133 CanDoAliasSetRT = false;
1134 UncomputablePtr = Access.getPointer();
1135 break;
1136 }
1137 }
1138 }
1139
1140 CanDoRT &= CanDoAliasSetRT;
1141 MayNeedRTCheck |= NeedsAliasSetRTCheck;
1142 ++ASId;
1143 }
1144
1145 // If the pointers that we would use for the bounds comparison have different
1146 // address spaces, assume the values aren't directly comparable, so we can't
1147 // use them for the runtime check. We also have to assume they could
1148 // overlap. In the future there should be metadata for whether address spaces
1149 // are disjoint.
1150 unsigned NumPointers = RtCheck.Pointers.size();
1151 for (unsigned i = 0; i < NumPointers; ++i) {
1152 for (unsigned j = i + 1; j < NumPointers; ++j) {
1153 // Only need to check pointers between two different dependency sets.
1154 if (RtCheck.Pointers[i].DependencySetId ==
1155 RtCheck.Pointers[j].DependencySetId)
1156 continue;
1157 // Only need to check pointers in the same alias set.
1158 if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId)
1159 continue;
1160
1161 Value *PtrI = RtCheck.Pointers[i].PointerValue;
1162 Value *PtrJ = RtCheck.Pointers[j].PointerValue;
1163
1164 unsigned ASi = PtrI->getType()->getPointerAddressSpace();
1165 unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
1166 if (ASi != ASj) {
1167 LLVM_DEBUG(
1168 dbgs() << "LAA: Runtime check would require comparison between"
1169 " different address spaces\n");
1170 return false;
1171 }
1172 }
1173 }
1174
1175 if (MayNeedRTCheck && CanDoRT)
1176 RtCheck.generateChecks(DepCands, IsDepCheckNeeded);
1177
1178 LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
1179 << " pointer comparisons.\n");
1180
1181 // If we can do run-time checks, but there are no checks, no runtime checks
1182 // are needed. This can happen when all pointers point to the same underlying
1183 // object for example.
1184 RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck;
1185
1186 bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT;
1187 if (!CanDoRTIfNeeded)
1188 RtCheck.reset();
1189 return CanDoRTIfNeeded;
1190}
1191
1192void AccessAnalysis::processMemAccesses() {
1193 // We process the set twice: first we process read-write pointers, last we
1194 // process read-only pointers. This allows us to skip dependence tests for
1195 // read-only pointers.
1196
1197 LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
1198 LLVM_DEBUG(dbgs() << " AST: "; AST.dump());
1199 LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
1200 LLVM_DEBUG({
1201 for (auto A : Accesses)
1202 dbgs() << "\t" << *A.first.getPointer() << " ("
1203 << (A.first.getInt()
1204 ? "write"
1205 : (ReadOnlyPtr.count(A.first.getPointer()) ? "read-only"
1206 : "read"))
1207 << ")\n";
1208 });
1209
1210 // The AliasSetTracker has nicely partitioned our pointers by metadata
1211 // compatibility and potential for underlying-object overlap. As a result, we
1212 // only need to check for potential pointer dependencies within each alias
1213 // set.
1214 for (const auto &AS : AST) {
1215 // Note that both the alias-set tracker and the alias sets themselves used
1216 // linked lists internally and so the iteration order here is deterministic
1217 // (matching the original instruction order within each set).
1218
1219 bool SetHasWrite = false;
1220
1221 // Map of pointers to last access encountered.
1222 typedef DenseMap<const Value*, MemAccessInfo> UnderlyingObjToAccessMap;
1223 UnderlyingObjToAccessMap ObjToLastAccess;
1224
1225 // Set of access to check after all writes have been processed.
1226 PtrAccessMap DeferredAccesses;
1227
1228 // Iterate over each alias set twice, once to process read/write pointers,
1229 // and then to process read-only pointers.
1230 for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
1231 bool UseDeferred = SetIteration > 0;
1232 PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses;
1233
1234 for (const auto &AV : AS) {
1235 Value *Ptr = AV.getValue();
1236
1237 // For a single memory access in AliasSetTracker, Accesses may contain
1238 // both read and write, and they both need to be handled for CheckDeps.
1239 for (const auto &AC : S) {
1240 if (AC.first.getPointer() != Ptr)
1241 continue;
1242
1243 bool IsWrite = AC.first.getInt();
1244
1245 // If we're using the deferred access set, then it contains only
1246 // reads.
1247 bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
1248 if (UseDeferred && !IsReadOnlyPtr)
1249 continue;
1250 // Otherwise, the pointer must be in the PtrAccessSet, either as a
1251 // read or a write.
1252 assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
1253 S.count(MemAccessInfo(Ptr, false))) &&
1254 "Alias-set pointer not in the access set?");
1255
1256 MemAccessInfo Access(Ptr, IsWrite);
1257 DepCands.insert(Access);
1258
1259 // Memorize read-only pointers for later processing and skip them in
1260 // the first round (they need to be checked after we have seen all
1261 // write pointers). Note: we also mark pointer that are not
1262 // consecutive as "read-only" pointers (so that we check
1263 // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
1264 if (!UseDeferred && IsReadOnlyPtr) {
1265 // We only use the pointer keys, the types vector values don't
1266 // matter.
1267 DeferredAccesses.insert({Access, {}});
1268 continue;
1269 }
1270
1271 // If this is a write - check other reads and writes for conflicts. If
1272 // this is a read only check other writes for conflicts (but only if
1273 // there is no other write to the ptr - this is an optimization to
1274 // catch "a[i] = a[i] + " without having to do a dependence check).
1275 if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
1276 CheckDeps.push_back(Access);
1277 IsRTCheckAnalysisNeeded = true;
1278 }
1279
1280 if (IsWrite)
1281 SetHasWrite = true;
1282
1283 // Create sets of pointers connected by a shared alias set and
1284 // underlying object.
1285 typedef SmallVector<const Value *, 16> ValueVector;
1286 ValueVector TempObjects;
1287
1288 getUnderlyingObjects(Ptr, TempObjects, LI);
1290 << "Underlying objects for pointer " << *Ptr << "\n");
1291 for (const Value *UnderlyingObj : TempObjects) {
1292 // nullptr never alias, don't join sets for pointer that have "null"
1293 // in their UnderlyingObjects list.
1294 if (isa<ConstantPointerNull>(UnderlyingObj) &&
1296 TheLoop->getHeader()->getParent(),
1297 UnderlyingObj->getType()->getPointerAddressSpace()))
1298 continue;
1299
1300 UnderlyingObjToAccessMap::iterator Prev =
1301 ObjToLastAccess.find(UnderlyingObj);
1302 if (Prev != ObjToLastAccess.end())
1303 DepCands.unionSets(Access, Prev->second);
1304
1305 ObjToLastAccess[UnderlyingObj] = Access;
1306 LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n");
1307 }
1308 }
1309 }
1310 }
1311 }
1312}
1313
1314static bool isInBoundsGep(Value *Ptr) {
1315 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
1316 return GEP->isInBounds();
1317 return false;
1318}
1319
1320/// Return true if an AddRec pointer \p Ptr is unsigned non-wrapping,
1321/// i.e. monotonically increasing/decreasing.
1322static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
1323 PredicatedScalarEvolution &PSE, const Loop *L) {
1324 // FIXME: This should probably only return true for NUW.
1326 return true;
1327
1328 // Scalar evolution does not propagate the non-wrapping flags to values that
1329 // are derived from a non-wrapping induction variable because non-wrapping
1330 // could be flow-sensitive.
1331 //
1332 // Look through the potentially overflowing instruction to try to prove
1333 // non-wrapping for the *specific* value of Ptr.
1334
1335 // The arithmetic implied by an inbounds GEP can't overflow.
1336 auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
1337 if (!GEP || !GEP->isInBounds())
1338 return false;
1339
1340 // Make sure there is only one non-const index and analyze that.
1341 Value *NonConstIndex = nullptr;
1342 for (Value *Index : GEP->indices())
1343 if (!isa<ConstantInt>(Index)) {
1344 if (NonConstIndex)
1345 return false;
1346 NonConstIndex = Index;
1347 }
1348 if (!NonConstIndex)
1349 // The recurrence is on the pointer, ignore for now.
1350 return false;
1351
1352 // The index in GEP is signed. It is non-wrapping if it's derived from a NSW
1353 // AddRec using a NSW operation.
1354 if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(NonConstIndex))
1355 if (OBO->hasNoSignedWrap() &&
1356 // Assume constant for other the operand so that the AddRec can be
1357 // easily found.
1358 isa<ConstantInt>(OBO->getOperand(1))) {
1359 auto *OpScev = PSE.getSCEV(OBO->getOperand(0));
1360
1361 if (auto *OpAR = dyn_cast<SCEVAddRecExpr>(OpScev))
1362 return OpAR->getLoop() == L && OpAR->getNoWrapFlags(SCEV::FlagNSW);
1363 }
1364
1365 return false;
1366}
1367
1368/// Check whether the access through \p Ptr has a constant stride.
1370 Type *AccessTy, Value *Ptr,
1371 const Loop *Lp,
1372 const ValueToValueMap &StridesMap,
1373 bool Assume, bool ShouldCheckWrap) {
1374 Type *Ty = Ptr->getType();
1375 assert(Ty->isPointerTy() && "Unexpected non-ptr");
1376
1377 if (isa<ScalableVectorType>(AccessTy)) {
1378 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy
1379 << "\n");
1380 return std::nullopt;
1381 }
1382
1383 const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
1384
1385 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
1386 if (Assume && !AR)
1387 AR = PSE.getAsAddRec(Ptr);
1388
1389 if (!AR) {
1390 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr
1391 << " SCEV: " << *PtrScev << "\n");
1392 return std::nullopt;
1393 }
1394
1395 // The access function must stride over the innermost loop.
1396 if (Lp != AR->getLoop()) {
1397 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop "
1398 << *Ptr << " SCEV: " << *AR << "\n");
1399 return std::nullopt;
1400 }
1401
1402 // The address calculation must not wrap. Otherwise, a dependence could be
1403 // inverted.
1404 // An inbounds getelementptr that is a AddRec with a unit stride
1405 // cannot wrap per definition. The unit stride requirement is checked later.
1406 // An getelementptr without an inbounds attribute and unit stride would have
1407 // to access the pointer value "0" which is undefined behavior in address
1408 // space 0, therefore we can also vectorize this case.
1409 unsigned AddrSpace = Ty->getPointerAddressSpace();
1410 bool IsInBoundsGEP = isInBoundsGep(Ptr);
1411 bool IsNoWrapAddRec = !ShouldCheckWrap ||
1413 isNoWrapAddRec(Ptr, AR, PSE, Lp);
1414 if (!IsNoWrapAddRec && !IsInBoundsGEP &&
1415 NullPointerIsDefined(Lp->getHeader()->getParent(), AddrSpace)) {
1416 if (Assume) {
1418 IsNoWrapAddRec = true;
1419 LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap in the address space:\n"
1420 << "LAA: Pointer: " << *Ptr << "\n"
1421 << "LAA: SCEV: " << *AR << "\n"
1422 << "LAA: Added an overflow assumption\n");
1423 } else {
1424 LLVM_DEBUG(
1425 dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
1426 << *Ptr << " SCEV: " << *AR << "\n");
1427 return std::nullopt;
1428 }
1429 }
1430
1431 // Check the step is constant.
1432 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
1433
1434 // Calculate the pointer stride and check if it is constant.
1435 const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
1436 if (!C) {
1437 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr
1438 << " SCEV: " << *AR << "\n");
1439 return std::nullopt;
1440 }
1441
1442 auto &DL = Lp->getHeader()->getModule()->getDataLayout();
1443 TypeSize AllocSize = DL.getTypeAllocSize(AccessTy);
1444 int64_t Size = AllocSize.getFixedValue();
1445 const APInt &APStepVal = C->getAPInt();
1446
1447 // Huge step value - give up.
1448 if (APStepVal.getBitWidth() > 64)
1449 return std::nullopt;
1450
1451 int64_t StepVal = APStepVal.getSExtValue();
1452
1453 // Strided access.
1454 int64_t Stride = StepVal / Size;
1455 int64_t Rem = StepVal % Size;
1456 if (Rem)
1457 return std::nullopt;
1458
1459 // If the SCEV could wrap but we have an inbounds gep with a unit stride we
1460 // know we can't "wrap around the address space". In case of address space
1461 // zero we know that this won't happen without triggering undefined behavior.
1462 if (!IsNoWrapAddRec && Stride != 1 && Stride != -1 &&
1463 (IsInBoundsGEP || !NullPointerIsDefined(Lp->getHeader()->getParent(),
1464 AddrSpace))) {
1465 if (Assume) {
1466 // We can avoid this case by adding a run-time check.
1467 LLVM_DEBUG(dbgs() << "LAA: Non unit strided pointer which is not either "
1468 << "inbounds or in address space 0 may wrap:\n"
1469 << "LAA: Pointer: " << *Ptr << "\n"
1470 << "LAA: SCEV: " << *AR << "\n"
1471 << "LAA: Added an overflow assumption\n");
1473 } else
1474 return std::nullopt;
1475 }
1476
1477 return Stride;
1478}
1479
1480std::optional<int> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA,
1481 Type *ElemTyB, Value *PtrB,
1482 const DataLayout &DL,
1483 ScalarEvolution &SE, bool StrictCheck,
1484 bool CheckType) {
1485 assert(PtrA && PtrB && "Expected non-nullptr pointers.");
1486 assert(cast<PointerType>(PtrA->getType())
1487 ->isOpaqueOrPointeeTypeMatches(ElemTyA) && "Wrong PtrA type");
1488 assert(cast<PointerType>(PtrB->getType())
1489 ->isOpaqueOrPointeeTypeMatches(ElemTyB) && "Wrong PtrB type");
1490
1491 // Make sure that A and B are different pointers.
1492 if (PtrA == PtrB)
1493 return 0;
1494
1495 // Make sure that the element types are the same if required.
1496 if (CheckType && ElemTyA != ElemTyB)
1497 return std::nullopt;
1498
1499 unsigned ASA = PtrA->getType()->getPointerAddressSpace();
1500 unsigned ASB = PtrB->getType()->getPointerAddressSpace();
1501
1502 // Check that the address spaces match.
1503 if (ASA != ASB)
1504 return std::nullopt;
1505 unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
1506
1507 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
1508 Value *PtrA1 = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1509 Value *PtrB1 = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1510
1511 int Val;
1512 if (PtrA1 == PtrB1) {
1513 // Retrieve the address space again as pointer stripping now tracks through
1514 // `addrspacecast`.
1515 ASA = cast<PointerType>(PtrA1->getType())->getAddressSpace();
1516 ASB = cast<PointerType>(PtrB1->getType())->getAddressSpace();
1517 // Check that the address spaces match and that the pointers are valid.
1518 if (ASA != ASB)
1519 return std::nullopt;
1520
1521 IdxWidth = DL.getIndexSizeInBits(ASA);
1522 OffsetA = OffsetA.sextOrTrunc(IdxWidth);
1523 OffsetB = OffsetB.sextOrTrunc(IdxWidth);
1524
1525 OffsetB -= OffsetA;
1526 Val = OffsetB.getSExtValue();
1527 } else {
1528 // Otherwise compute the distance with SCEV between the base pointers.
1529 const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
1530 const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
1531 const auto *Diff =
1532 dyn_cast<SCEVConstant>(SE.getMinusSCEV(PtrSCEVB, PtrSCEVA));
1533 if (!Diff)
1534 return std::nullopt;
1535 Val = Diff->getAPInt().getSExtValue();
1536 }
1537 int Size = DL.getTypeStoreSize(ElemTyA);
1538 int Dist = Val / Size;
1539
1540 // Ensure that the calculated distance matches the type-based one after all
1541 // the bitcasts removal in the provided pointers.
1542 if (!StrictCheck || Dist * Size == Val)
1543 return Dist;
1544 return std::nullopt;
1545}
1546
1548 const DataLayout &DL, ScalarEvolution &SE,
1549 SmallVectorImpl<unsigned> &SortedIndices) {
1551 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
1552 "Expected list of pointer operands.");
1553 // Walk over the pointers, and map each of them to an offset relative to
1554 // first pointer in the array.
1555 Value *Ptr0 = VL[0];
1556
1557 using DistOrdPair = std::pair<int64_t, int>;
1558 auto Compare = llvm::less_first();
1559 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
1560 Offsets.emplace(0, 0);
1561 int Cnt = 1;
1562 bool IsConsecutive = true;
1563 for (auto *Ptr : VL.drop_front()) {
1564 std::optional<int> Diff = getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, DL, SE,
1565 /*StrictCheck=*/true);
1566 if (!Diff)
1567 return false;
1568
1569 // Check if the pointer with the same offset is found.
1570 int64_t Offset = *Diff;
1571 auto Res = Offsets.emplace(Offset, Cnt);
1572 if (!Res.second)
1573 return false;
1574 // Consecutive order if the inserted element is the last one.
1575 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
1576 ++Cnt;
1577 }
1578 SortedIndices.clear();
1579 if (!IsConsecutive) {
1580 // Fill SortedIndices array only if it is non-consecutive.
1581 SortedIndices.resize(VL.size());
1582 Cnt = 0;
1583 for (const std::pair<int64_t, int> &Pair : Offsets) {
1584 SortedIndices[Cnt] = Pair.second;
1585 ++Cnt;
1586 }
1587 }
1588 return true;
1589}
1590
1591/// Returns true if the memory operations \p A and \p B are consecutive.
1593 ScalarEvolution &SE, bool CheckType) {
1596 if (!PtrA || !PtrB)
1597 return false;
1598 Type *ElemTyA = getLoadStoreType(A);
1599 Type *ElemTyB = getLoadStoreType(B);
1600 std::optional<int> Diff =
1601 getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE,
1602 /*StrictCheck=*/true, CheckType);
1603 return Diff && *Diff == 1;
1604}
1605
1607 visitPointers(SI->getPointerOperand(), *InnermostLoop,
1608 [this, SI](Value *Ptr) {
1609 Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
1610 InstMap.push_back(SI);
1611 ++AccessIdx;
1612 });
1613}
1614
1616 visitPointers(LI->getPointerOperand(), *InnermostLoop,
1617 [this, LI](Value *Ptr) {
1618 Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
1619 InstMap.push_back(LI);
1620 ++AccessIdx;
1621 });
1622}
1623
1626 switch (Type) {
1627 case NoDep:
1628 case Forward:
1631
1632 case Unknown:
1635 case Backward:
1638 }
1639 llvm_unreachable("unexpected DepType!");
1640}
1641
1643 switch (Type) {
1644 case NoDep:
1645 case Forward:
1646 case ForwardButPreventsForwarding:
1647 case Unknown:
1648 return false;
1649
1650 case BackwardVectorizable:
1651 case Backward:
1652 case BackwardVectorizableButPreventsForwarding:
1653 return true;
1654 }
1655 llvm_unreachable("unexpected DepType!");
1656}
1657
1659 return isBackward() || Type == Unknown;
1660}
1661
1663 switch (Type) {
1664 case Forward:
1665 case ForwardButPreventsForwarding:
1666 return true;
1667
1668 case NoDep:
1669 case Unknown:
1670 case BackwardVectorizable:
1671 case Backward:
1672 case BackwardVectorizableButPreventsForwarding:
1673 return false;
1674 }
1675 llvm_unreachable("unexpected DepType!");
1676}
1677
1678bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
1679 uint64_t TypeByteSize) {
1680 // If loads occur at a distance that is not a multiple of a feasible vector
1681 // factor store-load forwarding does not take place.
1682 // Positive dependences might cause troubles because vectorizing them might
1683 // prevent store-load forwarding making vectorized code run a lot slower.
1684 // a[i] = a[i-3] ^ a[i-8];
1685 // The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and
1686 // hence on your typical architecture store-load forwarding does not take
1687 // place. Vectorizing in such cases does not make sense.
1688 // Store-load forwarding distance.
1689
1690 // After this many iterations store-to-load forwarding conflicts should not
1691 // cause any slowdowns.
1692 const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
1693 // Maximum vector factor.
1694 uint64_t MaxVFWithoutSLForwardIssues = std::min(
1695 VectorizerParams::MaxVectorWidth * TypeByteSize, MaxSafeDepDistBytes);
1696
1697 // Compute the smallest VF at which the store and load would be misaligned.
1698 for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues;
1699 VF *= 2) {
1700 // If the number of vector iteration between the store and the load are
1701 // small we could incur conflicts.
1702 if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
1703 MaxVFWithoutSLForwardIssues = (VF >> 1);
1704 break;
1705 }
1706 }
1707
1708 if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
1709 LLVM_DEBUG(
1710 dbgs() << "LAA: Distance " << Distance
1711 << " that could cause a store-load forwarding conflict\n");
1712 return true;
1713 }
1714
1715 if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
1716 MaxVFWithoutSLForwardIssues !=
1717 VectorizerParams::MaxVectorWidth * TypeByteSize)
1718 MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
1719 return false;
1720}
1721
1722void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
1723 if (Status < S)
1724 Status = S;
1725}
1726
1727/// Given a dependence-distance \p Dist between two
1728/// memory accesses, that have the same stride whose absolute value is given
1729/// in \p Stride, and that have the same type size \p TypeByteSize,
1730/// in a loop whose takenCount is \p BackedgeTakenCount, check if it is
1731/// possible to prove statically that the dependence distance is larger
1732/// than the range that the accesses will travel through the execution of
1733/// the loop. If so, return true; false otherwise. This is useful for
1734/// example in loops such as the following (PR31098):
1735/// for (i = 0; i < D; ++i) {
1736/// = out[i];
1737/// out[i+D] =
1738/// }
1740 const SCEV &BackedgeTakenCount,
1741 const SCEV &Dist, uint64_t Stride,
1742 uint64_t TypeByteSize) {
1743
1744 // If we can prove that
1745 // (**) |Dist| > BackedgeTakenCount * Step
1746 // where Step is the absolute stride of the memory accesses in bytes,
1747 // then there is no dependence.
1748 //
1749 // Rationale:
1750 // We basically want to check if the absolute distance (|Dist/Step|)
1751 // is >= the loop iteration count (or > BackedgeTakenCount).
1752 // This is equivalent to the Strong SIV Test (Practical Dependence Testing,
1753 // Section 4.2.1); Note, that for vectorization it is sufficient to prove
1754 // that the dependence distance is >= VF; This is checked elsewhere.
1755 // But in some cases we can prune dependence distances early, and
1756 // even before selecting the VF, and without a runtime test, by comparing
1757 // the distance against the loop iteration count. Since the vectorized code
1758 // will be executed only if LoopCount >= VF, proving distance >= LoopCount
1759 // also guarantees that distance >= VF.
1760 //
1761 const uint64_t ByteStride = Stride * TypeByteSize;
1762 const SCEV *Step = SE.getConstant(BackedgeTakenCount.getType(), ByteStride);
1763 const SCEV *Product = SE.getMulExpr(&BackedgeTakenCount, Step);
1764
1765 const SCEV *CastedDist = &Dist;
1766 const SCEV *CastedProduct = Product;
1767 uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Dist.getType());
1768 uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Product->getType());
1769
1770 // The dependence distance can be positive/negative, so we sign extend Dist;
1771 // The multiplication of the absolute stride in bytes and the
1772 // backedgeTakenCount is non-negative, so we zero extend Product.
1773 if (DistTypeSizeBits > ProductTypeSizeBits)
1774 CastedProduct = SE.getZeroExtendExpr(Product, Dist.getType());
1775 else
1776 CastedDist = SE.getNoopOrSignExtend(&Dist, Product->getType());
1777
1778 // Is Dist - (BackedgeTakenCount * Step) > 0 ?
1779 // (If so, then we have proven (**) because |Dist| >= Dist)
1780 const SCEV *Minus = SE.getMinusSCEV(CastedDist, CastedProduct);
1781 if (SE.isKnownPositive(Minus))
1782 return true;
1783
1784 // Second try: Is -Dist - (BackedgeTakenCount * Step) > 0 ?
1785 // (If so, then we have proven (**) because |Dist| >= -1*Dist)
1786 const SCEV *NegDist = SE.getNegativeSCEV(CastedDist);
1787 Minus = SE.getMinusSCEV(NegDist, CastedProduct);
1788 if (SE.isKnownPositive(Minus))
1789 return true;
1790
1791 return false;
1792}
1793
1794/// Check the dependence for two accesses with the same stride \p Stride.
1795/// \p Distance is the positive distance and \p TypeByteSize is type size in
1796/// bytes.
1797///
1798/// \returns true if they are independent.
1800 uint64_t TypeByteSize) {
1801 assert(Stride > 1 && "The stride must be greater than 1");
1802 assert(TypeByteSize > 0 && "The type size in byte must be non-zero");
1803 assert(Distance > 0 && "The distance must be non-zero");
1804
1805 // Skip if the distance is not multiple of type byte size.
1806 if (Distance % TypeByteSize)
1807 return false;
1808
1809 uint64_t ScaledDist = Distance / TypeByteSize;
1810
1811 // No dependence if the scaled distance is not multiple of the stride.
1812 // E.g.
1813 // for (i = 0; i < 1024 ; i += 4)
1814 // A[i+2] = A[i] + 1;
1815 //
1816 // Two accesses in memory (scaled distance is 2, stride is 4):
1817 // | A[0] | | | | A[4] | | | |
1818 // | | | A[2] | | | | A[6] | |
1819 //
1820 // E.g.
1821 // for (i = 0; i < 1024 ; i += 3)
1822 // A[i+4] = A[i] + 1;
1823 //
1824 // Two accesses in memory (scaled distance is 4, stride is 3):
1825 // | A[0] | | | A[3] | | | A[6] | | |
1826 // | | | | | A[4] | | | A[7] | |
1827 return ScaledDist % Stride;
1828}
1829
1831MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
1832 const MemAccessInfo &B, unsigned BIdx,
1833 const ValueToValueMap &Strides) {
1834 assert (AIdx < BIdx && "Must pass arguments in program order");
1835
1836 auto [APtr, AIsWrite] = A;
1837 auto [BPtr, BIsWrite] = B;
1838 Type *ATy = getLoadStoreType(InstMap[AIdx]);
1839 Type *BTy = getLoadStoreType(InstMap[BIdx]);
1840
1841 // Two reads are independent.
1842 if (!AIsWrite && !BIsWrite)
1843 return Dependence::NoDep;
1844
1845 // We cannot check pointers in different address spaces.
1846 if (APtr->getType()->getPointerAddressSpace() !=
1847 BPtr->getType()->getPointerAddressSpace())
1848 return Dependence::Unknown;
1849
1850 int64_t StrideAPtr =
1851 getPtrStride(PSE, ATy, APtr, InnermostLoop, Strides, true).value_or(0);
1852 int64_t StrideBPtr =
1853 getPtrStride(PSE, BTy, BPtr, InnermostLoop, Strides, true).value_or(0);
1854
1855 const SCEV *Src = PSE.getSCEV(APtr);
1856 const SCEV *Sink = PSE.getSCEV(BPtr);
1857
1858 // If the induction step is negative we have to invert source and sink of the
1859 // dependence.
1860 if (StrideAPtr < 0) {
1861 std::swap(APtr, BPtr);
1862 std::swap(ATy, BTy);
1863 std::swap(Src, Sink);
1864 std::swap(AIsWrite, BIsWrite);
1865 std::swap(AIdx, BIdx);
1866 std::swap(StrideAPtr, StrideBPtr);
1867 }
1868
1869 ScalarEvolution &SE = *PSE.getSE();
1870 const SCEV *Dist = SE.getMinusSCEV(Sink, Src);
1871
1872 LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
1873 << "(Induction step: " << StrideAPtr << ")\n");
1874 LLVM_DEBUG(dbgs() << "LAA: Distance for " << *InstMap[AIdx] << " to "
1875 << *InstMap[BIdx] << ": " << *Dist << "\n");
1876
1877 // Need accesses with constant stride. We don't want to vectorize
1878 // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
1879 // the address space.
1880 if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){
1881 LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
1882 return Dependence::Unknown;
1883 }
1884
1885 auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
1886 uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
1887 bool HasSameSize =
1888 DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
1889 uint64_t Stride = std::abs(StrideAPtr);
1890
1891 if (!isa<SCEVCouldNotCompute>(Dist) && HasSameSize &&
1893 Stride, TypeByteSize))
1894 return Dependence::NoDep;
1895
1896 const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
1897 if (!C) {
1898 LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
1899 FoundNonConstantDistanceDependence = true;
1900 return Dependence::Unknown;
1901 }
1902
1903 const APInt &Val = C->getAPInt();
1904 int64_t Distance = Val.getSExtValue();
1905
1906 // Attempt to prove strided accesses independent.
1907 if (std::abs(Distance) > 0 && Stride > 1 && HasSameSize &&
1908 areStridedAccessesIndependent(std::abs(Distance), Stride, TypeByteSize)) {
1909 LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
1910 return Dependence::NoDep;
1911 }
1912
1913 // Negative distances are not plausible dependencies.
1914 if (Val.isNegative()) {
1915 bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
1916 if (IsTrueDataDependence && EnableForwardingConflictDetection &&
1917 (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) ||
1918 !HasSameSize)) {
1919 LLVM_DEBUG(dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
1921 }
1922
1923 LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n");
1924 return Dependence::Forward;
1925 }
1926
1927 // Write to the same location with the same size.
1928 if (Val == 0) {
1929 if (HasSameSize)
1930 return Dependence::Forward;
1931 LLVM_DEBUG(
1932 dbgs() << "LAA: Zero dependence difference but different type sizes\n");
1933 return Dependence::Unknown;
1934 }
1935
1936 assert(Val.isStrictlyPositive() && "Expect a positive value");
1937
1938 if (!HasSameSize) {
1939 LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
1940 "different type sizes\n");
1941 return Dependence::Unknown;
1942 }
1943
1944 // Bail out early if passed-in parameters make vectorization not feasible.
1945 unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
1947 unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
1949 // The minimum number of iterations for a vectorized/unrolled version.
1950 unsigned MinNumIter = std::max(ForcedFactor * ForcedUnroll, 2U);
1951
1952 // It's not vectorizable if the distance is smaller than the minimum distance
1953 // needed for a vectroized/unrolled version. Vectorizing one iteration in
1954 // front needs TypeByteSize * Stride. Vectorizing the last iteration needs
1955 // TypeByteSize (No need to plus the last gap distance).
1956 //
1957 // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
1958 // foo(int *A) {
1959 // int *B = (int *)((char *)A + 14);
1960 // for (i = 0 ; i < 1024 ; i += 2)
1961 // B[i] = A[i] + 1;
1962 // }
1963 //
1964 // Two accesses in memory (stride is 2):
1965 // | A[0] | | A[2] | | A[4] | | A[6] | |
1966 // | B[0] | | B[2] | | B[4] |
1967 //
1968 // Distance needs for vectorizing iterations except the last iteration:
1969 // 4 * 2 * (MinNumIter - 1). Distance needs for the last iteration: 4.
1970 // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
1971 //
1972 // If MinNumIter is 2, it is vectorizable as the minimum distance needed is
1973 // 12, which is less than distance.
1974 //
1975 // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
1976 // the minimum distance needed is 28, which is greater than distance. It is
1977 // not safe to do vectorization.
1978 uint64_t MinDistanceNeeded =
1979 TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize;
1980 if (MinDistanceNeeded > static_cast<uint64_t>(Distance)) {
1981 LLVM_DEBUG(dbgs() << "LAA: Failure because of positive distance "
1982 << Distance << '\n');
1983 return Dependence::Backward;
1984 }
1985
1986 // Unsafe if the minimum distance needed is greater than max safe distance.
1987 if (MinDistanceNeeded > MaxSafeDepDistBytes) {
1988 LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
1989 << MinDistanceNeeded << " size in bytes\n");
1990 return Dependence::Backward;
1991 }
1992
1993 // Positive distance bigger than max vectorization factor.
1994 // FIXME: Should use max factor instead of max distance in bytes, which could
1995 // not handle different types.
1996 // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
1997 // void foo (int *A, char *B) {
1998 // for (unsigned i = 0; i < 1024; i++) {
1999 // A[i+2] = A[i] + 1;
2000 // B[i+2] = B[i] + 1;
2001 // }
2002 // }
2003 //
2004 // This case is currently unsafe according to the max safe distance. If we
2005 // analyze the two accesses on array B, the max safe dependence distance
2006 // is 2. Then we analyze the accesses on array A, the minimum distance needed
2007 // is 8, which is less than 2 and forbidden vectorization, But actually
2008 // both A and B could be vectorized by 2 iterations.
2009 MaxSafeDepDistBytes =
2010 std::min(static_cast<uint64_t>(Distance), MaxSafeDepDistBytes);
2011
2012 bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
2013 if (IsTrueDataDependence && EnableForwardingConflictDetection &&
2014 couldPreventStoreLoadForward(Distance, TypeByteSize))
2016
2017 uint64_t MaxVF = MaxSafeDepDistBytes / (TypeByteSize * Stride);
2018 LLVM_DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue()
2019 << " with max VF = " << MaxVF << '\n');
2020 uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
2021 MaxSafeVectorWidthInBits = std::min(MaxSafeVectorWidthInBits, MaxVFInBits);
2023}
2024
2026 MemAccessInfoList &CheckDeps,
2027 const ValueToValueMap &Strides) {
2028
2029 MaxSafeDepDistBytes = -1;
2031 for (MemAccessInfo CurAccess : CheckDeps) {
2032 if (Visited.count(CurAccess))
2033 continue;
2034
2035 // Get the relevant memory access set.
2037 AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
2038
2039 // Check accesses within this set.
2041 AccessSets.member_begin(I);
2043 AccessSets.member_end();
2044
2045 // Check every access pair.
2046 while (AI != AE) {
2047 Visited.insert(*AI);
2048 bool AIIsWrite = AI->getInt();
2049 // Check loads only against next equivalent class, but stores also against
2050 // other stores in the same equivalence class - to the same address.
2052 (AIIsWrite ? AI : std::next(AI));
2053 while (OI != AE) {
2054 // Check every accessing instruction pair in program order.
2055 for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
2056 I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
2057 // Scan all accesses of another equivalence class, but only the next
2058 // accesses of the same equivalent class.
2059 for (std::vector<unsigned>::iterator
2060 I2 = (OI == AI ? std::next(I1) : Accesses[*OI].begin()),
2061 I2E = (OI == AI ? I1E : Accesses[*OI].end());
2062 I2 != I2E; ++I2) {
2063 auto A = std::make_pair(&*AI, *I1);
2064 auto B = std::make_pair(&*OI, *I2);
2065
2066 assert(*I1 != *I2);
2067 if (*I1 > *I2)
2068 std::swap(A, B);
2069
2071 isDependent(*A.first, A.second, *B.first, B.second, Strides);
2073
2074 // Gather dependences unless we accumulated MaxDependences
2075 // dependences. In that case return as soon as we find the first
2076 // unsafe dependence. This puts a limit on this quadratic
2077 // algorithm.
2078 if (RecordDependences) {
2079 if (Type != Dependence::NoDep)
2080 Dependences.push_back(Dependence(A.second, B.second, Type));
2081
2082 if (Dependences.size() >= MaxDependences) {
2083 RecordDependences = false;
2084 Dependences.clear();
2086 << "Too many dependences, stopped recording\n");
2087 }
2088 }
2089 if (!RecordDependences && !isSafeForVectorization())
2090 return false;
2091 }
2092 ++OI;
2093 }
2094 AI++;
2095 }
2096 }
2097
2098 LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
2099 return isSafeForVectorization();
2100}
2101
2104 MemAccessInfo Access(Ptr, isWrite);
2105 auto &IndexVector = Accesses.find(Access)->second;
2106
2108 transform(IndexVector,
2109 std::back_inserter(Insts),
2110 [&](unsigned Idx) { return this->InstMap[Idx]; });
2111 return Insts;
2112}
2113
2115 "NoDep", "Unknown", "Forward", "ForwardButPreventsForwarding", "Backward",
2116 "BackwardVectorizable", "BackwardVectorizableButPreventsForwarding"};
2117
2119 raw_ostream &OS, unsigned Depth,
2120 const SmallVectorImpl<Instruction *> &Instrs) const {
2121 OS.indent(Depth) << DepName[Type] << ":\n";
2122 OS.indent(Depth + 2) << *Instrs[Source] << " -> \n";
2123 OS.indent(Depth + 2) << *Instrs[Destination] << "\n";
2124}
2125
2126bool LoopAccessInfo::canAnalyzeLoop() {
2127 // We need to have a loop header.
2128 LLVM_DEBUG(dbgs() << "LAA: Found a loop in "
2129 << TheLoop->getHeader()->getParent()->getName() << ": "
2130 << TheLoop->getHeader()->getName() << '\n');
2131
2132 // We can only analyze innermost loops.
2133 if (!TheLoop->isInnermost()) {
2134 LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
2135 recordAnalysis("NotInnerMostLoop") << "loop is not the innermost loop";
2136 return false;
2137 }
2138
2139 // We must have a single backedge.
2140 if (TheLoop->getNumBackEdges() != 1) {
2141 LLVM_DEBUG(
2142 dbgs() << "LAA: loop control flow is not understood by analyzer\n");
2143 recordAnalysis("CFGNotUnderstood")
2144 << "loop control flow is not understood by analyzer";
2145 return false;
2146 }
2147
2148 // ScalarEvolution needs to be able to find the exit count.
2149 const SCEV *ExitCount = PSE->getBackedgeTakenCount();
2150 if (isa<SCEVCouldNotCompute>(ExitCount)) {
2151 recordAnalysis("CantComputeNumberOfIterations")
2152 << "could not determine number of loop iterations";
2153 LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
2154 return false;
2155 }
2156
2157 return true;
2158}
2159
2160void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
2161 const TargetLibraryInfo *TLI,
2162 DominatorTree *DT) {
2163 // Holds the Load and Store instructions.
2166
2167 // Holds all the different accesses in the loop.
2168 unsigned NumReads = 0;
2169 unsigned NumReadWrites = 0;
2170
2171 bool HasComplexMemInst = false;
2172
2173 // A runtime check is only legal to insert if there are no convergent calls.
2174 HasConvergentOp = false;
2175
2176 PtrRtChecking->Pointers.clear();
2177 PtrRtChecking->Need = false;
2178
2179 const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
2180
2181 const bool EnableMemAccessVersioningOfLoop =
2183 !TheLoop->getHeader()->getParent()->hasOptSize();
2184
2185 // Traverse blocks in fixed RPOT order, regardless of their storage in the
2186 // loop info, as it may be arbitrary.
2187 LoopBlocksRPO RPOT(TheLoop);
2188 RPOT.perform(LI);
2189 for (BasicBlock *BB : RPOT) {
2190 // Scan the BB and collect legal loads and stores. Also detect any
2191 // convergent instructions.
2192 for (Instruction &I : *BB) {
2193 if (auto *Call = dyn_cast<CallBase>(&I)) {
2194 if (Call->isConvergent())
2195 HasConvergentOp = true;
2196 }
2197
2198 // With both a non-vectorizable memory instruction and a convergent
2199 // operation, found in this loop, no reason to continue the search.
2200 if (HasComplexMemInst && HasConvergentOp) {
2201 CanVecMem = false;
2202 return;
2203 }
2204
2205 // Avoid hitting recordAnalysis multiple times.
2206 if (HasComplexMemInst)
2207 continue;
2208
2209 // If this is a load, save it. If this instruction can read from memory
2210 // but is not a load, then we quit. Notice that we don't handle function
2211 // calls that read or write.
2212 if (I.mayReadFromMemory()) {
2213 // Many math library functions read the rounding mode. We will only
2214 // vectorize a loop if it contains known function calls that don't set
2215 // the flag. Therefore, it is safe to ignore this read from memory.
2216 auto *Call = dyn_cast<CallInst>(&I);
2217 if (Call && getVectorIntrinsicIDForCall(Call, TLI))
2218 continue;
2219
2220 // If the function has an explicit vectorized counterpart, we can safely
2221 // assume that it can be vectorized.
2222 if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
2223 !VFDatabase::getMappings(*Call).empty())
2224 continue;
2225
2226 auto *Ld = dyn_cast<LoadInst>(&I);
2227 if (!Ld) {
2228 recordAnalysis("CantVectorizeInstruction", Ld)
2229 << "instruction cannot be vectorized";
2230 HasComplexMemInst = true;
2231 continue;
2232 }
2233 if (!Ld->isSimple() && !IsAnnotatedParallel) {
2234 recordAnalysis("NonSimpleLoad", Ld)
2235 << "read with atomic ordering or volatile read";
2236 LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
2237 HasComplexMemInst = true;
2238 continue;
2239 }
2240 NumLoads++;
2241 Loads.push_back(Ld);
2242 DepChecker->addAccess(Ld);
2243 if (EnableMemAccessVersioningOfLoop)
2244 collectStridedAccess(Ld);
2245 continue;
2246 }
2247
2248 // Save 'store' instructions. Abort if other instructions write to memory.
2249 if (I.mayWriteToMemory()) {
2250 auto *St = dyn_cast<StoreInst>(&I);
2251 if (!St) {
2252 recordAnalysis("CantVectorizeInstruction", St)
2253 << "instruction cannot be vectorized";
2254 HasComplexMemInst = true;
2255 continue;
2256 }
2257 if (!St->isSimple() && !IsAnnotatedParallel) {
2258 recordAnalysis("NonSimpleStore", St)
2259 << "write with atomic ordering or volatile write";
2260 LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
2261 HasComplexMemInst = true;
2262 continue;
2263 }
2264 NumStores++;
2265 Stores.push_back(St);
2266 DepChecker->addAccess(St);
2267 if (EnableMemAccessVersioningOfLoop)
2268 collectStridedAccess(St);
2269 }
2270 } // Next instr.
2271 } // Next block.
2272
2273 if (HasComplexMemInst) {
2274 CanVecMem = false;
2275 return;
2276 }
2277
2278 // Now we have two lists that hold the loads and the stores.
2279 // Next, we find the pointers that they use.
2280
2281 // Check if we see any stores. If there are no stores, then we don't
2282 // care if the pointers are *restrict*.
2283 if (!Stores.size()) {
2284 LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
2285 CanVecMem = true;
2286 return;
2287 }
2288
2289 MemoryDepChecker::DepCandidates DependentAccesses;
2290 AccessAnalysis Accesses(TheLoop, AA, LI, DependentAccesses, *PSE);
2291
2292 // Holds the analyzed pointers. We don't want to call getUnderlyingObjects
2293 // multiple times on the same object. If the ptr is accessed twice, once
2294 // for read and once for write, it will only appear once (on the write
2295 // list). This is okay, since we are going to check for conflicts between
2296 // writes and between reads and writes, but not between reads and reads.
2298
2299 // Record uniform store addresses to identify if we have multiple stores
2300 // to the same address.
2301 SmallPtrSet<Value *, 16> UniformStores;
2302
2303 for (StoreInst *ST : Stores) {
2304 Value *Ptr = ST->getPointerOperand();
2305
2306 if (isUniform(Ptr)) {
2307 // Record store instructions to loop invariant addresses
2308 StoresToInvariantAddresses.push_back(ST);
2309 HasDependenceInvolvingLoopInvariantAddress |=
2310 !UniformStores.insert(Ptr).second;
2311 }
2312
2313 // If we did *not* see this pointer before, insert it to the read-write
2314 // list. At this phase it is only a 'write' list.
2315 Type *AccessTy = getLoadStoreType(ST);
2316 if (Seen.insert({Ptr, AccessTy}).second) {
2317 ++NumReadWrites;
2318
2320 // The TBAA metadata could have a control dependency on the predication
2321 // condition, so we cannot rely on it when determining whether or not we
2322 // need runtime pointer checks.
2323 if (blockNeedsPredication(ST->getParent(), TheLoop, DT))
2324 Loc.AATags.TBAA = nullptr;
2325
2326 visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
2327 [&Accesses, AccessTy, Loc](Value *Ptr) {
2328 MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
2329 Accesses.addStore(NewLoc, AccessTy);
2330 });
2331 }
2332 }
2333
2334 if (IsAnnotatedParallel) {
2335 LLVM_DEBUG(
2336 dbgs() << "LAA: A loop annotated parallel, ignore memory dependency "
2337 << "checks.\n");
2338 CanVecMem = true;
2339 return;
2340 }
2341
2342 for (LoadInst *LD : Loads) {
2343 Value *Ptr = LD->getPointerOperand();
2344 // If we did *not* see this pointer before, insert it to the
2345 // read list. If we *did* see it before, then it is already in
2346 // the read-write list. This allows us to vectorize expressions
2347 // such as A[i] += x; Because the address of A[i] is a read-write
2348 // pointer. This only works if the index of A[i] is consecutive.
2349 // If the address of i is unknown (for example A[B[i]]) then we may
2350 // read a few words, modify, and write a few words, and some of the
2351 // words may be written to the same address.
2352 bool IsReadOnlyPtr = false;
2353 Type *AccessTy = getLoadStoreType(LD);
2354 if (Seen.insert({Ptr, AccessTy}).second ||
2355 !getPtrStride(*PSE, LD->getType(), Ptr, TheLoop, SymbolicStrides).value_or(0)) {
2356 ++NumReads;
2357 IsReadOnlyPtr = true;
2358 }
2359
2360 // See if there is an unsafe dependency between a load to a uniform address and
2361 // store to the same uniform address.
2362 if (UniformStores.count(Ptr)) {
2363 LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
2364 "load and uniform store to the same address!\n");
2365 HasDependenceInvolvingLoopInvariantAddress = true;
2366 }
2367
2369 // The TBAA metadata could have a control dependency on the predication
2370 // condition, so we cannot rely on it when determining whether or not we
2371 // need runtime pointer checks.
2372 if (blockNeedsPredication(LD->getParent(), TheLoop, DT))
2373 Loc.AATags.TBAA = nullptr;
2374
2375 visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
2376 [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) {
2377 MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
2378 Accesses.addLoad(NewLoc, AccessTy, IsReadOnlyPtr);
2379 });
2380 }
2381
2382 // If we write (or read-write) to a single destination and there are no
2383 // other reads in this loop then is it safe to vectorize.
2384 if (NumReadWrites == 1 && NumReads == 0) {
2385 LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
2386 CanVecMem = true;
2387 return;
2388 }
2389
2390 // Build dependence sets and check whether we need a runtime pointer bounds
2391 // check.
2392 Accesses.buildDependenceSets();
2393
2394 // Find pointers with computable bounds. We are going to use this information
2395 // to place a runtime bound check.
2396 Value *UncomputablePtr = nullptr;
2397 bool CanDoRTIfNeeded =
2398 Accesses.canCheckPtrAtRT(*PtrRtChecking, PSE->getSE(), TheLoop,
2399 SymbolicStrides, UncomputablePtr, false);
2400 if (!CanDoRTIfNeeded) {
2401 auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
2402 recordAnalysis("CantIdentifyArrayBounds", I)
2403 << "cannot identify array bounds";
2404 LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
2405 << "the array bounds.\n");
2406 CanVecMem = false;
2407 return;
2408 }
2409
2410 LLVM_DEBUG(
2411 dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");
2412
2413 CanVecMem = true;
2414 if (Accesses.isDependencyCheckNeeded()) {
2415 LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
2416 CanVecMem = DepChecker->areDepsSafe(
2417 DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides);
2418 MaxSafeDepDistBytes = DepChecker->getMaxSafeDepDistBytes();
2419
2420 if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) {
2421 LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
2422
2423 // Clear the dependency checks. We assume they are not needed.
2424 Accesses.resetDepChecks(*DepChecker);
2425
2426 PtrRtChecking->reset();
2427 PtrRtChecking->Need = true;
2428
2429 auto *SE = PSE->getSE();
2430 UncomputablePtr = nullptr;
2431 CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(
2432 *PtrRtChecking, SE, TheLoop, SymbolicStrides, UncomputablePtr, true);
2433
2434 // Check that we found the bounds for the pointer.
2435 if (!CanDoRTIfNeeded) {
2436 auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
2437 recordAnalysis("CantCheckMemDepsAtRunTime", I)
2438 << "cannot check memory dependencies at runtime";
2439 LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
2440 CanVecMem = false;
2441 return;
2442 }
2443
2444 CanVecMem = true;
2445 }
2446 }
2447
2448 if (HasConvergentOp) {
2449 recordAnalysis("CantInsertRuntimeCheckWithConvergent")
2450 << "cannot add control dependency to convergent operation";
2451 LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
2452 "would be needed with a convergent operation\n");
2453 CanVecMem = false;
2454 return;
2455 }
2456
2457 if (CanVecMem)
2458 LLVM_DEBUG(
2459 dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
2460 << (PtrRtChecking->Need ? "" : " don't")
2461 << " need runtime memory checks.\n");
2462 else
2463 emitUnsafeDependenceRemark();
2464}
2465
2466void LoopAccessInfo::emitUnsafeDependenceRemark() {
2467 auto Deps = getDepChecker().getDependences();
2468 if (!Deps)
2469 return;
2470 auto Found = llvm::find_if(*Deps, [](const MemoryDepChecker::Dependence &D) {
2473 });
2474 if (Found == Deps->end())
2475 return;
2476 MemoryDepChecker::Dependence Dep = *Found;
2477
2478 LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");
2479
2480 // Emit remark for first unsafe dependence
2482 recordAnalysis("UnsafeDep", Dep.getDestination(*this))
2483 << "unsafe dependent memory operations in loop. Use "
2484 "#pragma loop distribute(enable) to allow loop distribution "
2485 "to attempt to isolate the offending operations into a separate "
2486 "loop";
2487
2488 switch (Dep.Type) {
2492 llvm_unreachable("Unexpected dependence");
2494 R << "\nBackward loop carried data dependence.";
2495 break;
2497 R << "\nForward loop carried data dependence that prevents "
2498 "store-to-load forwarding.";
2499 break;
2501 R << "\nBackward loop carried data dependence that prevents "
2502 "store-to-load forwarding.";
2503 break;
2505 R << "\nUnknown data dependence.";
2506 break;
2507 }
2508
2509 if (Instruction *I = Dep.getSource(*this)) {
2510 DebugLoc SourceLoc = I->getDebugLoc();
2511 if (auto *DD = dyn_cast_or_null<Instruction>(getPointerOperand(I)))
2512 SourceLoc = DD->getDebugLoc();
2513 if (SourceLoc)
2514 R << " Memory location is the same as accessed at "
2515 << ore::NV("Location", SourceLoc);
2516 }
2517}
2518
2520 DominatorTree *DT) {
2521 assert(TheLoop->contains(BB) && "Unknown block used");
2522
2523 // Blocks that do not dominate the latch need predication.
2524 BasicBlock* Latch = TheLoop->getLoopLatch();
2525 return !DT->dominates(BB, Latch);
2526}
2527
2528OptimizationRemarkAnalysis &LoopAccessInfo::recordAnalysis(StringRef RemarkName,
2529 Instruction *I) {
2530 assert(!Report && "Multiple reports generated");
2531
2532 Value *CodeRegion = TheLoop->getHeader();
2533 DebugLoc DL = TheLoop->getStartLoc();
2534
2535 if (I) {
2536 CodeRegion = I->getParent();
2537 // If there is no debug location attached to the instruction, revert back to
2538 // using the loop's.
2539 if (I->getDebugLoc())
2540 DL = I->getDebugLoc();
2541 }
2542
2543 Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, RemarkName, DL,
2544 CodeRegion);
2545 return *Report;
2546}
2547
2549 auto *SE = PSE->getSE();
2550 // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
2551 // never considered uniform.
2552 // TODO: Is this really what we want? Even without FP SCEV, we may want some
2553 // trivially loop-invariant FP values to be considered uniform.
2554 if (!SE->isSCEVable(V->getType()))
2555 return false;
2556 return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
2557}
2558
2559void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
2560 Value *Ptr = getLoadStorePointerOperand(MemAccess);
2561 if (!Ptr)
2562 return;
2563
2564 Value *Stride = getStrideFromPointer(Ptr, PSE->getSE(), TheLoop);
2565 if (!Stride)
2566 return;
2567
2568 LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
2569 "versioning:");
2570 LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
2571
2572 // Avoid adding the "Stride == 1" predicate when we know that
2573 // Stride >= Trip-Count. Such a predicate will effectively optimize a single
2574 // or zero iteration loop, as Trip-Count <= Stride == 1.
2575 //
2576 // TODO: We are currently not making a very informed decision on when it is
2577 // beneficial to apply stride versioning. It might make more sense that the
2578 // users of this analysis (such as the vectorizer) will trigger it, based on
2579 // their specific cost considerations; For example, in cases where stride
2580 // versioning does not help resolving memory accesses/dependences, the
2581 // vectorizer should evaluate the cost of the runtime test, and the benefit
2582 // of various possible stride specializations, considering the alternatives
2583 // of using gather/scatters (if available).
2584
2585 const SCEV *StrideExpr = PSE->getSCEV(Stride);
2586 const SCEV *BETakenCount = PSE->getBackedgeTakenCount();
2587
2588 // Match the types so we can compare the stride and the BETakenCount.
2589 // The Stride can be positive/negative, so we sign extend Stride;
2590 // The backedgeTakenCount is non-negative, so we zero extend BETakenCount.
2591 const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
2592 uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(StrideExpr->getType());
2593 uint64_t BETypeSizeBits = DL.getTypeSizeInBits(BETakenCount->getType());
2594 const SCEV *CastedStride = StrideExpr;
2595 const SCEV *CastedBECount = BETakenCount;
2596 ScalarEvolution *SE = PSE->getSE();
2597 if (BETypeSizeBits >= StrideTypeSizeBits)
2598 CastedStride = SE->getNoopOrSignExtend(StrideExpr, BETakenCount->getType());
2599 else
2600 CastedBECount = SE->getZeroExtendExpr(BETakenCount, StrideExpr->getType());
2601 const SCEV *StrideMinusBETaken = SE->getMinusSCEV(CastedStride, CastedBECount);
2602 // Since TripCount == BackEdgeTakenCount + 1, checking:
2603 // "Stride >= TripCount" is equivalent to checking:
2604 // Stride - BETakenCount > 0
2605 if (SE->isKnownPositive(StrideMinusBETaken)) {
2606 LLVM_DEBUG(
2607 dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
2608 "Stride==1 predicate will imply that the loop executes "
2609 "at most once.\n");
2610 return;
2611 }
2612 LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n");
2613
2614 SymbolicStrides[Ptr] = Stride;
2615 StrideSet.insert(Stride);
2616}
2617
2619 const TargetLibraryInfo *TLI, AAResults *AA,
2620 DominatorTree *DT, LoopInfo *LI)
2621 : PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
2622 PtrRtChecking(nullptr),
2623 DepChecker(std::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L) {
2624 PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
2625 if (canAnalyzeLoop()) {
2626 analyzeLoop(AA, LI, TLI, DT);
2627 }
2628}
2629
2630void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
2631 if (CanVecMem) {
2632 OS.indent(Depth) << "Memory dependences are safe";
2633 if (MaxSafeDepDistBytes != -1ULL)
2634 OS << " with a maximum dependence distance of " << MaxSafeDepDistBytes
2635 << " bytes";
2636 if (PtrRtChecking->Need)
2637 OS << " with run-time checks";
2638 OS << "\n";
2639 }
2640
2641 if (HasConvergentOp)
2642 OS.indent(Depth) << "Has convergent operation in loop\n";
2643
2644 if (Report)
2645 OS.indent(Depth) << "Report: " << Report->getMsg() << "\n";
2646
2647 if (auto *Dependences = DepChecker->getDependences()) {
2648 OS.indent(Depth) << "Dependences:\n";
2649 for (const auto &Dep : *Dependences) {
2650 Dep.print(OS, Depth + 2, DepChecker->getMemoryInstructions());
2651 OS << "\n";
2652 }
2653 } else
2654 OS.indent(Depth) << "Too many dependences, not recorded\n";
2655
2656 // List the pair of accesses need run-time checks to prove independence.
2657 PtrRtChecking->print(OS, Depth);
2658 OS << "\n";
2659
2660 OS.indent(Depth) << "Non vectorizable stores to invariant address were "
2661 << (HasDependenceInvolvingLoopInvariantAddress ? "" : "not ")
2662 << "found in loop.\n";
2663
2664 OS.indent(Depth) << "SCEV assumptions:\n";
2665 PSE->getPredicate().print(OS, Depth);
2666
2667 OS << "\n";
2668
2669 OS.indent(Depth) << "Expressions re-written:\n";
2670 PSE->print(OS, Depth);
2671}
2672
2674 auto I = LoopAccessInfoMap.insert({&L, nullptr});
2675
2676 if (I.second)
2677 I.first->second =
2678 std::make_unique<LoopAccessInfo>(&L, &SE, TLI, &AA, &DT, &LI);
2679
2680 return *I.first->second;
2681}
2682
2685}
2686
2688 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2689 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2690 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2691 auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
2692 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2693 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2694 LAIs = std::make_unique<LoopAccessInfoManager>(SE, AA, DT, LI, TLI);
2695 return false;
2696}
2697
2703
2704 AU.setPreservesAll();
2705}
2706
2709 return LoopAccessInfoManager(
2713}
2714
2716static const char laa_name[] = "Loop Access Analysis";
2717#define LAA_NAME "loop-accesses"
2718
2725
2727
2728namespace llvm {
2729
2731 return new LoopAccessLegacyAnalysis();
2732 }
2733
2734} // end namespace llvm
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file implements a class to represent arbitrary precision integral constant values and operations...
basic Basic Alias true
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file builds on the ADT/GraphTraits.h file to build generic depth first graph iterator.
uint64_t Size
Generic implementation of equivalence classes through the use Tarjan's efficient union-find algorithm...
#define DEBUG_TYPE
Hexagon Common GEP
IRTranslator LLVM IR MI
#define Check(C,...)
Definition: Lint.cpp:170
static cl::opt< unsigned > MaxDependences("max-dependences", cl::Hidden, cl::desc("Maximum number of dependences collected by " "loop-access analysis (default = 100)"), cl::init(100))
We collect dependences up to this threshold.
static cl::opt< bool > EnableForwardingConflictDetection("store-to-load-forwarding-conflict-detection", cl::Hidden, cl::desc("Enable conflict detection in loop-access analysis"), cl::init(true))
Enable store-to-load forwarding conflict detection.
static void findForkedSCEVs(ScalarEvolution *SE, const Loop *L, Value *Ptr, SmallVectorImpl< PointerIntPair< const SCEV *, 1, bool > > &ScevList, unsigned Depth)
static bool hasComputableBounds(PredicatedScalarEvolution &PSE, Value *Ptr, const SCEV *PtrScev, Loop *L, bool Assume)
Check whether a pointer can participate in a runtime bounds check.
static cl::opt< unsigned > MemoryCheckMergeThreshold("memory-check-merge-threshold", cl::Hidden, cl::desc("Maximum number of comparisons done when trying to merge " "runtime memory checks. (default = 100)"), cl::init(100))
The maximum iterations used to merge memory checks.
static SmallVector< PointerIntPair< const SCEV *, 1, bool > > findForkedPointer(PredicatedScalarEvolution &PSE, const ValueToValueMap &StridesMap, Value *Ptr, const Loop *L)
static cl::opt< unsigned, true > VectorizationInterleave("force-vector-interleave", cl::Hidden, cl::desc("Sets the vectorization interleave count. " "Zero is autoselect."), cl::location(VectorizerParams::VectorizationInterleave))
#define LAA_NAME
static cl::opt< unsigned, true > VectorizationFactor("force-vector-width", cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect."), cl::location(VectorizerParams::VectorizationFactor))
static const char laa_name[]
static cl::opt< unsigned, true > RuntimeMemoryCheckThreshold("runtime-memory-check-threshold", cl::Hidden, cl::desc("When performing memory disambiguation checks at runtime do not " "generate more than this number of comparisons (default = 8)."), cl::location(VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(8))
static void visitPointers(Value *StartPtr, const Loop &InnermostLoop, function_ref< void(Value *)> AddPointer)
static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR, PredicatedScalarEvolution &PSE, const Loop *L)
Return true if an AddRec pointer Ptr is unsigned non-wrapping, i.e.
static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE, const SCEV &BackedgeTakenCount, const SCEV &Dist, uint64_t Stride, uint64_t TypeByteSize)
Given a dependence-distance Dist between two memory accesses, that have the same stride whose absolut...
static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride, uint64_t TypeByteSize)
Check the dependence for two accesses with the same stride Stride.
static bool isInBoundsGep(Value *Ptr)
static const SCEV * getMinFromExprs(const SCEV *I, const SCEV *J, ScalarEvolution *SE)
Compare I and J and return the minimum.
static bool isNoWrap(PredicatedScalarEvolution &PSE, const ValueToValueMap &Strides, Value *Ptr, Type *AccessTy, Loop *L)
Check whether a pointer address cannot wrap.
static cl::opt< unsigned > MaxForkedSCEVDepth("max-forked-scev-depth", cl::Hidden, cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"), cl::init(5))
static cl::opt< bool > EnableMemAccessVersioning("enable-mem-access-versioning", cl::init(true), cl::Hidden, cl::desc("Enable symbolic stride memory access versioning"))
This enables versioning on the strides of symbolically striding memory accesses in code like the foll...
This header provides classes for managing per-loop analyses.
static M68kRelType getType(unsigned Kind, MCSymbolRefExpr::VariantKind &Modifier, bool &IsPCRel)
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file provides utility analysis objects describing memory locations.
uint64_t High
#define P(N)
This header defines various interfaces for pass management in LLVM.
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the PointerIntPair class.
@ SI
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
static LLVM_ATTRIBUTE_ALWAYS_INLINE bool CheckType(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, const TargetLowering *TLI, const DataLayout &DL)
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
static const X86InstrFMA3Group Groups[]
A manager for alias analyses.
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Class for arbitrary precision integers.
Definition: APInt.h:75
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1494
APInt abs() const
Get the absolute value.
Definition: APInt.h:1734
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:312
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1002
bool isStrictlyPositive() const
Determine if this APInt Value is positive.
Definition: APInt.h:339
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1516
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:620
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:774
Represent the analysis usage information of a pass.
void setPreservesAll()
Set by analyses that do not transform their input at all.
AnalysisUsage & addRequiredTransitive()
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:202
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:163
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:158
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:112
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:146
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:114
A debug info location.
Definition: DebugLoc.h:33
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:150
iterator end()
Definition: DenseMap.h:84
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:207
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:314
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
EquivalenceClasses - This represents a collection of equivalence classes and supports three efficient...
iterator findValue(const ElemTy &V) const
findValue - Return an iterator to the specified value.
iterator insert(const ElemTy &Data)
insert - Insert a new value into the union/find set, ignoring the request if the value already exists...
member_iterator member_end() const
typename std::set< ECValue, ECValueComparator >::const_iterator iterator
iterator* - Provides a way to iterate over all values in the set.
member_iterator member_begin(iterator I) const
member_iterator unionSets(const ElemTy &V1, const ElemTy &V2)
union - Merge the two equivalence sets for the specified values, inserting them if they do not alread...
const ElemTy & getLeaderValue(const ElemTy &V) const
getLeaderValue - Return the leader for the specified value that is in the set.
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:308
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:644
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:940
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:290
Class to represent integer types.
Definition: DerivedTypes.h:40
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:313
An instruction for reading from memory.
Definition: Instructions.h:177
Value * getPointerOperand()
Definition: Instructions.h:264
static constexpr LocationSize beforeOrAfterPointer()
Any location before or after the base pointer (but still within the underlying object).
This analysis provides dependence information for the memory accesses of a loop.
Result run(Function &F, FunctionAnalysisManager &AM)
const LoopAccessInfo & getInfo(Loop &L)
Drive the analysis of memory accesses in the loop.
bool isUniform(Value *V) const
Returns true if the value V is uniform within the loop.
void print(raw_ostream &OS, unsigned Depth=0) const
Print the information about the memory accesses in the loop.
LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetLibraryInfo *TLI, AAResults *AA, DominatorTree *DT, LoopInfo *LI)
static bool blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, DominatorTree *DT)
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
This analysis provides dependence information for the memory accesses of a loop.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
bool runOnFunction(Function &F) override
runOnFunction - Virtual method overriden by subclasses to do the per-function processing of the pass.
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:1268
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:139
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
Definition: LoopInfoImpl.h:232
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
Definition: LoopInfo.h:182
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
Definition: LoopInfo.h:267
BlockT * getHeader() const
Definition: LoopInfo.h:105
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:172
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:1293
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:547
bool isAnnotatedParallel() const
Returns true if the loop is annotated parallel.
Definition: LoopInfo.cpp:564
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:631
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:60
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:37
Checks memory dependences among accesses to the same underlying object to determine whether there vec...
ArrayRef< unsigned > getOrderForAccess(Value *Ptr, bool IsWrite) const
Return the program order indices for the access location (Ptr, IsWrite).
const Loop * getInnermostLoop() const
bool isSafeForVectorization() const
No memory dependence was encountered that would inhibit vectorization.
SmallVector< Instruction *, 4 > getInstructionsForAccess(Value *Ptr, bool isWrite) const
Find the set of instructions that read or write via Ptr.
VectorizationSafetyStatus
Type to keep track of the status of the dependence check.
void addAccess(StoreInst *SI)
Register the location (instructions are given increasing numbers) of a write access.
bool areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps, const ValueToValueMap &Strides)
Check whether the dependencies between the accesses are safe.
PointerIntPair< Value *, 1, bool > MemAccessInfo
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
const Value * Ptr
The address of the start of the location.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:398
Diagnostic information for optimization analysis remarks.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:91
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
void addPredicate(const SCEVPredicate &Pred)
Adds a new predicate.
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
bool hasNoOverflow(Value *V, SCEVWrapPredicate::IncrementWrapFlags Flags)
Returns true if we've proved that V doesn't wrap by means of a SCEV predicate.
void setNoOverflow(Value *V, SCEVWrapPredicate::IncrementWrapFlags Flags)
Proves that V doesn't overflow by adding SCEV predicate.
void print(raw_ostream &OS, unsigned Depth) const
Print the SCEV mappings done by the Predicated Scalar Evolution.
const SCEVAddRecExpr * getAsAddRec(Value *V)
Attempts to produce an AddRecExpr for V by adding additional SCEV predicates.
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
Holds information about the memory runtime legality checks to verify that a group of pointers do not ...
bool Need
This flag indicates if we need to add the runtime check.
void reset()
Reset the state of the pointer runtime information.
unsigned getNumberOfChecks() const
Returns the number of run-time checks required according to needsChecking.
void printChecks(raw_ostream &OS, const SmallVectorImpl< RuntimePointerCheck > &Checks, unsigned Depth=0) const
Print Checks.
bool needsChecking(const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const
Decide if we need to add a check between two groups of pointers, according to needsChecking.
void print(raw_ostream &OS, unsigned Depth=0) const
Print the list run-time memory checks necessary.
SmallVector< RuntimeCheckingPtrGroup, 2 > CheckingGroups
Holds a partitioning of pointers into "check groups".
void generateChecks(MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies)
Generate the checks and store it.
static bool arePointersInSamePartition(const SmallVectorImpl< int > &PtrToPartition, unsigned PtrIdx1, unsigned PtrIdx2)
Check if pointers are in the same partition.
SmallVector< PointerInfo, 2 > Pointers
Information about the pointers that may require checking.
void insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr, Type *AccessTy, bool WritePtr, unsigned DepSetId, unsigned ASId, PredicatedScalarEvolution &PSE, bool NeedsFreeze)
Insert a pointer and calculate the start and end SCEVs.
This node represents a polynomial recurrence on the trip count of the specified loop.
const SCEV * evaluateAtIteration(const SCEV *It, ScalarEvolution &SE) const
Return the value of this chain of recurrences at the specified iteration number.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents a constant integer value.
NoWrapFlags getNoWrapFlags(NoWrapFlags Mask=NoWrapMask) const
virtual void print(raw_ostream &OS, unsigned Depth=0) const =0
Prints a textual representation of this predicate with an indentation of Depth.
This class represents an analyzed expression in the program.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
const SCEV * getUMaxExpr(const SCEV *LHS, const SCEV *RHS)
const SCEVPredicate * getEqualPredicate(const SCEV *LHS, const SCEV *RHS)
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
const SCEV * getPtrToIntExpr(const SCEV *Op, Type *Ty)
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
const SCEV * getSizeOfExpr(Type *IntTy, Type *AllocTy)
Return an expression for the alloc size of AllocTy that is type IntTy.
const SCEV * getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
const SCEV * getUMinExpr(const SCEV *LHS, const SCEV *RHS, bool Sequential=false)
const SCEV * getStoreSizeOfExpr(Type *IntTy, Type *StoreTy)
Return an expression for the store size of StoreTy that is type IntTy.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
const SCEV * getTruncateOrSignExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:164
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:177
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:577
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:941
void resize(size_type N)
Definition: SmallVector.h:642
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
An instruction for storing to memory.
Definition: Instructions.h:301
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:258
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:249
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:250
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition: Value.h:724
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:308
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:182
An efficient, type-erasing, non-owning reference to a callable.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
friend const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:235
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:465
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:406
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1735
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const ValueToValueMap &StridesMap=ValueToValueMap(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
AddressSpace
Definition: NVPTXBaseInfo.h:21
const SCEV * replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, const ValueToValueMap &PtrToStride, Value *Ptr)
Return the SCEV corresponding to a pointer with the symbolic stride replaced with constant one,...
void initializeLoopAccessLegacyAnalysisPass(PassRegistry &)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
Value * getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp)
Get the stride of a pointer access in a loop.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1910
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
Pass * createLAAPass()
Value * stripIntegerCast(Value *V)
bool NullPointerIsDefined(const Function *F, unsigned AS=0)
Check whether null pointer dereferencing is considered undefined behavior for a given function or an ...
Definition: Function.cpp:2136
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
bool isGuaranteedNotToBeUndefOrPoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Return true if this function can prove that V does not have undef bits and is never poison.
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1837
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1762
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
Definition: BitVector.h:851
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
#define N
MDNode * TBAA
The tag for type-based alias analysis.
Definition: Metadata.h:668
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: PassManager.h:69
Dependece between memory access instructions.
DepType Type
The type of the dependence.
bool isPossiblyBackward() const
May be a lexically backward dependence type (includes Unknown).
bool isForward() const
Lexically forward dependence.
bool isBackward() const
Lexically backward dependence.
void print(raw_ostream &OS, unsigned Depth, const SmallVectorImpl< Instruction * > &Instrs) const
Print the dependence.
Instruction * getDestination(const LoopAccessInfo &LAI) const
Return the destination instruction of the dependence.
Instruction * getSource(const LoopAccessInfo &LAI) const
Return the source instruction of the dependence.
DepType
The type of the dependence.
static const char * DepName[]
String version of the types.
static VectorizationSafetyStatus isSafeForVectorization(DepType Type)
Dependence types that don't prevent vectorization.
unsigned AddressSpace
Address space of the involved pointers.
bool addPointer(unsigned Index, RuntimePointerChecking &RtCheck)
Tries to add the pointer recorded in RtCheck at index Index to this pointer checking group.
bool NeedsFreeze
Whether the pointer needs to be frozen after expansion, e.g.
const SCEV * High
The SCEV expression which represents the upper bound of all the pointers in this group.
SmallVector< unsigned, 2 > Members
Indices of all the pointers that constitute this grouping.
RuntimeCheckingPtrGroup(unsigned Index, RuntimePointerChecking &RtCheck)
Create a new pointer checking group containing a single pointer, with index Index in RtCheck.
const SCEV * Low
The SCEV expression which represents the lower bound of all the pointers in this group.
bool IsWritePtr
Holds the information if this pointer is used for writing to memory.
unsigned DependencySetId
Holds the id of the set of pointers that could be dependent because of a shared underlying object.
unsigned AliasSetId
Holds the id of the disjoint alias set to which this pointer belongs.
static const unsigned MaxVectorWidth
Maximum SIMD width.
static unsigned VectorizationFactor
VF as overridden by the user.
static unsigned RuntimeMemoryCheckThreshold
\When performing memory disambiguation checks at runtime do not make more than this number of compari...
static bool isInterleaveForced()
True if force-vector-interleave was specified by the user.
static unsigned VectorizationInterleave
Interleave factor as overridden by the user.
Function object to check whether the first component of a std::pair compares less than the first comp...
Definition: STLExtras.h:1477