//===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation --------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The implementation of the loop memory dependence analysis that was
// originally developed for the loop vectorizer.
//
//===----------------------------------------------------------------------===//

15#include "llvm/ADT/APInt.h"
16#include "llvm/ADT/DenseMap.h"
19#include "llvm/ADT/STLExtras.h"
20#include "llvm/ADT/SetVector.h"
22#include "llvm/ADT/SmallSet.h"
36#include "llvm/IR/BasicBlock.h"
37#include "llvm/IR/Constants.h"
38#include "llvm/IR/DataLayout.h"
39#include "llvm/IR/DebugLoc.h"
42#include "llvm/IR/Dominators.h"
43#include "llvm/IR/Function.h"
45#include "llvm/IR/InstrTypes.h"
46#include "llvm/IR/Instruction.h"
48#include "llvm/IR/Operator.h"
49#include "llvm/IR/PassManager.h"
51#include "llvm/IR/Type.h"
52#include "llvm/IR/Value.h"
53#include "llvm/IR/ValueHandle.h"
56#include "llvm/Support/Debug.h"
59#include <algorithm>
60#include <cassert>
61#include <cstdint>
62#include <iterator>
63#include <utility>
64#include <variant>
65#include <vector>
66
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "loop-accesses"

static cl::opt<unsigned, true>
VectorizationFactor("force-vector-width", cl::Hidden,
                    cl::desc("Sets the SIMD width. Zero is autoselect."),
                    cl::location(VectorizerParams::VectorizationFactor));
unsigned VectorizerParams::VectorizationFactor;

static cl::opt<unsigned, true>
VectorizationInterleave("force-vector-interleave", cl::Hidden,
                        cl::desc("Sets the vectorization interleave count. "
                                 "Zero is autoselect."),
                        cl::location(
                            VectorizerParams::VectorizationInterleave));
unsigned VectorizerParams::VectorizationInterleave;

static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold(
    "runtime-memory-check-threshold", cl::Hidden,
    cl::desc("When performing memory disambiguation checks at runtime do not "
             "generate more than this number of comparisons (default = 8)."),
    cl::location(VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(8));
unsigned VectorizerParams::RuntimeMemoryCheckThreshold;

/// The maximum iterations used to merge memory checks.
static cl::opt<unsigned> MemoryCheckMergeThreshold(
    "memory-check-merge-threshold", cl::Hidden,
    cl::desc("Maximum number of comparisons done when trying to merge "
             "runtime memory checks. (default = 100)"),
    cl::init(100));

/// Maximum SIMD width.
const unsigned VectorizerParams::MaxVectorWidth = 64;

/// We collect dependences up to this threshold.
static cl::opt<unsigned>
    MaxDependences("max-dependences", cl::Hidden,
                   cl::desc("Maximum number of dependences collected by "
                            "loop-access analysis (default = 100)"),
                   cl::init(100));
109
/// This enables versioning on the strides of symbolically striding memory
/// accesses in code like the following.
///   for (i = 0; i < N; ++i)
///     A[i * Stride1] += B[i * Stride2] ...
///
/// Will be roughly translated to
///    if (Stride1 == 1 && Stride2 == 1) {
///      for (i = 0; i < N; i+=4)
///       A[i:i+3] += ...
///    } else
///      ...
static cl::opt<bool> EnableMemAccessVersioning(
    "enable-mem-access-versioning", cl::init(true), cl::Hidden,
    cl::desc("Enable symbolic stride memory access versioning"));
124
/// Enable store-to-load forwarding conflict detection. This option can
/// be disabled for correctness testing.
static cl::opt<bool> EnableForwardingConflictDetection(
    "store-to-load-forwarding-conflict-detection", cl::Hidden,
    cl::desc("Enable conflict detection in loop-access analysis"),
    cl::init(true));

static cl::opt<unsigned> MaxForkedSCEVDepth(
    "max-forked-scev-depth", cl::Hidden,
    cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"),
    cl::init(5));

static cl::opt<bool> SpeculateUnitStride(
    "laa-speculate-unit-stride", cl::Hidden,
    cl::desc("Speculate that non-constant strides are unit in LAA"),
    cl::init(true));

static cl::opt<bool, true> HoistRuntimeChecks(
    "hoist-runtime-checks", cl::Hidden,
    cl::desc(
        "Hoist inner loop runtime memory checks to outer loop if possible"),
    cl::location(VectorizerParams::HoistRuntimeChecks), cl::init(true));
bool VectorizerParams::HoistRuntimeChecks;

bool VectorizerParams::isInterleaveForced() {
  return ::VectorizationInterleave.getNumOccurrences() > 0;
}
152
const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
                                            const DenseMap<Value *, const SCEV *> &PtrToStride,
                                            Value *Ptr) {
  const SCEV *OrigSCEV = PSE.getSCEV(Ptr);

  // If there is an entry in the map return the SCEV of the pointer with the
  // symbolic stride replaced by one.
  DenseMap<Value *, const SCEV *>::const_iterator SI = PtrToStride.find(Ptr);
  if (SI == PtrToStride.end())
    // For a non-symbolic stride, just return the original expression.
    return OrigSCEV;

  const SCEV *StrideSCEV = SI->second;
  // Note: This assert is both overly strong and overly weak.  The actual
  // invariant here is that StrideSCEV should be loop invariant.  The only
  // such invariant strides we happen to speculate right now are unknowns
  // and thus this is a reasonable proxy of the actual invariant.
  assert(isa<SCEVUnknown>(StrideSCEV) && "shouldn't be in map");

  ScalarEvolution *SE = PSE.getSE();
  const auto *CT = SE->getOne(StrideSCEV->getType());
  PSE.addPredicate(*SE->getEqualPredicate(StrideSCEV, CT));
  auto *Expr = PSE.getSCEV(Ptr);

  LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
                    << " by: " << *Expr << "\n");
  return Expr;
}
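
// Illustrative sketch (not part of the analysis itself): for a hypothetical
// access A[i * Stride] with 4-byte elements and a loop-invariant, symbolic
// %Stride, the original pointer SCEV
//   {%A,+,(4 * %Stride)}<%loop>
// is versioned under the predicate "%Stride == 1" recorded in PSE and becomes
//   {%A,+,4}<%loop>
// The predicate must later be emitted as a runtime check by the client.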
181
RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
    unsigned Index, RuntimePointerChecking &RtCheck)
    : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start),
      AddressSpace(RtCheck.Pointers[Index]
                       .PointerValue->getType()
                       ->getPointerAddressSpace()),
      NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) {
  Members.push_back(Index);
}
191
/// Calculate Start and End points of memory access.
/// Let's assume A is the first access and B is a memory access on N-th loop
/// iteration. Then B is calculated as:
///   B = A + Step*N .
/// Step value may be positive or negative.
/// N is a calculated back-edge taken count:
///   N = (TripCount > 0) ? RoundDown(TripCount - 1, VF) : 0
/// Start and End points are calculated in the following way:
///   Start = UMIN(A, B); End = UMAX(A, B) + SizeOfElt,
/// where SizeOfElt is the size of a single memory access in bytes.
///
/// There is no conflict when the intervals are disjoint:
///   NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
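/// As a concrete illustration (not from the source): for an access with
/// pointer SCEV {%A,+,4}<%loop>, back-edge taken count N and a 4-byte element
/// type, Start = %A and End = (%A + 4*N) + 4. For a negative step, e.g.
/// {%A,+,-4}, the bounds are swapped: Start = %A - 4*N and End = %A + 4.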
void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
                                    Type *AccessTy, bool WritePtr,
                                    unsigned DepSetId, unsigned ASId,
                                    PredicatedScalarEvolution &PSE,
                                    bool NeedsFreeze) {
210 ScalarEvolution *SE = PSE.getSE();
211
212 const SCEV *ScStart;
213 const SCEV *ScEnd;
214
215 if (SE->isLoopInvariant(PtrExpr, Lp)) {
216 ScStart = ScEnd = PtrExpr;
217 } else {
218 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr);
219 assert(AR && "Invalid addrec expression");
220 const SCEV *Ex = PSE.getBackedgeTakenCount();
221
222 ScStart = AR->getStart();
223 ScEnd = AR->evaluateAtIteration(Ex, *SE);
224 const SCEV *Step = AR->getStepRecurrence(*SE);
225
226 // For expressions with negative step, the upper bound is ScStart and the
227 // lower bound is ScEnd.
228 if (const auto *CStep = dyn_cast<SCEVConstant>(Step)) {
229 if (CStep->getValue()->isNegative())
230 std::swap(ScStart, ScEnd);
231 } else {
232 // Fallback case: the step is not constant, but we can still
233 // get the upper and lower bounds of the interval by using min/max
234 // expressions.
235 ScStart = SE->getUMinExpr(ScStart, ScEnd);
236 ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd);
237 }
238 }
  assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
  assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");
241
242 // Add the size of the pointed element to ScEnd.
243 auto &DL = Lp->getHeader()->getModule()->getDataLayout();
244 Type *IdxTy = DL.getIndexType(Ptr->getType());
245 const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy);
246 ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);
247
248 Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, PtrExpr,
249 NeedsFreeze);
250}
251
252void RuntimePointerChecking::tryToCreateDiffCheck(
253 const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) {
254 if (!CanUseDiffCheck)
255 return;
256
257 // If either group contains multiple different pointers, bail out.
258 // TODO: Support multiple pointers by using the minimum or maximum pointer,
259 // depending on src & sink.
260 if (CGI.Members.size() != 1 || CGJ.Members.size() != 1) {
261 CanUseDiffCheck = false;
262 return;
263 }
264
265 PointerInfo *Src = &Pointers[CGI.Members[0]];
266 PointerInfo *Sink = &Pointers[CGJ.Members[0]];
267
268 // If either pointer is read and written, multiple checks may be needed. Bail
269 // out.
270 if (!DC.getOrderForAccess(Src->PointerValue, !Src->IsWritePtr).empty() ||
271 !DC.getOrderForAccess(Sink->PointerValue, !Sink->IsWritePtr).empty()) {
272 CanUseDiffCheck = false;
273 return;
274 }
275
276 ArrayRef<unsigned> AccSrc =
277 DC.getOrderForAccess(Src->PointerValue, Src->IsWritePtr);
278 ArrayRef<unsigned> AccSink =
279 DC.getOrderForAccess(Sink->PointerValue, Sink->IsWritePtr);
280 // If either pointer is accessed multiple times, there may not be a clear
281 // src/sink relation. Bail out for now.
282 if (AccSrc.size() != 1 || AccSink.size() != 1) {
283 CanUseDiffCheck = false;
284 return;
285 }
286 // If the sink is accessed before src, swap src/sink.
287 if (AccSink[0] < AccSrc[0])
288 std::swap(Src, Sink);
289
290 auto *SrcAR = dyn_cast<SCEVAddRecExpr>(Src->Expr);
291 auto *SinkAR = dyn_cast<SCEVAddRecExpr>(Sink->Expr);
292 if (!SrcAR || !SinkAR || SrcAR->getLoop() != DC.getInnermostLoop() ||
293 SinkAR->getLoop() != DC.getInnermostLoop()) {
294 CanUseDiffCheck = false;
295 return;
296 }
297
  SmallVector<Instruction *, 4> SrcInsts =
      DC.getInstructionsForAccess(Src->PointerValue, Src->IsWritePtr);
  SmallVector<Instruction *, 4> SinkInsts =
      DC.getInstructionsForAccess(Sink->PointerValue, Sink->IsWritePtr);
302 Type *SrcTy = getLoadStoreType(SrcInsts[0]);
303 Type *DstTy = getLoadStoreType(SinkInsts[0]);
304 if (isa<ScalableVectorType>(SrcTy) || isa<ScalableVectorType>(DstTy)) {
305 CanUseDiffCheck = false;
306 return;
307 }
308 const DataLayout &DL =
309 SinkAR->getLoop()->getHeader()->getModule()->getDataLayout();
310 unsigned AllocSize =
311 std::max(DL.getTypeAllocSize(SrcTy), DL.getTypeAllocSize(DstTy));
312
313 // Only matching constant steps matching the AllocSize are supported at the
314 // moment. This simplifies the difference computation. Can be extended in the
315 // future.
316 auto *Step = dyn_cast<SCEVConstant>(SinkAR->getStepRecurrence(*SE));
317 if (!Step || Step != SrcAR->getStepRecurrence(*SE) ||
318 Step->getAPInt().abs() != AllocSize) {
319 CanUseDiffCheck = false;
320 return;
321 }
322
323 IntegerType *IntTy =
324 IntegerType::get(Src->PointerValue->getContext(),
325 DL.getPointerSizeInBits(CGI.AddressSpace));
326
327 // When counting down, the dependence distance needs to be swapped.
328 if (Step->getValue()->isNegative())
329 std::swap(SinkAR, SrcAR);
330
331 const SCEV *SinkStartInt = SE->getPtrToIntExpr(SinkAR->getStart(), IntTy);
332 const SCEV *SrcStartInt = SE->getPtrToIntExpr(SrcAR->getStart(), IntTy);
333 if (isa<SCEVCouldNotCompute>(SinkStartInt) ||
334 isa<SCEVCouldNotCompute>(SrcStartInt)) {
335 CanUseDiffCheck = false;
336 return;
337 }
338
339 const Loop *InnerLoop = SrcAR->getLoop();
340 // If the start values for both Src and Sink also vary according to an outer
341 // loop, then it's probably better to avoid creating diff checks because
342 // they may not be hoisted. We should instead let llvm::addRuntimeChecks
343 // do the expanded full range overlap checks, which can be hoisted.
344 if (HoistRuntimeChecks && InnerLoop->getParentLoop() &&
345 isa<SCEVAddRecExpr>(SinkStartInt) && isa<SCEVAddRecExpr>(SrcStartInt)) {
346 auto *SrcStartAR = cast<SCEVAddRecExpr>(SrcStartInt);
347 auto *SinkStartAR = cast<SCEVAddRecExpr>(SinkStartInt);
348 const Loop *StartARLoop = SrcStartAR->getLoop();
349 if (StartARLoop == SinkStartAR->getLoop() &&
350 StartARLoop == InnerLoop->getParentLoop() &&
351 // If the diff check would already be loop invariant (due to the
352 // recurrences being the same), then we prefer to keep the diff checks
353 // because they are cheaper.
354 SrcStartAR->getStepRecurrence(*SE) !=
355 SinkStartAR->getStepRecurrence(*SE)) {
356 LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these "
357 "cannot be hoisted out of the outer loop\n");
358 CanUseDiffCheck = false;
359 return;
360 }
361 }
362
363 LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
364 << "SrcStart: " << *SrcStartInt << '\n'
365 << "SinkStartInt: " << *SinkStartInt << '\n');
366 DiffChecks.emplace_back(SrcStartInt, SinkStartInt, AllocSize,
367 Src->NeedsFreeze || Sink->NeedsFreeze);
368}
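
// Rough shape of the resulting runtime test (illustrative only): instead of a
// full interval-overlap check
//   (Src.Start < Sink.End) && (Sink.Start < Src.End)
// a single pointer-difference check on (SinkStartInt - SrcStartInt) against a
// bound derived from AllocSize and the chosen VF/IC is emitted by the client
// (e.g. via llvm::addDiffRuntimeChecks), which is cheaper and easier to hoist.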
369
SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() {
  SmallVector<RuntimePointerCheck, 4> Checks;

  for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
    for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
      const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I];
      const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J];

      if (needsChecking(CGI, CGJ)) {
        tryToCreateDiffCheck(CGI, CGJ);
        Checks.push_back(std::make_pair(&CGI, &CGJ));
      }
    }
  }
  return Checks;
}
386
387void RuntimePointerChecking::generateChecks(
388 MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
389 assert(Checks.empty() && "Checks is not empty");
390 groupChecks(DepCands, UseDependencies);
391 Checks = generateChecks();
392}
393
bool RuntimePointerChecking::needsChecking(
    const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const {
  for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I)
    for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J)
      if (needsChecking(M.Members[I], N.Members[J]))
        return true;
  return false;
}
402
403/// Compare \p I and \p J and return the minimum.
404/// Return nullptr in case we couldn't find an answer.
405static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
406 ScalarEvolution *SE) {
407 const SCEV *Diff = SE->getMinusSCEV(J, I);
408 const SCEVConstant *C = dyn_cast<const SCEVConstant>(Diff);
409
410 if (!C)
411 return nullptr;
412 if (C->getValue()->isNegative())
413 return J;
414 return I;
415}
416
bool RuntimeCheckingPtrGroup::addPointer(unsigned Index,
                                         RuntimePointerChecking &RtCheck) {
  return addPointer(
      Index, RtCheck.Pointers[Index].Start, RtCheck.Pointers[Index].End,
      RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(),
      RtCheck.Pointers[Index].NeedsFreeze, *RtCheck.SE);
}
424
bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start,
                                         const SCEV *End, unsigned AS,
                                         bool NeedsFreeze,
                                         ScalarEvolution &SE) {
429 assert(AddressSpace == AS &&
430 "all pointers in a checking group must be in the same address space");
431
432 // Compare the starts and ends with the known minimum and maximum
433 // of this set. We need to know how we compare against the min/max
434 // of the set in order to be able to emit memchecks.
435 const SCEV *Min0 = getMinFromExprs(Start, Low, &SE);
436 if (!Min0)
437 return false;
438
439 const SCEV *Min1 = getMinFromExprs(End, High, &SE);
440 if (!Min1)
441 return false;
442
443 // Update the low bound expression if we've found a new min value.
444 if (Min0 == Start)
445 Low = Start;
446
447 // Update the high bound expression if we've found a new max value.
448 if (Min1 != End)
449 High = End;
450
  Members.push_back(Index);
  this->NeedsFreeze |= NeedsFreeze;
453 return true;
454}
455
456void RuntimePointerChecking::groupChecks(
457 MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
458 // We build the groups from dependency candidates equivalence classes
459 // because:
460 // - We know that pointers in the same equivalence class share
461 // the same underlying object and therefore there is a chance
462 // that we can compare pointers
463 // - We wouldn't be able to merge two pointers for which we need
464 // to emit a memcheck. The classes in DepCands are already
465 // conveniently built such that no two pointers in the same
466 // class need checking against each other.
467
468 // We use the following (greedy) algorithm to construct the groups
469 // For every pointer in the equivalence class:
470 // For each existing group:
471 // - if the difference between this pointer and the min/max bounds
472 // of the group is a constant, then make the pointer part of the
473 // group and update the min/max bounds of that group as required.
474
475 CheckingGroups.clear();
476
477 // If we need to check two pointers to the same underlying object
478 // with a non-constant difference, we shouldn't perform any pointer
479 // grouping with those pointers. This is because we can easily get
480 // into cases where the resulting check would return false, even when
481 // the accesses are safe.
482 //
483 // The following example shows this:
484 // for (i = 0; i < 1000; ++i)
485 // a[5000 + i * m] = a[i] + a[i + 9000]
486 //
487 // Here grouping gives a check of (5000, 5000 + 1000 * m) against
488 // (0, 10000) which is always false. However, if m is 1, there is no
489 // dependence. Not grouping the checks for a[i] and a[i + 9000] allows
490 // us to perform an accurate check in this case.
491 //
492 // The above case requires that we have an UnknownDependence between
493 // accesses to the same underlying object. This cannot happen unless
494 // FoundNonConstantDistanceDependence is set, and therefore UseDependencies
495 // is also false. In this case we will use the fallback path and create
496 // separate checking groups for all pointers.
497
498 // If we don't have the dependency partitions, construct a new
499 // checking pointer group for each pointer. This is also required
500 // for correctness, because in this case we can have checking between
501 // pointers to the same underlying object.
502 if (!UseDependencies) {
503 for (unsigned I = 0; I < Pointers.size(); ++I)
504 CheckingGroups.push_back(RuntimeCheckingPtrGroup(I, *this));
505 return;
506 }
507
508 unsigned TotalComparisons = 0;
509
  DenseMap<Value *, SmallVector<unsigned>> PositionMap;
  for (unsigned Index = 0; Index < Pointers.size(); ++Index) {
512 auto Iter = PositionMap.insert({Pointers[Index].PointerValue, {}});
513 Iter.first->second.push_back(Index);
514 }
515
516 // We need to keep track of what pointers we've already seen so we
517 // don't process them twice.
519
  SmallSet<unsigned, 2> Seen;

  // Go through all equivalence classes, get the "pointer check groups"
521 // and add them to the overall solution. We use the order in which accesses
522 // appear in 'Pointers' to enforce determinism.
523 for (unsigned I = 0; I < Pointers.size(); ++I) {
524 // We've seen this pointer before, and therefore already processed
525 // its equivalence class.
526 if (Seen.count(I))
527 continue;
528
529 MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
530 Pointers[I].IsWritePtr);
531
    SmallVector<RuntimeCheckingPtrGroup, 2> Groups;
    auto LeaderI = DepCands.findValue(DepCands.getLeaderValue(Access));
534
535 // Because DepCands is constructed by visiting accesses in the order in
536 // which they appear in alias sets (which is deterministic) and the
537 // iteration order within an equivalence class member is only dependent on
538 // the order in which unions and insertions are performed on the
539 // equivalence class, the iteration order is deterministic.
540 for (auto MI = DepCands.member_begin(LeaderI), ME = DepCands.member_end();
541 MI != ME; ++MI) {
542 auto PointerI = PositionMap.find(MI->getPointer());
543 assert(PointerI != PositionMap.end() &&
544 "pointer in equivalence class not found in PositionMap");
545 for (unsigned Pointer : PointerI->second) {
546 bool Merged = false;
547 // Mark this pointer as seen.
548 Seen.insert(Pointer);
549
550 // Go through all the existing sets and see if we can find one
551 // which can include this pointer.
552 for (RuntimeCheckingPtrGroup &Group : Groups) {
553 // Don't perform more than a certain amount of comparisons.
554 // This should limit the cost of grouping the pointers to something
555 // reasonable. If we do end up hitting this threshold, the algorithm
556 // will create separate groups for all remaining pointers.
557 if (TotalComparisons > MemoryCheckMergeThreshold)
558 break;
559
560 TotalComparisons++;
561
562 if (Group.addPointer(Pointer, *this)) {
563 Merged = true;
564 break;
565 }
566 }
567
568 if (!Merged)
569 // We couldn't add this pointer to any existing set or the threshold
570 // for the number of comparisons has been reached. Create a new group
571 // to hold the current pointer.
572 Groups.push_back(RuntimeCheckingPtrGroup(Pointer, *this));
573 }
574 }
575
576 // We've computed the grouped checks for this partition.
577 // Save the results and continue with the next one.
578 llvm::copy(Groups, std::back_inserter(CheckingGroups));
579 }
580}
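
// As an illustration (hypothetical pointers): for accesses through %a,
// %a + 400 and %b in the same alias set, the two %a-based pointers have a
// constant difference and can be merged into one group whose Low/High bounds
// cover both, while %b forms its own group; runtime checks are then only
// required between the %a-group and the %b-group.
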
581
583 const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1,
584 unsigned PtrIdx2) {
585 return (PtrToPartition[PtrIdx1] != -1 &&
586 PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]);
587}
588
589bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
590 const PointerInfo &PointerI = Pointers[I];
591 const PointerInfo &PointerJ = Pointers[J];
592
593 // No need to check if two readonly pointers intersect.
594 if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
595 return false;
596
597 // Only need to check pointers between two different dependency sets.
598 if (PointerI.DependencySetId == PointerJ.DependencySetId)
599 return false;
600
601 // Only need to check pointers in the same alias set.
602 if (PointerI.AliasSetId != PointerJ.AliasSetId)
603 return false;
604
605 return true;
606}
607
void RuntimePointerChecking::printChecks(
    raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks,
    unsigned Depth) const {
611 unsigned N = 0;
612 for (const auto &Check : Checks) {
613 const auto &First = Check.first->Members, &Second = Check.second->Members;
614
615 OS.indent(Depth) << "Check " << N++ << ":\n";
616
617 OS.indent(Depth + 2) << "Comparing group (" << Check.first << "):\n";
618 for (unsigned K = 0; K < First.size(); ++K)
619 OS.indent(Depth + 2) << *Pointers[First[K]].PointerValue << "\n";
620
621 OS.indent(Depth + 2) << "Against group (" << Check.second << "):\n";
622 for (unsigned K = 0; K < Second.size(); ++K)
623 OS.indent(Depth + 2) << *Pointers[Second[K]].PointerValue << "\n";
624 }
625}
626
628
void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const {

  OS.indent(Depth) << "Run-time memory checks:\n";
630 printChecks(OS, Checks, Depth);
631
632 OS.indent(Depth) << "Grouped accesses:\n";
633 for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
634 const auto &CG = CheckingGroups[I];
635
636 OS.indent(Depth + 2) << "Group " << &CG << ":\n";
637 OS.indent(Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High
638 << ")\n";
639 for (unsigned J = 0; J < CG.Members.size(); ++J) {
640 OS.indent(Depth + 6) << "Member: " << *Pointers[CG.Members[J]].Expr
641 << "\n";
642 }
643 }
644}
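
// For illustration, the output produced above looks roughly like (addresses
// and SCEVs are hypothetical):
//   Run-time memory checks:
//   Check 0:
//     Comparing group (0x55e2...):
//       %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv
//     Against group (0x55e3...):
//       %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
//   Grouped accesses:
//     Group 0x55e2...:
//       (Low: %a High: (400 + %a))
//       Member: {%a,+,4}<%loop>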
645
646namespace {
647
648/// Analyses memory accesses in a loop.
649///
650/// Checks whether run time pointer checks are needed and builds sets for data
651/// dependence checking.
652class AccessAnalysis {
653public:
654 /// Read or write access location.
655 typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
656 typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
657
  AccessAnalysis(Loop *TheLoop, AAResults *AA, LoopInfo *LI,
                 MemoryDepChecker::DepCandidates &DA,
                 PredicatedScalarEvolution &PSE,
                 SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
      : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE),
        LoopAliasScopes(LoopAliasScopes) {
664 // We're analyzing dependences across loop iterations.
665 BAA.enableCrossIterationMode();
666 }
667
668 /// Register a load and whether it is only read from.
669 void addLoad(MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
670 Value *Ptr = const_cast<Value *>(Loc.Ptr);
671 AST.add(adjustLoc(Loc));
672 Accesses[MemAccessInfo(Ptr, false)].insert(AccessTy);
673 if (IsReadOnly)
674 ReadOnlyPtr.insert(Ptr);
675 }
676
677 /// Register a store.
678 void addStore(MemoryLocation &Loc, Type *AccessTy) {
679 Value *Ptr = const_cast<Value *>(Loc.Ptr);
680 AST.add(adjustLoc(Loc));
681 Accesses[MemAccessInfo(Ptr, true)].insert(AccessTy);
682 }
683
684 /// Check if we can emit a run-time no-alias check for \p Access.
685 ///
686 /// Returns true if we can emit a run-time no alias check for \p Access.
687 /// If we can check this access, this also adds it to a dependence set and
688 /// adds a run-time to check for it to \p RtCheck. If \p Assume is true,
689 /// we will attempt to use additional run-time checks in order to get
690 /// the bounds of the pointer.
691 bool createCheckForAccess(RuntimePointerChecking &RtCheck,
692 MemAccessInfo Access, Type *AccessTy,
                            const DenseMap<Value *, const SCEV *> &Strides,
                            DenseMap<Value *, unsigned> &DepSetId,
                            Loop *TheLoop, unsigned &RunningDepId,
696 unsigned ASId, bool ShouldCheckStride, bool Assume);
697
698 /// Check whether we can check the pointers at runtime for
699 /// non-intersection.
700 ///
701 /// Returns true if we need no check or if we do and we can generate them
702 /// (i.e. the pointers have computable bounds).
703 bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, ScalarEvolution *SE,
704 Loop *TheLoop, const DenseMap<Value *, const SCEV *> &Strides,
705 Value *&UncomputablePtr, bool ShouldCheckWrap = false);
706
707 /// Goes over all memory accesses, checks whether a RT check is needed
708 /// and builds sets of dependent accesses.
709 void buildDependenceSets() {
710 processMemAccesses();
711 }
712
713 /// Initial processing of memory accesses determined that we need to
714 /// perform dependency checking.
715 ///
716 /// Note that this can later be cleared if we retry memcheck analysis without
717 /// dependency checking (i.e. FoundNonConstantDistanceDependence).
718 bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
719
720 /// We decided that no dependence analysis would be used. Reset the state.
721 void resetDepChecks(MemoryDepChecker &DepChecker) {
722 CheckDeps.clear();
723 DepChecker.clearDependences();
724 }
725
726 MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; }
727
  const DenseMap<Value *, SmallVector<const Value *, 16>> &
  getUnderlyingObjects() {
    return UnderlyingObjects;
  }
732
private:
  typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;

736 /// Adjust the MemoryLocation so that it represents accesses to this
737 /// location across all iterations, rather than a single one.
738 MemoryLocation adjustLoc(MemoryLocation Loc) const {
739 // The accessed location varies within the loop, but remains within the
    // underlying object.
    Loc = MemoryLocation::getBeforeOrAfter(Loc.Ptr, Loc.AATags);
    Loc.AATags.Scope = adjustAliasScopeList(Loc.AATags.Scope);
743 Loc.AATags.NoAlias = adjustAliasScopeList(Loc.AATags.NoAlias);
744 return Loc;
745 }
746
747 /// Drop alias scopes that are only valid within a single loop iteration.
748 MDNode *adjustAliasScopeList(MDNode *ScopeList) const {
749 if (!ScopeList)
750 return nullptr;
751
752 // For the sake of simplicity, drop the whole scope list if any scope is
753 // iteration-local.
754 if (any_of(ScopeList->operands(), [&](Metadata *Scope) {
755 return LoopAliasScopes.contains(cast<MDNode>(Scope));
756 }))
757 return nullptr;
758
759 return ScopeList;
760 }
761
762 /// Go over all memory access and check whether runtime pointer checks
763 /// are needed and build sets of dependency check candidates.
764 void processMemAccesses();
765
766 /// Map of all accesses. Values are the types used to access memory pointed to
767 /// by the pointer.
768 PtrAccessMap Accesses;
769
770 /// The loop being checked.
771 const Loop *TheLoop;
772
773 /// List of accesses that need a further dependence check.
774 MemAccessInfoList CheckDeps;
775
776 /// Set of pointers that are read only.
777 SmallPtrSet<Value*, 16> ReadOnlyPtr;
778
779 /// Batched alias analysis results.
780 BatchAAResults BAA;
781
782 /// An alias set tracker to partition the access set by underlying object and
  /// intrinsic property (such as TBAA metadata).
784 AliasSetTracker AST;
785
786 LoopInfo *LI;
787
788 /// Sets of potentially dependent accesses - members of one set share an
  /// underlying pointer. The set "CheckDeps" identifies which sets really need
  /// a dependence check.
  MemoryDepChecker::DepCandidates &DepCands;

793 /// Initial processing of memory accesses determined that we may need
794 /// to add memchecks. Perform the analysis to determine the necessary checks.
795 ///
796 /// Note that, this is different from isDependencyCheckNeeded. When we retry
797 /// memcheck analysis without dependency checking
798 /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is
799 /// cleared while this remains set if we have potentially dependent accesses.
800 bool IsRTCheckAnalysisNeeded = false;
801
  /// The SCEV predicate containing all the SCEV-related assumptions.
  PredicatedScalarEvolution &PSE;

  DenseMap<Value *, SmallVector<const Value *, 16>> UnderlyingObjects;

807 /// Alias scopes that are declared inside the loop, and as such not valid
808 /// across iterations.
809 SmallPtrSetImpl<MDNode *> &LoopAliasScopes;
810};
811
812} // end anonymous namespace
813
814/// Check whether a pointer can participate in a runtime bounds check.
815/// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr
816/// by adding run-time checks (overflow checks) if necessary.
static bool hasComputableBounds(PredicatedScalarEvolution &PSE, Value *Ptr,
                                const SCEV *PtrScev, Loop *L, bool Assume) {
819 // The bounds for loop-invariant pointer is trivial.
820 if (PSE.getSE()->isLoopInvariant(PtrScev, L))
821 return true;
822
823 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
824
825 if (!AR && Assume)
826 AR = PSE.getAsAddRec(Ptr);
827
828 if (!AR)
829 return false;
830
831 return AR->isAffine();
832}
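
// For example (illustration): a pointer whose SCEV is the affine recurrence
// {%base,+,4}<%L> has computable bounds, while a pointer like
// %base + %i * %j, which cannot be rewritten as an affine AddRec even with
// added predicates, does not.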
833
834/// Check whether a pointer address cannot wrap.
static bool isNoWrap(PredicatedScalarEvolution &PSE,
                     const DenseMap<Value *, const SCEV *> &Strides, Value *Ptr,
                     Type *AccessTy, Loop *L) {
838 const SCEV *PtrScev = PSE.getSCEV(Ptr);
839 if (PSE.getSE()->isLoopInvariant(PtrScev, L))
840 return true;
841
842 int64_t Stride = getPtrStride(PSE, AccessTy, Ptr, L, Strides).value_or(0);
843 if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
844 return true;
845
846 return false;
847}
848
849static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
850 function_ref<void(Value *)> AddPointer) {
  SmallPtrSet<Value *, 8> Visited;
  SmallVector<Value *> WorkList;
853 WorkList.push_back(StartPtr);
854
855 while (!WorkList.empty()) {
856 Value *Ptr = WorkList.pop_back_val();
857 if (!Visited.insert(Ptr).second)
858 continue;
859 auto *PN = dyn_cast<PHINode>(Ptr);
860 // SCEV does not look through non-header PHIs inside the loop. Such phis
861 // can be analyzed by adding separate accesses for each incoming pointer
862 // value.
863 if (PN && InnermostLoop.contains(PN->getParent()) &&
864 PN->getParent() != InnermostLoop.getHeader()) {
865 for (const Use &Inc : PN->incoming_values())
866 WorkList.push_back(Inc);
867 } else
868 AddPointer(Ptr);
869 }
870}
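
// Example of the decomposition above (illustrative IR, not from the source):
//   if.then:  %a.gep = getelementptr i32, ptr %A, i64 %iv
//   if.else:  %b.gep = getelementptr i32, ptr %B, i64 %iv
//   merge:    %p = phi ptr [ %a.gep, %if.then ], [ %b.gep, %if.else ]
// For the non-header phi %p, AddPointer is invoked on %a.gep and %b.gep
// separately instead of giving up on %p.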
871
872// Walk back through the IR for a pointer, looking for a select like the
873// following:
874//
875// %offset = select i1 %cmp, i64 %a, i64 %b
876// %addr = getelementptr double, double* %base, i64 %offset
877// %ld = load double, double* %addr, align 8
878//
879// We won't be able to form a single SCEVAddRecExpr from this since the
880// address for each loop iteration depends on %cmp. We could potentially
881// produce multiple valid SCEVAddRecExprs, though, and check all of them for
882// memory safety/aliasing if needed.
883//
884// If we encounter some IR we don't yet handle, or something obviously fine
885// like a constant, then we just add the SCEV for that term to the list passed
886// in by the caller. If we have a node that may potentially yield a valid
887// SCEVAddRecExpr then we decompose it into parts and build the SCEV terms
888// ourselves before adding to the list.
889static void findForkedSCEVs(
    ScalarEvolution *SE, const Loop *L, Value *Ptr,
    SmallVectorImpl<PointerIntPair<const SCEV *, 1, bool>> &ScevList,
    unsigned Depth) {
893 // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or
894 // we've exceeded our limit on recursion, just return whatever we have
895 // regardless of whether it can be used for a forked pointer or not, along
896 // with an indication of whether it might be a poison or undef value.
897 const SCEV *Scev = SE->getSCEV(Ptr);
898 if (isa<SCEVAddRecExpr>(Scev) || L->isLoopInvariant(Ptr) ||
899 !isa<Instruction>(Ptr) || Depth == 0) {
900 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
901 return;
902 }
903
904 Depth--;
905
906 auto UndefPoisonCheck = [](PointerIntPair<const SCEV *, 1, bool> S) {
907 return get<1>(S);
908 };
909
910 auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) {
911 switch (Opcode) {
912 case Instruction::Add:
913 return SE->getAddExpr(L, R);
914 case Instruction::Sub:
915 return SE->getMinusSCEV(L, R);
916 default:
917 llvm_unreachable("Unexpected binary operator when walking ForkedPtrs");
918 }
919 };
920
921 Instruction *I = cast<Instruction>(Ptr);
922 unsigned Opcode = I->getOpcode();
923 switch (Opcode) {
924 case Instruction::GetElementPtr: {
925 GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
926 Type *SourceTy = GEP->getSourceElementType();
927 // We only handle base + single offset GEPs here for now.
928 // Not dealing with preexisting gathers yet, so no vectors.
929 if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
930 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(GEP));
931 break;
932 }
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> BaseScevs;
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> OffsetScevs;
    findForkedSCEVs(SE, L, I->getOperand(0), BaseScevs, Depth);
936 findForkedSCEVs(SE, L, I->getOperand(1), OffsetScevs, Depth);
937
938 // See if we need to freeze our fork...
939 bool NeedsFreeze = any_of(BaseScevs, UndefPoisonCheck) ||
940 any_of(OffsetScevs, UndefPoisonCheck);
941
942 // Check that we only have a single fork, on either the base or the offset.
943 // Copy the SCEV across for the one without a fork in order to generate
944 // the full SCEV for both sides of the GEP.
945 if (OffsetScevs.size() == 2 && BaseScevs.size() == 1)
946 BaseScevs.push_back(BaseScevs[0]);
947 else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1)
948 OffsetScevs.push_back(OffsetScevs[0]);
949 else {
950 ScevList.emplace_back(Scev, NeedsFreeze);
951 break;
952 }
953
954 // Find the pointer type we need to extend to.
955 Type *IntPtrTy = SE->getEffectiveSCEVType(
956 SE->getSCEV(GEP->getPointerOperand())->getType());
957
958 // Find the size of the type being pointed to. We only have a single
959 // index term (guarded above) so we don't need to index into arrays or
960 // structures, just get the size of the scalar value.
961 const SCEV *Size = SE->getSizeOfExpr(IntPtrTy, SourceTy);
962
963 // Scale up the offsets by the size of the type, then add to the bases.
964 const SCEV *Scaled1 = SE->getMulExpr(
965 Size, SE->getTruncateOrSignExtend(get<0>(OffsetScevs[0]), IntPtrTy));
966 const SCEV *Scaled2 = SE->getMulExpr(
967 Size, SE->getTruncateOrSignExtend(get<0>(OffsetScevs[1]), IntPtrTy));
968 ScevList.emplace_back(SE->getAddExpr(get<0>(BaseScevs[0]), Scaled1),
969 NeedsFreeze);
970 ScevList.emplace_back(SE->getAddExpr(get<0>(BaseScevs[1]), Scaled2),
971 NeedsFreeze);
972 break;
973 }
974 case Instruction::Select: {
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
    // A select means we've found a forked pointer, but we currently only
977 // support a single select per pointer so if there's another behind this
978 // then we just bail out and return the generic SCEV.
979 findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
980 findForkedSCEVs(SE, L, I->getOperand(2), ChildScevs, Depth);
981 if (ChildScevs.size() == 2) {
982 ScevList.push_back(ChildScevs[0]);
983 ScevList.push_back(ChildScevs[1]);
984 } else
985 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
986 break;
987 }
988 case Instruction::PHI: {
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
    // A phi means we've found a forked pointer, but we currently only
991 // support a single phi per pointer so if there's another behind this
992 // then we just bail out and return the generic SCEV.
993 if (I->getNumOperands() == 2) {
994 findForkedSCEVs(SE, L, I->getOperand(0), ChildScevs, Depth);
995 findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
996 }
997 if (ChildScevs.size() == 2) {
998 ScevList.push_back(ChildScevs[0]);
999 ScevList.push_back(ChildScevs[1]);
1000 } else
1001 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
1002 break;
1003 }
1004 case Instruction::Add:
1005 case Instruction::Sub: {
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> LScevs;
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> RScevs;
    findForkedSCEVs(SE, L, I->getOperand(0), LScevs, Depth);
1009 findForkedSCEVs(SE, L, I->getOperand(1), RScevs, Depth);
1010
1011 // See if we need to freeze our fork...
1012 bool NeedsFreeze =
1013 any_of(LScevs, UndefPoisonCheck) || any_of(RScevs, UndefPoisonCheck);
1014
1015 // Check that we only have a single fork, on either the left or right side.
1016 // Copy the SCEV across for the one without a fork in order to generate
1017 // the full SCEV for both sides of the BinOp.
1018 if (LScevs.size() == 2 && RScevs.size() == 1)
1019 RScevs.push_back(RScevs[0]);
1020 else if (RScevs.size() == 2 && LScevs.size() == 1)
1021 LScevs.push_back(LScevs[0]);
1022 else {
1023 ScevList.emplace_back(Scev, NeedsFreeze);
1024 break;
1025 }
1026
1027 ScevList.emplace_back(
1028 GetBinOpExpr(Opcode, get<0>(LScevs[0]), get<0>(RScevs[0])),
1029 NeedsFreeze);
1030 ScevList.emplace_back(
1031 GetBinOpExpr(Opcode, get<0>(LScevs[1]), get<0>(RScevs[1])),
1032 NeedsFreeze);
1033 break;
1034 }
1035 default:
1036 // Just return the current SCEV if we haven't handled the instruction yet.
1037 LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
1038 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
1039 break;
1040 }
1041}
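
// For the select example in the comment above (illustration only): with
//   %offset = select i1 %cmp, i64 %a, i64 %b
//   %addr = getelementptr double, ptr %base, i64 %offset
// the GetElementPtr case forks the offset and produces the two SCEVs
// (%base + 8 * %a) and (%base + 8 * %b); each is then checked separately for
// computable bounds by the caller.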
1042
static SmallVector<PointerIntPair<const SCEV *, 1, bool>>
findForkedPointer(PredicatedScalarEvolution &PSE,
                  const DenseMap<Value *, const SCEV *> &StridesMap, Value *Ptr,
                  const Loop *L) {
1047 ScalarEvolution *SE = PSE.getSE();
1048 assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
  SmallVector<PointerIntPair<const SCEV *, 1, bool>> Scevs;
  findForkedSCEVs(SE, L, Ptr, Scevs, MaxForkedSCEVDepth);
1051
1052 // For now, we will only accept a forked pointer with two possible SCEVs
1053 // that are either SCEVAddRecExprs or loop invariant.
1054 if (Scevs.size() == 2 &&
1055 (isa<SCEVAddRecExpr>(get<0>(Scevs[0])) ||
1056 SE->isLoopInvariant(get<0>(Scevs[0]), L)) &&
1057 (isa<SCEVAddRecExpr>(get<0>(Scevs[1])) ||
1058 SE->isLoopInvariant(get<0>(Scevs[1]), L))) {
1059 LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n");
1060 LLVM_DEBUG(dbgs() << "\t(1) " << *get<0>(Scevs[0]) << "\n");
1061 LLVM_DEBUG(dbgs() << "\t(2) " << *get<0>(Scevs[1]) << "\n");
1062 return Scevs;
1063 }
1064
1065 return {{replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false}};
1066}
1067
1068bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
1069 MemAccessInfo Access, Type *AccessTy,
                                          const DenseMap<Value *, const SCEV *> &StridesMap,
                                          DenseMap<Value *, unsigned> &DepSetId,
                                          Loop *TheLoop, unsigned &RunningDepId,
1073 unsigned ASId, bool ShouldCheckWrap,
1074 bool Assume) {
1075 Value *Ptr = Access.getPointer();
1076
  SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs =
      findForkedPointer(PSE, StridesMap, Ptr, TheLoop);
1079
1080 for (auto &P : TranslatedPtrs) {
1081 const SCEV *PtrExpr = get<0>(P);
1082 if (!hasComputableBounds(PSE, Ptr, PtrExpr, TheLoop, Assume))
1083 return false;
1084
1085 // When we run after a failing dependency check we have to make sure
1086 // we don't have wrapping pointers.
1087 if (ShouldCheckWrap) {
1088 // Skip wrap checking when translating pointers.
1089 if (TranslatedPtrs.size() > 1)
1090 return false;
1091
1092 if (!isNoWrap(PSE, StridesMap, Ptr, AccessTy, TheLoop)) {
1093 auto *Expr = PSE.getSCEV(Ptr);
1094 if (!Assume || !isa<SCEVAddRecExpr>(Expr))
          return false;
        PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
      }
1098 }
1099 // If there's only one option for Ptr, look it up after bounds and wrap
1100 // checking, because assumptions might have been added to PSE.
1101 if (TranslatedPtrs.size() == 1)
1102 TranslatedPtrs[0] = {replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr),
1103 false};
1104 }
1105
1106 for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) {
1107 // The id of the dependence set.
1108 unsigned DepId;
1109
1110 if (isDependencyCheckNeeded()) {
1111 Value *Leader = DepCands.getLeaderValue(Access).getPointer();
1112 unsigned &LeaderId = DepSetId[Leader];
1113 if (!LeaderId)
1114 LeaderId = RunningDepId++;
1115 DepId = LeaderId;
1116 } else
1117 // Each access has its own dependence set.
1118 DepId = RunningDepId++;
1119
1120 bool IsWrite = Access.getInt();
1121 RtCheck.insert(TheLoop, Ptr, PtrExpr, AccessTy, IsWrite, DepId, ASId, PSE,
1122 NeedsFreeze);
1123 LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
1124 }
1125
1126 return true;
1127}
1128
1129bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
1130 ScalarEvolution *SE, Loop *TheLoop,
1131 const DenseMap<Value *, const SCEV *> &StridesMap,
1132 Value *&UncomputablePtr, bool ShouldCheckWrap) {
1133 // Find pointers with computable bounds. We are going to use this information
1134 // to place a runtime bound check.
1135 bool CanDoRT = true;
1136
1137 bool MayNeedRTCheck = false;
1138 if (!IsRTCheckAnalysisNeeded) return true;
1139
1140 bool IsDepCheckNeeded = isDependencyCheckNeeded();
1141
  // We assign a consecutive id to accesses from different alias sets.
  // Accesses between different groups don't need to be checked.
1144 unsigned ASId = 0;
1145 for (auto &AS : AST) {
1146 int NumReadPtrChecks = 0;
1147 int NumWritePtrChecks = 0;
1148 bool CanDoAliasSetRT = true;
1149 ++ASId;
1150 auto ASPointers = AS.getPointers();
1151
    // We assign consecutive ids to accesses from different dependence sets.
    // Accesses within the same set don't need a runtime check.
1154 unsigned RunningDepId = 1;
1156
1158
    DenseMap<Value *, unsigned> DepSetId;

    SmallVector<std::pair<MemAccessInfo, Type *>, 4> Retries;

    // First, count how many write and read accesses are in the alias set. Also
    // collect MemAccessInfos for later.
    SmallVector<MemAccessInfo, 2> AccessInfos;
    for (const Value *Ptr_ : ASPointers) {
1163 Value *Ptr = const_cast<Value *>(Ptr_);
1164 bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
1165 if (IsWrite)
1166 ++NumWritePtrChecks;
1167 else
1168 ++NumReadPtrChecks;
1169 AccessInfos.emplace_back(Ptr, IsWrite);
1170 }
1171
1172 // We do not need runtime checks for this alias set, if there are no writes
1173 // or a single write and no reads.
1174 if (NumWritePtrChecks == 0 ||
1175 (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) {
1176 assert((ASPointers.size() <= 1 ||
1177 all_of(ASPointers,
1178 [this](const Value *Ptr) {
1179 MemAccessInfo AccessWrite(const_cast<Value *>(Ptr),
1180 true);
1181 return DepCands.findValue(AccessWrite) == DepCands.end();
1182 })) &&
1183 "Can only skip updating CanDoRT below, if all entries in AS "
1184 "are reads or there is at most 1 entry");
1185 continue;
1186 }
1187
1188 for (auto &Access : AccessInfos) {
1189 for (const auto &AccessTy : Accesses[Access]) {
1190 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1191 DepSetId, TheLoop, RunningDepId, ASId,
1192 ShouldCheckWrap, false)) {
1193 LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
1194 << *Access.getPointer() << '\n');
1195 Retries.push_back({Access, AccessTy});
1196 CanDoAliasSetRT = false;
1197 }
1198 }
1199 }
1200
1201 // Note that this function computes CanDoRT and MayNeedRTCheck
1202 // independently. For example CanDoRT=false, MayNeedRTCheck=false means that
1203 // we have a pointer for which we couldn't find the bounds but we don't
1204 // actually need to emit any checks so it does not matter.
1205 //
1206 // We need runtime checks for this alias set, if there are at least 2
1207 // dependence sets (in which case RunningDepId > 2) or if we need to re-try
1208 // any bound checks (because in that case the number of dependence sets is
1209 // incomplete).
1210 bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty();
1211
1212 // We need to perform run-time alias checks, but some pointers had bounds
1213 // that couldn't be checked.
1214 if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) {
1215 // Reset the CanDoSetRt flag and retry all accesses that have failed.
1216 // We know that we need these checks, so we can now be more aggressive
1217 // and add further checks if required (overflow checks).
1218 CanDoAliasSetRT = true;
1219 for (auto Retry : Retries) {
1220 MemAccessInfo Access = Retry.first;
1221 Type *AccessTy = Retry.second;
1222 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1223 DepSetId, TheLoop, RunningDepId, ASId,
1224 ShouldCheckWrap, /*Assume=*/true)) {
1225 CanDoAliasSetRT = false;
1226 UncomputablePtr = Access.getPointer();
1227 break;
1228 }
1229 }
1230 }
1231
1232 CanDoRT &= CanDoAliasSetRT;
1233 MayNeedRTCheck |= NeedsAliasSetRTCheck;
1234 ++ASId;
1235 }
1236
1237 // If the pointers that we would use for the bounds comparison have different
1238 // address spaces, assume the values aren't directly comparable, so we can't
1239 // use them for the runtime check. We also have to assume they could
1240 // overlap. In the future there should be metadata for whether address spaces
1241 // are disjoint.
1242 unsigned NumPointers = RtCheck.Pointers.size();
1243 for (unsigned i = 0; i < NumPointers; ++i) {
1244 for (unsigned j = i + 1; j < NumPointers; ++j) {
1245 // Only need to check pointers between two different dependency sets.
1246 if (RtCheck.Pointers[i].DependencySetId ==
1247 RtCheck.Pointers[j].DependencySetId)
1248 continue;
1249 // Only need to check pointers in the same alias set.
1250 if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId)
1251 continue;
1252
1253 Value *PtrI = RtCheck.Pointers[i].PointerValue;
1254 Value *PtrJ = RtCheck.Pointers[j].PointerValue;
1255
1256 unsigned ASi = PtrI->getType()->getPointerAddressSpace();
1257 unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
1258 if (ASi != ASj) {
1259 LLVM_DEBUG(
1260 dbgs() << "LAA: Runtime check would require comparison between"
1261 " different address spaces\n");
1262 return false;
1263 }
1264 }
1265 }
1266
1267 if (MayNeedRTCheck && CanDoRT)
1268 RtCheck.generateChecks(DepCands, IsDepCheckNeeded);
1269
1270 LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
1271 << " pointer comparisons.\n");
1272
1273 // If we can do run-time checks, but there are no checks, no runtime checks
1274 // are needed. This can happen when all pointers point to the same underlying
1275 // object for example.
1276 RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck;
1277
1278 bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT;
1279 if (!CanDoRTIfNeeded)
1280 RtCheck.reset();
1281 return CanDoRTIfNeeded;
1282}
1283
1284void AccessAnalysis::processMemAccesses() {
1285 // We process the set twice: first we process read-write pointers, last we
1286 // process read-only pointers. This allows us to skip dependence tests for
1287 // read-only pointers.
1288
1289 LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
1290 LLVM_DEBUG(dbgs() << " AST: "; AST.dump());
1291 LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
1292 LLVM_DEBUG({
1293 for (auto A : Accesses)
1294 dbgs() << "\t" << *A.first.getPointer() << " ("
1295 << (A.first.getInt()
1296 ? "write"
1297 : (ReadOnlyPtr.count(A.first.getPointer()) ? "read-only"
1298 : "read"))
1299 << ")\n";
1300 });
1301
1302 // The AliasSetTracker has nicely partitioned our pointers by metadata
1303 // compatibility and potential for underlying-object overlap. As a result, we
1304 // only need to check for potential pointer dependencies within each alias
1305 // set.
1306 for (const auto &AS : AST) {
1307 // Note that both the alias-set tracker and the alias sets themselves used
1308 // ordered collections internally and so the iteration order here is
1309 // deterministic.
1310 auto ASPointers = AS.getPointers();
1311
1312 bool SetHasWrite = false;
1313
1314 // Map of pointers to last access encountered.
1315 typedef DenseMap<const Value*, MemAccessInfo> UnderlyingObjToAccessMap;
1316 UnderlyingObjToAccessMap ObjToLastAccess;
1317
1318 // Set of access to check after all writes have been processed.
1319 PtrAccessMap DeferredAccesses;
1320
1321 // Iterate over each alias set twice, once to process read/write pointers,
1322 // and then to process read-only pointers.
1323 for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
1324 bool UseDeferred = SetIteration > 0;
1325 PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses;
1326
1327 for (const Value *Ptr_ : ASPointers) {
1328 Value *Ptr = const_cast<Value *>(Ptr_);
1329
1330 // For a single memory access in AliasSetTracker, Accesses may contain
1331 // both read and write, and they both need to be handled for CheckDeps.
1332 for (const auto &AC : S) {
1333 if (AC.first.getPointer() != Ptr)
1334 continue;
1335
1336 bool IsWrite = AC.first.getInt();
1337
1338 // If we're using the deferred access set, then it contains only
1339 // reads.
1340 bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
1341 if (UseDeferred && !IsReadOnlyPtr)
1342 continue;
1343 // Otherwise, the pointer must be in the PtrAccessSet, either as a
1344 // read or a write.
1345 assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
1346 S.count(MemAccessInfo(Ptr, false))) &&
1347 "Alias-set pointer not in the access set?");
1348
1349 MemAccessInfo Access(Ptr, IsWrite);
1350 DepCands.insert(Access);
1351
1352 // Memorize read-only pointers for later processing and skip them in
1353 // the first round (they need to be checked after we have seen all
1354 // write pointers). Note: we also mark pointer that are not
1355 // consecutive as "read-only" pointers (so that we check
1356 // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
1357 if (!UseDeferred && IsReadOnlyPtr) {
1358 // We only use the pointer keys, the types vector values don't
1359 // matter.
1360 DeferredAccesses.insert({Access, {}});
1361 continue;
1362 }
1363
1364 // If this is a write - check other reads and writes for conflicts. If
1365 // this is a read only check other writes for conflicts (but only if
1366 // there is no other write to the ptr - this is an optimization to
1367 // catch "a[i] = a[i] + " without having to do a dependence check).
1368 if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
1369 CheckDeps.push_back(Access);
1370 IsRTCheckAnalysisNeeded = true;
1371 }
1372
1373 if (IsWrite)
1374 SetHasWrite = true;
1375
1376 // Create sets of pointers connected by a shared alias set and
1377 // underlying object.
1378 typedef SmallVector<const Value *, 16> ValueVector;
1379 ValueVector TempObjects;
1380
1381 UnderlyingObjects[Ptr] = {};
1382 SmallVector<const Value *, 16> &UOs = UnderlyingObjects[Ptr];
          ::getUnderlyingObjects(Ptr, UOs, LI);
          LLVM_DEBUG(dbgs()
                     << "Underlying objects for pointer " << *Ptr << "\n");
1386 for (const Value *UnderlyingObj : UOs) {
            // nullptr never aliases; don't join sets for pointers that have
            // "null" in their UnderlyingObjects list.
            if (isa<ConstantPointerNull>(UnderlyingObj) &&
                !NullPointerIsDefined(
                    TheLoop->getHeader()->getParent(),
                    UnderlyingObj->getType()->getPointerAddressSpace()))
1393 continue;
1394
1395 UnderlyingObjToAccessMap::iterator Prev =
1396 ObjToLastAccess.find(UnderlyingObj);
1397 if (Prev != ObjToLastAccess.end())
1398 DepCands.unionSets(Access, Prev->second);
1399
1400 ObjToLastAccess[UnderlyingObj] = Access;
1401 LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n");
1402 }
1403 }
1404 }
1405 }
1406 }
1407}
1408
1409/// Return true if an AddRec pointer \p Ptr is unsigned non-wrapping,
1410/// i.e. monotonically increasing/decreasing.
1411static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
1412 PredicatedScalarEvolution &PSE, const Loop *L) {
1413
  // FIXME: This should probably only return true for NUW.
  if (AR->getNoWrapFlags(SCEV::NoWrapMask))
    return true;

  if (PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
    return true;

1421 // Scalar evolution does not propagate the non-wrapping flags to values that
1422 // are derived from a non-wrapping induction variable because non-wrapping
1423 // could be flow-sensitive.
1424 //
1425 // Look through the potentially overflowing instruction to try to prove
1426 // non-wrapping for the *specific* value of Ptr.
1427
1428 // The arithmetic implied by an inbounds GEP can't overflow.
1429 auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
1430 if (!GEP || !GEP->isInBounds())
1431 return false;
1432
1433 // Make sure there is only one non-const index and analyze that.
1434 Value *NonConstIndex = nullptr;
1435 for (Value *Index : GEP->indices())
1436 if (!isa<ConstantInt>(Index)) {
1437 if (NonConstIndex)
1438 return false;
1439 NonConstIndex = Index;
1440 }
1441 if (!NonConstIndex)
1442 // The recurrence is on the pointer, ignore for now.
1443 return false;
1444
1445 // The index in GEP is signed. It is non-wrapping if it's derived from a NSW
1446 // AddRec using a NSW operation.
1447 if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(NonConstIndex))
1448 if (OBO->hasNoSignedWrap() &&
        // Assume a constant for the other operand so that the AddRec can be
        // easily found.
1451 isa<ConstantInt>(OBO->getOperand(1))) {
1452 auto *OpScev = PSE.getSCEV(OBO->getOperand(0));
1453
1454 if (auto *OpAR = dyn_cast<SCEVAddRecExpr>(OpScev))
1455 return OpAR->getLoop() == L && OpAR->getNoWrapFlags(SCEV::FlagNSW);
1456 }
1457
1458 return false;
1459}
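
// Illustrative example (not from the source): for
//   %mul = mul nsw i32 %i, 4
//   %gep = getelementptr inbounds float, ptr %A, i32 %mul
// the inbounds GEP plus the NSW flag on %mul (whose other operand is the
// constant 4, and whose first operand is an NSW AddRec for %i) are enough to
// conclude that the pointer recurrence for %gep does not wrap.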
1460
/// Check whether the access through \p Ptr has a constant stride.
std::optional<int64_t> llvm::getPtrStride(PredicatedScalarEvolution &PSE,
                                          Type *AccessTy, Value *Ptr,
                                          const Loop *Lp,
                                          const DenseMap<Value *, const SCEV *> &StridesMap,
                                          bool Assume, bool ShouldCheckWrap) {
1467 Type *Ty = Ptr->getType();
1468 assert(Ty->isPointerTy() && "Unexpected non-ptr");
1469
1470 if (isa<ScalableVectorType>(AccessTy)) {
1471 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy
1472 << "\n");
1473 return std::nullopt;
1474 }
1475
1476 const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
1477
1478 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
1479 if (Assume && !AR)
1480 AR = PSE.getAsAddRec(Ptr);
1481
1482 if (!AR) {
1483 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr
1484 << " SCEV: " << *PtrScev << "\n");
1485 return std::nullopt;
1486 }
1487
1488 // The access function must stride over the innermost loop.
1489 if (Lp != AR->getLoop()) {
1490 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop "
1491 << *Ptr << " SCEV: " << *AR << "\n");
1492 return std::nullopt;
1493 }
1494
1495 // Check the step is constant.
1496 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
1497
1498 // Calculate the pointer stride and check if it is constant.
1499 const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
1500 if (!C) {
1501 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr
1502 << " SCEV: " << *AR << "\n");
1503 return std::nullopt;
1504 }
1505
1506 auto &DL = Lp->getHeader()->getModule()->getDataLayout();
1507 TypeSize AllocSize = DL.getTypeAllocSize(AccessTy);
1508 int64_t Size = AllocSize.getFixedValue();
1509 const APInt &APStepVal = C->getAPInt();
1510
1511 // Huge step value - give up.
1512 if (APStepVal.getBitWidth() > 64)
1513 return std::nullopt;
1514
1515 int64_t StepVal = APStepVal.getSExtValue();
1516
1517 // Strided access.
1518 int64_t Stride = StepVal / Size;
1519 int64_t Rem = StepVal % Size;
1520 if (Rem)
1521 return std::nullopt;
1522
1523 if (!ShouldCheckWrap)
1524 return Stride;
1525
1526 // The address calculation must not wrap. Otherwise, a dependence could be
1527 // inverted.
1528 if (isNoWrapAddRec(Ptr, AR, PSE, Lp))
1529 return Stride;
1530
1531 // An inbounds getelementptr that is a AddRec with a unit stride
1532 // cannot wrap per definition. If it did, the result would be poison
1533 // and any memory access dependent on it would be immediate UB
1534 // when executed.
1535 if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
1536 GEP && GEP->isInBounds() && (Stride == 1 || Stride == -1))
1537 return Stride;
1538
  // If the null pointer is undefined, then an access sequence which would
1540 // otherwise access it can be assumed not to unsigned wrap. Note that this
1541 // assumes the object in memory is aligned to the natural alignment.
1542 unsigned AddrSpace = Ty->getPointerAddressSpace();
1543 if (!NullPointerIsDefined(Lp->getHeader()->getParent(), AddrSpace) &&
1544 (Stride == 1 || Stride == -1))
1545 return Stride;
1546
  if (Assume) {
    PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
    LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n"
1550 << "LAA: Pointer: " << *Ptr << "\n"
1551 << "LAA: SCEV: " << *AR << "\n"
1552 << "LAA: Added an overflow assumption\n");
1553 return Stride;
1554 }
1555 LLVM_DEBUG(
1556 dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
1557 << *Ptr << " SCEV: " << *AR << "\n");
1558 return std::nullopt;
1559}
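
// For example (illustration): a 4-byte access whose pointer SCEV is
// {%A,+,8}<%L> yields a stride of 2 (in elements), {%A,+,-4}<%L> yields -1,
// and {%A,+,6}<%L> yields std::nullopt because 6 is not a multiple of the
// element size.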
1560
1561std::optional<int> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA,
1562 Type *ElemTyB, Value *PtrB,
1563 const DataLayout &DL,
1564 ScalarEvolution &SE, bool StrictCheck,
1565 bool CheckType) {
1566 assert(PtrA && PtrB && "Expected non-nullptr pointers.");
1567
1568 // Make sure that A and B are different pointers.
1569 if (PtrA == PtrB)
1570 return 0;
1571
1572 // Make sure that the element types are the same if required.
1573 if (CheckType && ElemTyA != ElemTyB)
1574 return std::nullopt;
1575
1576 unsigned ASA = PtrA->getType()->getPointerAddressSpace();
1577 unsigned ASB = PtrB->getType()->getPointerAddressSpace();
1578
1579 // Check that the address spaces match.
1580 if (ASA != ASB)
1581 return std::nullopt;
1582 unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
1583
1584 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
1585 Value *PtrA1 = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1586 Value *PtrB1 = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1587
1588 int Val;
1589 if (PtrA1 == PtrB1) {
1590 // Retrieve the address space again as pointer stripping now tracks through
1591 // `addrspacecast`.
1592 ASA = cast<PointerType>(PtrA1->getType())->getAddressSpace();
1593 ASB = cast<PointerType>(PtrB1->getType())->getAddressSpace();
1594 // Check that the address spaces match and that the pointers are valid.
1595 if (ASA != ASB)
1596 return std::nullopt;
1597
1598 IdxWidth = DL.getIndexSizeInBits(ASA);
1599 OffsetA = OffsetA.sextOrTrunc(IdxWidth);
1600 OffsetB = OffsetB.sextOrTrunc(IdxWidth);
1601
1602 OffsetB -= OffsetA;
1603 Val = OffsetB.getSExtValue();
1604 } else {
1605 // Otherwise compute the distance with SCEV between the base pointers.
1606 const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
1607 const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
1608 const auto *Diff =
1609 dyn_cast<SCEVConstant>(SE.getMinusSCEV(PtrSCEVB, PtrSCEVA));
1610 if (!Diff)
1611 return std::nullopt;
1612 Val = Diff->getAPInt().getSExtValue();
1613 }
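// For example, with i32 elements (store size 4) and base offsets 0 and 8,
// Val is 8 and the computed Dist below is 2; an offset difference of 6 would
// fail the strict check because 1 * 4 != 6.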
1614 int Size = DL.getTypeStoreSize(ElemTyA);
1615 int Dist = Val / Size;
1616
1617 // Ensure that the calculated distance matches the type-based one after
1618 // stripping all bitcasts from the provided pointers.
1619 if (!StrictCheck || Dist * Size == Val)
1620 return Dist;
1621 return std::nullopt;
1622}
1623
1624bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
1625 const DataLayout &DL, ScalarEvolution &SE,
1626 SmallVectorImpl<unsigned> &SortedIndices) {
1627 assert(llvm::all_of(
1628 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
1629 "Expected list of pointer operands.");
1630 // Walk over the pointers, and map each of them to an offset relative to
1631 // the first pointer in the array.
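// For example, given pointers to A[0], A[3] and A[1] over i32 elements, the
// element offsets are 0, 3 and 1; the offsets set orders them as 0, 1, 3, the
// accesses are not consecutive, and SortedIndices becomes {0, 2, 1}.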
1632 Value *Ptr0 = VL[0];
1633
1634 using DistOrdPair = std::pair<int64_t, int>;
1635 auto Compare = llvm::less_first();
1636 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
1637 Offsets.emplace(0, 0);
1638 int Cnt = 1;
1639 bool IsConsecutive = true;
1640 for (auto *Ptr : VL.drop_front()) {
1641 std::optional<int> Diff = getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, DL, SE,
1642 /*StrictCheck=*/true);
1643 if (!Diff)
1644 return false;
1645
1646 // Check if the pointer with the same offset is found.
1647 int64_t Offset = *Diff;
1648 auto Res = Offsets.emplace(Offset, Cnt);
1649 if (!Res.second)
1650 return false;
1651 // Consecutive order if the inserted element is the last one.
1652 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
1653 ++Cnt;
1654 }
1655 SortedIndices.clear();
1656 if (!IsConsecutive) {
1657 // Fill SortedIndices array only if it is non-consecutive.
1658 SortedIndices.resize(VL.size());
1659 Cnt = 0;
1660 for (const std::pair<int64_t, int> &Pair : Offsets) {
1661 SortedIndices[Cnt] = Pair.second;
1662 ++Cnt;
1663 }
1664 }
1665 return true;
1666}
1667
1668/// Returns true if the memory operations \p A and \p B are consecutive.
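/// For example, two i32 loads from A[i] and A[i+1] have a pointer difference
/// of exactly one element and are therefore consecutive.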
1669bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
1670 ScalarEvolution &SE, bool CheckType) {
1671 Value *PtrA = getLoadStorePointerOperand(A);
1672 Value *PtrB = getLoadStorePointerOperand(B);
1673 if (!PtrA || !PtrB)
1674 return false;
1675 Type *ElemTyA = getLoadStoreType(A);
1676 Type *ElemTyB = getLoadStoreType(B);
1677 std::optional<int> Diff =
1678 getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE,
1679 /*StrictCheck=*/true, CheckType);
1680 return Diff && *Diff == 1;
1681}
1682
1683void MemoryDepChecker::addAccess(StoreInst *SI) {
1684 visitPointers(SI->getPointerOperand(), *InnermostLoop,
1685 [this, SI](Value *Ptr) {
1686 Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
1687 InstMap.push_back(SI);
1688 ++AccessIdx;
1689 });
1690}
1691
1692void MemoryDepChecker::addAccess(LoadInst *LI) {
1693 visitPointers(LI->getPointerOperand(), *InnermostLoop,
1694 [this, LI](Value *Ptr) {
1695 Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
1696 InstMap.push_back(LI);
1697 ++AccessIdx;
1698 });
1699}
1700
1701MemoryDepChecker::VectorizationSafetyStatus
1702MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
1703 switch (Type) {
1704 case NoDep:
1705 case Forward:
1706 case BackwardVectorizable:
1707 return VectorizationSafetyStatus::Safe;
1708
1709 case Unknown:
1710 return VectorizationSafetyStatus::PossiblySafeWithRtChecks;
1711 case ForwardButPreventsForwarding:
1712 case Backward:
1713 case BackwardVectorizableButPreventsForwarding:
1714 case IndirectUnsafe:
1715 return VectorizationSafetyStatus::Unsafe;
1716 }
1717 llvm_unreachable("unexpected DepType!");
1718}
1719
1720bool MemoryDepChecker::Dependence::isBackward() const {
1721 switch (Type) {
1722 case NoDep:
1723 case Forward:
1724 case ForwardButPreventsForwarding:
1725 case Unknown:
1726 case IndirectUnsafe:
1727 return false;
1728
1729 case BackwardVectorizable:
1730 case Backward:
1731 case BackwardVectorizableButPreventsForwarding:
1732 return true;
1733 }
1734 llvm_unreachable("unexpected DepType!");
1735}
1736
1737bool MemoryDepChecker::Dependence::isPossiblyBackward() const {
1738 return isBackward() || Type == Unknown;
1739}
1740
1741bool MemoryDepChecker::Dependence::isForward() const {
1742 switch (Type) {
1743 case Forward:
1744 case ForwardButPreventsForwarding:
1745 return true;
1746
1747 case NoDep:
1748 case Unknown:
1749 case BackwardVectorizable:
1750 case Backward:
1751 case BackwardVectorizableButPreventsForwarding:
1752 case IndirectUnsafe:
1753 return false;
1754 }
1755 llvm_unreachable("unexpected DepType!");
1756}
1757
1758bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
1759 uint64_t TypeByteSize) {
1760 // If loads occur at a distance that is not a multiple of a feasible vector
1761 // factor, store-load forwarding does not take place.
1762 // Positive dependences might cause trouble because vectorizing them might
1763 // prevent store-load forwarding, making vectorized code run a lot slower.
1764 //   a[i] = a[i-3] ^ a[i-8];
1765 // The stores to a[i:i+1] don't align with the loads from a[i-3:i-2] and
1766 // hence on a typical architecture store-load forwarding does not take
1767 // place. Vectorizing in such cases does not make sense.
1768 // Store-load forwarding distance.
1769
1770 // After this many iterations store-to-load forwarding conflicts should not
1771 // cause any slowdowns.
1772 const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
1773 // Maximum vector factor.
1774 uint64_t MaxVFWithoutSLForwardIssues = std::min(
1775 VectorizerParams::MaxVectorWidth * TypeByteSize, MinDepDistBytes);
1776
1777 // Compute the smallest VF at which the store and load would be misaligned.
1778 for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues;
1779 VF *= 2) {
1780 // If the number of vector iterations between the store and the load is
1781 // small, we could incur conflicts.
1782 if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
1783 MaxVFWithoutSLForwardIssues = (VF >> 1);
1784 break;
1785 }
1786 }
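// For example, with TypeByteSize = 4 and Distance = 17, the loop above stops
// at VF = 8 (17 % 8 == 1 and 17 / 8 == 2 < 32), reducing
// MaxVFWithoutSLForwardIssues to 4, which is below 2 * TypeByteSize, so a
// forwarding conflict is reported below.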
1787
1788 if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
1789 LLVM_DEBUG(
1790 dbgs() << "LAA: Distance " << Distance
1791 << " that could cause a store-load forwarding conflict\n");
1792 return true;
1793 }
1794
1795 if (MaxVFWithoutSLForwardIssues < MinDepDistBytes &&
1796 MaxVFWithoutSLForwardIssues !=
1797 VectorizerParams::MaxVectorWidth * TypeByteSize)
1798 MinDepDistBytes = MaxVFWithoutSLForwardIssues;
1799 return false;
1800}
1801
1802void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
1803 if (Status < S)
1804 Status = S;
1805}
1806
1807/// Given a dependence-distance \p Dist between two
1808/// memory accesses, that have the same stride whose absolute value is given
1809/// in \p Stride, and that have the same type size \p TypeByteSize,
1810/// in a loop whose backedge-taken count is \p BackedgeTakenCount, check if it is
1811/// possible to prove statically that the dependence distance is larger
1812/// than the range that the accesses will travel through the execution of
1813/// the loop. If so, return true; false otherwise. This is useful for
1814/// example in loops such as the following (PR31098):
1815/// for (i = 0; i < D; ++i) {
1816/// = out[i];
1817/// out[i+D] =
1818/// }
1819static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
1820 const SCEV &BackedgeTakenCount,
1821 const SCEV &Dist, uint64_t Stride,
1822 uint64_t TypeByteSize) {
1823
1824 // If we can prove that
1825 // (**) |Dist| > BackedgeTakenCount * Step
1826 // where Step is the absolute stride of the memory accesses in bytes,
1827 // then there is no dependence.
1828 //
1829 // Rationale:
1830 // We basically want to check if the absolute distance (|Dist/Step|)
1831 // is >= the loop iteration count (or > BackedgeTakenCount).
1832 // This is equivalent to the Strong SIV Test (Practical Dependence Testing,
1833 // Section 4.2.1); Note, that for vectorization it is sufficient to prove
1834 // that the dependence distance is >= VF; This is checked elsewhere.
1835 // But in some cases we can prune dependence distances early, and
1836 // even before selecting the VF, and without a runtime test, by comparing
1837 // the distance against the loop iteration count. Since the vectorized code
1838 // will be executed only if LoopCount >= VF, proving distance >= LoopCount
1839 // also guarantees that distance >= VF.
1840 //
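// For example, with a backedge-taken count of 100, Stride = 1 and
// TypeByteSize = 4, any constant dependence distance of 401 bytes or more
// satisfies (**), since 401 > 100 * 4, and the accesses are independent.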
1841 const uint64_t ByteStride = Stride * TypeByteSize;
1842 const SCEV *Step = SE.getConstant(BackedgeTakenCount.getType(), ByteStride);
1843 const SCEV *Product = SE.getMulExpr(&BackedgeTakenCount, Step);
1844
1845 const SCEV *CastedDist = &Dist;
1846 const SCEV *CastedProduct = Product;
1847 uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Dist.getType());
1848 uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Product->getType());
1849
1850 // The dependence distance can be positive/negative, so we sign extend Dist;
1851 // The multiplication of the absolute stride in bytes and the
1852 // backedgeTakenCount is non-negative, so we zero extend Product.
1853 if (DistTypeSizeBits > ProductTypeSizeBits)
1854 CastedProduct = SE.getZeroExtendExpr(Product, Dist.getType());
1855 else
1856 CastedDist = SE.getNoopOrSignExtend(&Dist, Product->getType());
1857
1858 // Is Dist - (BackedgeTakenCount * Step) > 0 ?
1859 // (If so, then we have proven (**) because |Dist| >= Dist)
1860 const SCEV *Minus = SE.getMinusSCEV(CastedDist, CastedProduct);
1861 if (SE.isKnownPositive(Minus))
1862 return true;
1863
1864 // Second try: Is -Dist - (BackedgeTakenCount * Step) > 0 ?
1865 // (If so, then we have proven (**) because |Dist| >= -1*Dist)
1866 const SCEV *NegDist = SE.getNegativeSCEV(CastedDist);
1867 Minus = SE.getMinusSCEV(NegDist, CastedProduct);
1868 if (SE.isKnownPositive(Minus))
1869 return true;
1870
1871 return false;
1872}
1873
1874/// Check the dependence for two accesses with the same stride \p Stride.
1875/// \p Distance is the positive distance and \p TypeByteSize is the type size
1876/// in bytes.
1877///
1878/// \returns true if they are independent.
1879static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
1880 uint64_t TypeByteSize) {
1881 assert(Stride > 1 && "The stride must be greater than 1");
1882 assert(TypeByteSize > 0 && "The type size in byte must be non-zero");
1883 assert(Distance > 0 && "The distance must be non-zero");
1884
1885 // Skip if the distance is not a multiple of the type byte size.
1886 if (Distance % TypeByteSize)
1887 return false;
1888
1889 uint64_t ScaledDist = Distance / TypeByteSize;
1890
1891 // No dependence if the scaled distance is not a multiple of the stride.
1892 // E.g.
1893 // for (i = 0; i < 1024 ; i += 4)
1894 // A[i+2] = A[i] + 1;
1895 //
1896 // Two accesses in memory (scaled distance is 2, stride is 4):
1897 // | A[0] | | | | A[4] | | | |
1898 // | | | A[2] | | | | A[6] | |
1899 //
1900 // E.g.
1901 // for (i = 0; i < 1024 ; i += 3)
1902 // A[i+4] = A[i] + 1;
1903 //
1904 // Two accesses in memory (scaled distance is 4, stride is 3):
1905 // | A[0] | | | A[3] | | | A[6] | | |
1906 // | | | | | A[4] | | | A[7] | |
1907 return ScaledDist % Stride;
1908}
1909
1910/// Returns true if any of the underlying objects has a loop varying address,
1911/// i.e. may change in \p L.
1912static bool
1913isLoopVariantIndirectAddress(ArrayRef<const Value *> UnderlyingObjects,
1914 ScalarEvolution &SE, const Loop *L) {
1915 return any_of(UnderlyingObjects, [&SE, L](const Value *UO) {
1916 return !SE.isLoopInvariant(SE.getSCEV(const_cast<Value *>(UO)), L);
1917 });
1918}
1919
1920namespace {
1921struct DepDistanceStrideAndSizeInfo {
1922 const SCEV *Dist;
1923 uint64_t Stride;
1924 uint64_t TypeByteSize;
1925 bool AIsWrite;
1926 bool BIsWrite;
1927
1928 DepDistanceStrideAndSizeInfo(const SCEV *Dist, uint64_t Stride,
1929 uint64_t TypeByteSize, bool AIsWrite,
1930 bool BIsWrite)
1931 : Dist(Dist), Stride(Stride), TypeByteSize(TypeByteSize),
1932 AIsWrite(AIsWrite), BIsWrite(BIsWrite) {}
1933};
1934} // namespace
1935
1936// Get the dependence distance, stride, type size and whether it is a write for
1937// the dependence between A and B. Returns a DepType if we can prove there's
1938// no dependence or the analysis fails. Outlined to a helper to limit the scope
1939// of various temporary variables, like A/BPtr, StrideA/BPtr and others.
1940// Returns either the dependence result, if it could already be determined, or a
1941// struct containing (Distance, Stride, TypeSize, AIsWrite, BIsWrite).
1942static std::variant<MemoryDepChecker::Dependence::DepType,
1943 DepDistanceStrideAndSizeInfo>
1944getDependenceDistanceStrideAndSize(
1945 const AccessAnalysis::MemAccessInfo &A, Instruction *AInst,
1946 const AccessAnalysis::MemAccessInfo &B, Instruction *BInst,
1947 const DenseMap<Value *, const SCEV *> &Strides,
1948 const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
1949 PredicatedScalarEvolution &PSE, const Loop *InnermostLoop) {
1950 auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
1951 auto &SE = *PSE.getSE();
1952 auto [APtr, AIsWrite] = A;
1953 auto [BPtr, BIsWrite] = B;
1954
1955 // Two reads are independent.
1956 if (!AIsWrite && !BIsWrite)
1957 return MemoryDepChecker::Dependence::NoDep;
1958
1959 Type *ATy = getLoadStoreType(AInst);
1960 Type *BTy = getLoadStoreType(BInst);
1961
1962 // We cannot check pointers in different address spaces.
1963 if (APtr->getType()->getPointerAddressSpace() !=
1964 BPtr->getType()->getPointerAddressSpace())
1965 return MemoryDepChecker::Dependence::Unknown;
1966
1967 int64_t StrideAPtr =
1968 getPtrStride(PSE, ATy, APtr, InnermostLoop, Strides, true).value_or(0);
1969 int64_t StrideBPtr =
1970 getPtrStride(PSE, BTy, BPtr, InnermostLoop, Strides, true).value_or(0);
1971
1972 const SCEV *Src = PSE.getSCEV(APtr);
1973 const SCEV *Sink = PSE.getSCEV(BPtr);
1974
1975 // If the induction step is negative we have to invert source and sink of the
1976 // dependence when measuring the distance between them. We should not swap
1977 // AIsWrite with BIsWrite, as their uses expect them in program order.
1978 if (StrideAPtr < 0) {
1979 std::swap(Src, Sink);
1980 std::swap(AInst, BInst);
1981 }
1982
1983 const SCEV *Dist = SE.getMinusSCEV(Sink, Src);
1984
1985 LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
1986 << "(Induction step: " << StrideAPtr << ")\n");
1987 LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst
1988 << ": " << *Dist << "\n");
1989
1990 // We need accesses where the addresses of the accessed underlying objects
1991 // do not change within the loop.
1992 if (isLoopVariantIndirectAddress(UnderlyingObjects.find(APtr)->second, SE,
1993 InnermostLoop) ||
1994 isLoopVariantIndirectAddress(UnderlyingObjects.find(BPtr)->second, SE,
1995 InnermostLoop))
1996 return MemoryDepChecker::Dependence::IndirectUnsafe;
1997
1998 // Need accesses with constant stride. We don't want to vectorize
1999 // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap
2000 // in the address space.
2001 if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) {
2002 LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
2003 return MemoryDepChecker::Dependence::Unknown;
2004 }
2005
2006 uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
2007 bool HasSameSize =
2008 DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
2009 if (!HasSameSize)
2010 TypeByteSize = 0;
2011 uint64_t Stride = std::abs(StrideAPtr);
2012 return DepDistanceStrideAndSizeInfo(Dist, Stride, TypeByteSize, AIsWrite,
2013 BIsWrite);
2014}
2015
2016MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
2017 const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
2018 unsigned BIdx, const DenseMap<Value *, const SCEV *> &Strides,
2019 const DenseMap<Value *, SmallVector<const Value *, 16>>
2020 &UnderlyingObjects) {
2021 assert(AIdx < BIdx && "Must pass arguments in program order");
2022
2023 // Get the dependence distance, stride, type size and which access writes
2024 // for the dependence between A and B.
2025 auto Res = getDependenceDistanceStrideAndSize(
2026 A, InstMap[AIdx], B, InstMap[BIdx], Strides, UnderlyingObjects, PSE,
2027 InnermostLoop);
2028 if (std::holds_alternative<Dependence::DepType>(Res))
2029 return std::get<Dependence::DepType>(Res);
2030
2031 const auto &[Dist, Stride, TypeByteSize, AIsWrite, BIsWrite] =
2032 std::get<DepDistanceStrideAndSizeInfo>(Res);
2033 bool HasSameSize = TypeByteSize > 0;
2034
2035 ScalarEvolution &SE = *PSE.getSE();
2036 auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
2037 // If the distance between the accesses is larger than their absolute stride
2038 // multiplied by the backedge-taken count, the accesses are independent, i.e.
2039 // they are far enough apart that they won't access the same location
2040 // across all loop iterations.
2041 if (!isa<SCEVCouldNotCompute>(Dist) && HasSameSize &&
2042 isSafeDependenceDistance(DL, SE, *(PSE.getBackedgeTakenCount()), *Dist,
2043 Stride, TypeByteSize))
2044 return Dependence::NoDep;
2045
2046 const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
2047 if (!C) {
2048 LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
2049 FoundNonConstantDistanceDependence = true;
2050 return Dependence::Unknown;
2051 }
2052
2053 const APInt &Val = C->getAPInt();
2054 int64_t Distance = Val.getSExtValue();
2055
2056 // If the distance between accesses and their strides are known constants,
2057 // check whether the accesses interlace each other.
2058 if (std::abs(Distance) > 0 && Stride > 1 && HasSameSize &&
2059 areStridedAccessesIndependent(std::abs(Distance), Stride, TypeByteSize)) {
2060 LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
2061 return Dependence::NoDep;
2062 }
2063
2064 // Negative distances are not plausible dependencies.
2065 if (Val.isNegative()) {
2066 bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
2067 // Check if the first access writes to a location that is read in a later
2068 // iteration, where the distance between them is not a multiple of a vector
2069 // factor and relatively small.
2070 //
2071 // NOTE: There is no need to update MaxSafeVectorWidthInBits after call to
2072 // couldPreventStoreLoadForward, even if it changed MinDepDistBytes, since a
2073 // forward dependency will allow vectorization using any width.
2074 if (IsTrueDataDependence && EnableForwardingConflictDetection &&
2075 (!HasSameSize || couldPreventStoreLoadForward(Val.abs().getZExtValue(),
2076 TypeByteSize))) {
2077 LLVM_DEBUG(dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
2078 return Dependence::ForwardButPreventsForwarding;
2079 }
2080
2081 LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n");
2082 return Dependence::Forward;
2083 }
2084
2085 // Write to the same location with the same size.
2086 if (Val == 0) {
2087 if (HasSameSize)
2088 return Dependence::Forward;
2089 LLVM_DEBUG(
2090 dbgs() << "LAA: Zero dependence difference but different type sizes\n");
2091 return Dependence::Unknown;
2092 }
2093
2094 assert(Val.isStrictlyPositive() && "Expect a positive value");
2095
2096 if (!HasSameSize) {
2097 LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
2098 "different type sizes\n");
2099 return Dependence::Unknown;
2100 }
2101
2102 // Bail out early if passed-in parameters make vectorization not feasible.
2103 unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
2104 VectorizerParams::VectorizationFactor : 1);
2105 unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
2106 VectorizerParams::VectorizationInterleave : 1);
2107 // The minimum number of iterations for a vectorized/unrolled version.
2108 unsigned MinNumIter = std::max(ForcedFactor * ForcedUnroll, 2U);
2109
2110 // It's not vectorizable if the distance is smaller than the minimum distance
2111 // needed for a vectorized/unrolled version. Vectorizing one iteration in
2112 // front needs TypeByteSize * Stride. Vectorizing the last iteration needs
2113 // TypeByteSize (no need to add the last gap distance).
2114 //
2115 // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
2116 // foo(int *A) {
2117 // int *B = (int *)((char *)A + 14);
2118 // for (i = 0 ; i < 1024 ; i += 2)
2119 // B[i] = A[i] + 1;
2120 // }
2121 //
2122 // Two accesses in memory (stride is 2):
2123 // | A[0] | | A[2] | | A[4] | | A[6] | |
2124 // | B[0] | | B[2] | | B[4] |
2125 //
2126 // Distance needs for vectorizing iterations except the last iteration:
2127 // 4 * 2 * (MinNumIter - 1). Distance needs for the last iteration: 4.
2128 // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
2129 //
2130 // If MinNumIter is 2, it is vectorizable as the minimum distance needed is
2131 // 12, which is less than distance.
2132 //
2133 // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
2134 // the minimum distance needed is 28, which is greater than distance. It is
2135 // not safe to do vectorization.
2136 uint64_t MinDistanceNeeded =
2137 TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize;
2138 if (MinDistanceNeeded > static_cast<uint64_t>(Distance)) {
2139 LLVM_DEBUG(dbgs() << "LAA: Failure because of positive distance "
2140 << Distance << '\n');
2141 return Dependence::Backward;
2142 }
2143
2144 // Unsafe if the minimum distance needed is greater than the smallest
2145 // dependence distance.
2146 if (MinDistanceNeeded > MinDepDistBytes) {
2147 LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
2148 << MinDistanceNeeded << " size in bytes\n");
2149 return Dependence::Backward;
2150 }
2151
2152 // Positive distance bigger than max vectorization factor.
2153 // FIXME: Should use max factor instead of max distance in bytes, which could
2154 // not handle different types.
2155 // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
2156 // void foo (int *A, char *B) {
2157 // for (unsigned i = 0; i < 1024; i++) {
2158 // A[i+2] = A[i] + 1;
2159 // B[i+2] = B[i] + 1;
2160 // }
2161 // }
2162 //
2163 // This case is currently unsafe according to the max safe distance. If we
2164 // analyze the two accesses on array B, the max safe dependence distance
2165 // is 2. Then we analyze the accesses on array A, where the minimum distance
2166 // needed is 8, which exceeds 2, so vectorization is forbidden. But in fact
2167 // both A and B could be vectorized with a factor of 2.
2168 MinDepDistBytes =
2169 std::min(static_cast<uint64_t>(Distance), MinDepDistBytes);
2170
2171 bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
2172 uint64_t MinDepDistBytesOld = MinDepDistBytes;
2173 if (IsTrueDataDependence && EnableForwardingConflictDetection &&
2174 couldPreventStoreLoadForward(Distance, TypeByteSize)) {
2175 // Sanity check that we didn't update MinDepDistBytes when calling
2176 // couldPreventStoreLoadForward
2177 assert(MinDepDistBytes == MinDepDistBytesOld &&
2178 "An update to MinDepDistBytes requires an update to "
2179 "MaxSafeVectorWidthInBits");
2180 (void)MinDepDistBytesOld;
2181 return Dependence::BackwardVectorizableButPreventsForwarding;
2182 }
2183
2184 // An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
2185 // since there is a backwards dependency.
2186 uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * Stride);
2187 LLVM_DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue()
2188 << " with max VF = " << MaxVF << '\n');
2189 uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
2190 MaxSafeVectorWidthInBits = std::min(MaxSafeVectorWidthInBits, MaxVFInBits);
2191 return Dependence::BackwardVectorizable;
2192}
2193
2194bool MemoryDepChecker::areDepsSafe(
2195 DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
2196 const DenseMap<Value *, const SCEV *> &Strides,
2197 const DenseMap<Value *, SmallVector<const Value *, 16>>
2198 &UnderlyingObjects) {
2199
2200 MinDepDistBytes = -1;
2201 SmallPtrSet<MemAccessInfo, 8> Visited;
2202 for (MemAccessInfo CurAccess : CheckDeps) {
2203 if (Visited.count(CurAccess))
2204 continue;
2205
2206 // Get the relevant memory access set.
2207 EquivalenceClasses<MemAccessInfo>::iterator I =
2208 AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
2209
2210 // Check accesses within this set.
2211 EquivalenceClasses<MemAccessInfo>::member_iterator AI =
2212 AccessSets.member_begin(I);
2213 EquivalenceClasses<MemAccessInfo>::member_iterator AE =
2214 AccessSets.member_end();
2215
2216 // Check every access pair.
2217 while (AI != AE) {
2218 Visited.insert(*AI);
2219 bool AIIsWrite = AI->getInt();
2220 // Check loads only against the next equivalence class, but stores also
2221 // against other stores in the same equivalence class - to the same address.
2222 EquivalenceClasses<MemAccessInfo>::member_iterator OI =
2223 (AIIsWrite ? AI : std::next(AI));
2224 while (OI != AE) {
2225 // Check every accessing instruction pair in program order.
2226 for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
2227 I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
2228 // Scan all accesses of another equivalence class, but only the next
2229 // accesses of the same equivalence class.
2230 for (std::vector<unsigned>::iterator
2231 I2 = (OI == AI ? std::next(I1) : Accesses[*OI].begin()),
2232 I2E = (OI == AI ? I1E : Accesses[*OI].end());
2233 I2 != I2E; ++I2) {
2234 auto A = std::make_pair(&*AI, *I1);
2235 auto B = std::make_pair(&*OI, *I2);
2236
2237 assert(*I1 != *I2);
2238 if (*I1 > *I2)
2239 std::swap(A, B);
2240
2241 Dependence::DepType Type =
2242 isDependent(*A.first, A.second, *B.first, B.second, Strides,
2243 UnderlyingObjects);
2244 mergeInStatus(Dependence::isSafeForVectorization(Type));
2245
2246 // Gather dependences unless we accumulated MaxDependences
2247 // dependences. In that case return as soon as we find the first
2248 // unsafe dependence. This puts a limit on this quadratic
2249 // algorithm.
2250 if (RecordDependences) {
2251 if (Type != Dependence::NoDep)
2252 Dependences.push_back(Dependence(A.second, B.second, Type));
2253
2254 if (Dependences.size() >= MaxDependences) {
2255 RecordDependences = false;
2256 Dependences.clear();
2257 LLVM_DEBUG(dbgs()
2258 << "Too many dependences, stopped recording\n");
2259 }
2260 }
2261 if (!RecordDependences && !isSafeForVectorization())
2262 return false;
2263 }
2264 ++OI;
2265 }
2266 AI++;
2267 }
2268 }
2269
2270 LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
2271 return isSafeForVectorization();
2272}
2273
2274SmallVector<Instruction *, 4>
2275MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool isWrite) const {
2276 MemAccessInfo Access(Ptr, isWrite);
2277 auto &IndexVector = Accesses.find(Access)->second;
2278
2279 SmallVector<Instruction *, 4> Insts;
2280 transform(IndexVector,
2281 std::back_inserter(Insts),
2282 [&](unsigned Idx) { return this->InstMap[Idx]; });
2283 return Insts;
2284}
2285
2286const char *MemoryDepChecker::Dependence::DepName[] = {
2287 "NoDep",
2288 "Unknown",
2289 "IndirectUnsafe",
2290 "Forward",
2291 "ForwardButPreventsForwarding",
2292 "Backward",
2293 "BackwardVectorizable",
2294 "BackwardVectorizableButPreventsForwarding"};
2295
2296void MemoryDepChecker::Dependence::print(
2297 raw_ostream &OS, unsigned Depth,
2298 const SmallVectorImpl<Instruction *> &Instrs) const {
2299 OS.indent(Depth) << DepName[Type] << ":\n";
2300 OS.indent(Depth + 2) << *Instrs[Source] << " -> \n";
2301 OS.indent(Depth + 2) << *Instrs[Destination] << "\n";
2302}
2303
2304bool LoopAccessInfo::canAnalyzeLoop() {
2305 // We need to have a loop header.
2306 LLVM_DEBUG(dbgs() << "LAA: Found a loop in "
2307 << TheLoop->getHeader()->getParent()->getName() << ": "
2308 << TheLoop->getHeader()->getName() << '\n');
2309
2310 // We can only analyze innermost loops.
2311 if (!TheLoop->isInnermost()) {
2312 LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
2313 recordAnalysis("NotInnerMostLoop") << "loop is not the innermost loop";
2314 return false;
2315 }
2316
2317 // We must have a single backedge.
2318 if (TheLoop->getNumBackEdges() != 1) {
2319 LLVM_DEBUG(
2320 dbgs() << "LAA: loop control flow is not understood by analyzer\n");
2321 recordAnalysis("CFGNotUnderstood")
2322 << "loop control flow is not understood by analyzer";
2323 return false;
2324 }
2325
2326 // ScalarEvolution needs to be able to find the exit count.
2327 const SCEV *ExitCount = PSE->getBackedgeTakenCount();
2328 if (isa<SCEVCouldNotCompute>(ExitCount)) {
2329 recordAnalysis("CantComputeNumberOfIterations")
2330 << "could not determine number of loop iterations";
2331 LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
2332 return false;
2333 }
2334
2335 return true;
2336}
2337
2338void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
2339 const TargetLibraryInfo *TLI,
2340 DominatorTree *DT) {
2341 // Holds the Load and Store instructions.
2342 SmallVector<LoadInst *, 16> Loads;
2343 SmallVector<StoreInst *, 16> Stores;
2344 SmallPtrSet<MDNode *, 8> LoopAliasScopes;
2345
2346 // Holds all the different accesses in the loop.
2347 unsigned NumReads = 0;
2348 unsigned NumReadWrites = 0;
2349
2350 bool HasComplexMemInst = false;
2351
2352 // A runtime check is only legal to insert if there are no convergent calls.
2353 HasConvergentOp = false;
2354
2355 PtrRtChecking->Pointers.clear();
2356 PtrRtChecking->Need = false;
2357
2358 const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
2359
2360 const bool EnableMemAccessVersioningOfLoop =
2361 EnableMemAccessVersioning &&
2362 !TheLoop->getHeader()->getParent()->hasOptSize();
2363
2364 // Traverse blocks in fixed RPOT order, regardless of their storage in the
2365 // loop info, as it may be arbitrary.
2366 LoopBlocksRPO RPOT(TheLoop);
2367 RPOT.perform(LI);
2368 for (BasicBlock *BB : RPOT) {
2369 // Scan the BB and collect legal loads and stores. Also detect any
2370 // convergent instructions.
2371 for (Instruction &I : *BB) {
2372 if (auto *Call = dyn_cast<CallBase>(&I)) {
2373 if (Call->isConvergent())
2374 HasConvergentOp = true;
2375 }
2376
2377 // With both a non-vectorizable memory instruction and a convergent
2378 // operation found in this loop, there is no reason to continue the search.
2379 if (HasComplexMemInst && HasConvergentOp) {
2380 CanVecMem = false;
2381 return;
2382 }
2383
2384 // Avoid hitting recordAnalysis multiple times.
2385 if (HasComplexMemInst)
2386 continue;
2387
2388 // Record alias scopes defined inside the loop.
2389 if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
2390 for (Metadata *Op : Decl->getScopeList()->operands())
2391 LoopAliasScopes.insert(cast<MDNode>(Op));
2392
2393 // Many math library functions read the rounding mode. We will only
2394 // vectorize a loop if it contains known function calls that don't set
2395 // the flag. Therefore, it is safe to ignore this read from memory.
2396 auto *Call = dyn_cast<CallInst>(&I);
2397 if (Call && getVectorIntrinsicIDForCall(Call, TLI))
2398 continue;
2399
2400 // If this is a load, save it. If this instruction can read from memory
2401 // but is not a load, then we quit. Notice that we don't handle function
2402 // calls that read or write.
2403 if (I.mayReadFromMemory()) {
2404 // If the function has an explicit vectorized counterpart, we can safely
2405 // assume that it can be vectorized.
2406 if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
2407 !VFDatabase::getMappings(*Call).empty())
2408 continue;
2409
2410 auto *Ld = dyn_cast<LoadInst>(&I);
2411 if (!Ld) {
2412 recordAnalysis("CantVectorizeInstruction", Ld)
2413 << "instruction cannot be vectorized";
2414 HasComplexMemInst = true;
2415 continue;
2416 }
2417 if (!Ld->isSimple() && !IsAnnotatedParallel) {
2418 recordAnalysis("NonSimpleLoad", Ld)
2419 << "read with atomic ordering or volatile read";
2420 LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
2421 HasComplexMemInst = true;
2422 continue;
2423 }
2424 NumLoads++;
2425 Loads.push_back(Ld);
2426 DepChecker->addAccess(Ld);
2427 if (EnableMemAccessVersioningOfLoop)
2428 collectStridedAccess(Ld);
2429 continue;
2430 }
2431
2432 // Save 'store' instructions. Abort if other instructions write to memory.
2433 if (I.mayWriteToMemory()) {
2434 auto *St = dyn_cast<StoreInst>(&I);
2435 if (!St) {
2436 recordAnalysis("CantVectorizeInstruction", St)
2437 << "instruction cannot be vectorized";
2438 HasComplexMemInst = true;
2439 continue;
2440 }
2441 if (!St->isSimple() && !IsAnnotatedParallel) {
2442 recordAnalysis("NonSimpleStore", St)
2443 << "write with atomic ordering or volatile write";
2444 LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
2445 HasComplexMemInst = true;
2446 continue;
2447 }
2448 NumStores++;
2449 Stores.push_back(St);
2450 DepChecker->addAccess(St);
2451 if (EnableMemAccessVersioningOfLoop)
2452 collectStridedAccess(St);
2453 }
2454 } // Next instr.
2455 } // Next block.
2456
2457 if (HasComplexMemInst) {
2458 CanVecMem = false;
2459 return;
2460 }
2461
2462 // Now we have two lists that hold the loads and the stores.
2463 // Next, we find the pointers that they use.
2464
2465 // Check if we see any stores. If there are no stores, then we don't
2466 // care if the pointers are *restrict*.
2467 if (!Stores.size()) {
2468 LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
2469 CanVecMem = true;
2470 return;
2471 }
2472
2473 MemoryDepChecker::DepCandidates DependentAccesses;
2474 AccessAnalysis Accesses(TheLoop, AA, LI, DependentAccesses, *PSE,
2475 LoopAliasScopes);
2476
2477 // Holds the analyzed pointers. We don't want to call getUnderlyingObjects
2478 // multiple times on the same object. If the ptr is accessed twice, once
2479 // for read and once for write, it will only appear once (on the write
2480 // list). This is okay, since we are going to check for conflicts between
2481 // writes and between reads and writes, but not between reads and reads.
2482 SmallSet<std::pair<Value *, Type *>, 16> Seen;
2483
2484 // Record uniform store addresses to identify if we have multiple stores
2485 // to the same address.
2486 SmallPtrSet<Value *, 16> UniformStores;
2487
2488 for (StoreInst *ST : Stores) {
2489 Value *Ptr = ST->getPointerOperand();
2490
2491 if (isInvariant(Ptr)) {
2492 // Record store instructions to loop invariant addresses
2493 StoresToInvariantAddresses.push_back(ST);
2494 HasDependenceInvolvingLoopInvariantAddress |=
2495 !UniformStores.insert(Ptr).second;
2496 }
2497
2498 // If we did *not* see this pointer before, insert it to the read-write
2499 // list. At this phase it is only a 'write' list.
2500 Type *AccessTy = getLoadStoreType(ST);
2501 if (Seen.insert({Ptr, AccessTy}).second) {
2502 ++NumReadWrites;
2503
2504 MemoryLocation Loc = MemoryLocation::get(ST);
2505 // The TBAA metadata could have a control dependency on the predication
2506 // condition, so we cannot rely on it when determining whether or not we
2507 // need runtime pointer checks.
2508 if (blockNeedsPredication(ST->getParent(), TheLoop, DT))
2509 Loc.AATags.TBAA = nullptr;
2510
2511 visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
2512 [&Accesses, AccessTy, Loc](Value *Ptr) {
2513 MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
2514 Accesses.addStore(NewLoc, AccessTy);
2515 });
2516 }
2517 }
2518
2519 if (IsAnnotatedParallel) {
2520 LLVM_DEBUG(
2521 dbgs() << "LAA: A loop annotated parallel, ignore memory dependency "
2522 << "checks.\n");
2523 CanVecMem = true;
2524 return;
2525 }
2526
2527 for (LoadInst *LD : Loads) {
2528 Value *Ptr = LD->getPointerOperand();
2529 // If we did *not* see this pointer before, insert it to the
2530 // read list. If we *did* see it before, then it is already in
2531 // the read-write list. This allows us to vectorize expressions
2532 // such as A[i] += x; Because the address of A[i] is a read-write
2533 // pointer. This only works if the index of A[i] is consecutive.
2534 // If the address of i is unknown (for example A[B[i]]) then we may
2535 // read a few words, modify, and write a few words, and some of the
2536 // words may be written to the same address.
2537 bool IsReadOnlyPtr = false;
2538 Type *AccessTy = getLoadStoreType(LD);
2539 if (Seen.insert({Ptr, AccessTy}).second ||
2540 !getPtrStride(*PSE, LD->getType(), Ptr, TheLoop, SymbolicStrides).value_or(0)) {
2541 ++NumReads;
2542 IsReadOnlyPtr = true;
2543 }
2544
2545 // See if there is an unsafe dependency between a load from a uniform
2546 // address and a store to the same uniform address.
2547 if (UniformStores.count(Ptr)) {
2548 LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
2549 "load and uniform store to the same address!\n");
2550 HasDependenceInvolvingLoopInvariantAddress = true;
2551 }
2552
2553 MemoryLocation Loc = MemoryLocation::get(LD);
2554 // The TBAA metadata could have a control dependency on the predication
2555 // condition, so we cannot rely on it when determining whether or not we
2556 // need runtime pointer checks.
2557 if (blockNeedsPredication(LD->getParent(), TheLoop, DT))
2558 Loc.AATags.TBAA = nullptr;
2559
2560 visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
2561 [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) {
2562 MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
2563 Accesses.addLoad(NewLoc, AccessTy, IsReadOnlyPtr);
2564 });
2565 }
2566
2567 // If we write (or read-write) to a single destination and there are no
2568 // other reads in this loop, then it is safe to vectorize.
2569 if (NumReadWrites == 1 && NumReads == 0) {
2570 LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
2571 CanVecMem = true;
2572 return;
2573 }
2574
2575 // Build dependence sets and check whether we need a runtime pointer bounds
2576 // check.
2577 Accesses.buildDependenceSets();
2578
2579 // Find pointers with computable bounds. We are going to use this information
2580 // to place a runtime bound check.
2581 Value *UncomputablePtr = nullptr;
2582 bool CanDoRTIfNeeded =
2583 Accesses.canCheckPtrAtRT(*PtrRtChecking, PSE->getSE(), TheLoop,
2584 SymbolicStrides, UncomputablePtr, false);
2585 if (!CanDoRTIfNeeded) {
2586 auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
2587 recordAnalysis("CantIdentifyArrayBounds", I)
2588 << "cannot identify array bounds";
2589 LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
2590 << "the array bounds.\n");
2591 CanVecMem = false;
2592 return;
2593 }
2594
2595 LLVM_DEBUG(
2596 dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");
2597
2598 CanVecMem = true;
2599 if (Accesses.isDependencyCheckNeeded()) {
2600 LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
2601 CanVecMem = DepChecker->areDepsSafe(
2602 DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides,
2603 Accesses.getUnderlyingObjects());
2604
2605 if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) {
2606 LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
2607
2608 // Clear the dependency checks. We assume they are not needed.
2609 Accesses.resetDepChecks(*DepChecker);
2610
2611 PtrRtChecking->reset();
2612 PtrRtChecking->Need = true;
2613
2614 auto *SE = PSE->getSE();
2615 UncomputablePtr = nullptr;
2616 CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(
2617 *PtrRtChecking, SE, TheLoop, SymbolicStrides, UncomputablePtr, true);
2618
2619 // Check that we found the bounds for the pointer.
2620 if (!CanDoRTIfNeeded) {
2621 auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
2622 recordAnalysis("CantCheckMemDepsAtRunTime", I)
2623 << "cannot check memory dependencies at runtime";
2624 LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
2625 CanVecMem = false;
2626 return;
2627 }
2628
2629 CanVecMem = true;
2630 }
2631 }
2632
2633 if (HasConvergentOp) {
2634 recordAnalysis("CantInsertRuntimeCheckWithConvergent")
2635 << "cannot add control dependency to convergent operation";
2636 LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
2637 "would be needed with a convergent operation\n");
2638 CanVecMem = false;
2639 return;
2640 }
2641
2642 if (CanVecMem)
2643 LLVM_DEBUG(
2644 dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
2645 << (PtrRtChecking->Need ? "" : " don't")
2646 << " need runtime memory checks.\n");
2647 else
2648 emitUnsafeDependenceRemark();
2649}
2650
2651void LoopAccessInfo::emitUnsafeDependenceRemark() {
2652 auto Deps = getDepChecker().getDependences();
2653 if (!Deps)
2654 return;
2655 auto Found = llvm::find_if(*Deps, [](const MemoryDepChecker::Dependence &D) {
2656 return MemoryDepChecker::Dependence::isSafeForVectorization(D.Type) !=
2657 MemoryDepChecker::VectorizationSafetyStatus::Safe;
2658 });
2659 if (Found == Deps->end())
2660 return;
2661 MemoryDepChecker::Dependence Dep = *Found;
2662
2663 LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");
2664
2665 // Emit remark for first unsafe dependence
2666 bool HasForcedDistribution = false;
2667 std::optional<const MDOperand *> Value =
2668 findStringMetadataForLoop(TheLoop, "llvm.loop.distribute.enable");
2669 if (Value) {
2670 const MDOperand *Op = *Value;
2671 assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
2672 HasForcedDistribution = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
2673 }
2674
2675 const std::string Info =
2676 HasForcedDistribution
2677 ? "unsafe dependent memory operations in loop."
2678 : "unsafe dependent memory operations in loop. Use "
2679 "#pragma clang loop distribute(enable) to allow loop distribution "
2680 "to attempt to isolate the offending operations into a separate "
2681 "loop";
2682 OptimizationRemarkAnalysis &R =
2683 recordAnalysis("UnsafeDep", Dep.getDestination(*this)) << Info;
2684
2685 switch (Dep.Type) {
2686 case MemoryDepChecker::Dependence::NoDep:
2687 case MemoryDepChecker::Dependence::Forward:
2688 case MemoryDepChecker::Dependence::BackwardVectorizable:
2689 llvm_unreachable("Unexpected dependence");
2690 case MemoryDepChecker::Dependence::Backward:
2691 R << "\nBackward loop carried data dependence.";
2692 break;
2693 case MemoryDepChecker::Dependence::ForwardButPreventsForwarding:
2694 R << "\nForward loop carried data dependence that prevents "
2695 "store-to-load forwarding.";
2696 break;
2697 case MemoryDepChecker::Dependence::BackwardVectorizableButPreventsForwarding:
2698 R << "\nBackward loop carried data dependence that prevents "
2699 "store-to-load forwarding.";
2700 break;
2701 case MemoryDepChecker::Dependence::IndirectUnsafe:
2702 R << "\nUnsafe indirect dependence.";
2703 break;
2704 case MemoryDepChecker::Dependence::Unknown:
2705 R << "\nUnknown data dependence.";
2706 break;
2707 }
2708
2709 if (Instruction *I = Dep.getSource(*this)) {
2710 DebugLoc SourceLoc = I->getDebugLoc();
2711 if (auto *DD = dyn_cast_or_null<Instruction>(getPointerOperand(I)))
2712 SourceLoc = DD->getDebugLoc();
2713 if (SourceLoc)
2714 R << " Memory location is the same as accessed at "
2715 << ore::NV("Location", SourceLoc);
2716 }
2717}
2718
2719bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop,
2720 DominatorTree *DT) {
2721 assert(TheLoop->contains(BB) && "Unknown block used");
2722
2723 // Blocks that do not dominate the latch need predication.
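// For example, the body of an if nested inside the loop does not dominate
// the latch, so its memory accesses only execute on some iterations.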
2724 BasicBlock* Latch = TheLoop->getLoopLatch();
2725 return !DT->dominates(BB, Latch);
2726}
2727
2728OptimizationRemarkAnalysis &LoopAccessInfo::recordAnalysis(StringRef RemarkName,
2729 Instruction *I) {
2730 assert(!Report && "Multiple reports generated");
2731
2732 Value *CodeRegion = TheLoop->getHeader();
2733 DebugLoc DL = TheLoop->getStartLoc();
2734
2735 if (I) {
2736 CodeRegion = I->getParent();
2737 // If there is no debug location attached to the instruction, fall back to
2738 // using the loop's.
2739 if (I->getDebugLoc())
2740 DL = I->getDebugLoc();
2741 }
2742
2743 Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, RemarkName, DL,
2744 CodeRegion);
2745 return *Report;
2746}
2747
2748bool LoopAccessInfo::isInvariant(Value *V) const {
2749 auto *SE = PSE->getSE();
2750 // TODO: Is this really what we want? Even without FP SCEV, we may want some
2751 // trivially loop-invariant FP values to be considered invariant.
2752 if (!SE->isSCEVable(V->getType()))
2753 return false;
2754 const SCEV *S = SE->getSCEV(V);
2755 return SE->isLoopInvariant(S, TheLoop);
2756}
2757
2758/// Find the operand of the GEP that should be checked for consecutive
2759/// stores. This ignores trailing indices that have no effect on the final
2760/// pointer.
2761static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) {
2762 const DataLayout &DL = Gep->getModule()->getDataLayout();
2763 unsigned LastOperand = Gep->getNumOperands() - 1;
2764 TypeSize GEPAllocSize = DL.getTypeAllocSize(Gep->getResultElementType());
2765
2766 // Walk backwards and try to peel off zeros.
2767 while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
2768 // Find the type we're currently indexing into.
2769 gep_type_iterator GEPTI = gep_type_begin(Gep);
2770 std::advance(GEPTI, LastOperand - 2);
2771
2772 // If it's a type with the same allocation size as the result of the GEP we
2773 // can peel off the zero index.
2774 TypeSize ElemSize = GEPTI.isStruct()
2775 ? DL.getTypeAllocSize(GEPTI.getIndexedType())
2776 : GEPTI.getSequentialElementStride(DL);
2777 if (ElemSize != GEPAllocSize)
2778 break;
2779 --LastOperand;
2780 }
2781
2782 return LastOperand;
2783}
2784
2785/// If the argument is a GEP, then returns the operand identified by
2786/// getGEPInductionOperand. However, if there is some other non-loop-invariant
2787/// operand, it returns that instead.
2788static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
2789 GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
2790 if (!GEP)
2791 return Ptr;
2792
2793 unsigned InductionOperand = getGEPInductionOperand(GEP);
2794
2795 // Check that all of the gep indices are uniform except for our induction
2796 // operand.
2797 for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
2798 if (i != InductionOperand &&
2799 !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
2800 return Ptr;
2801 return GEP->getOperand(InductionOperand);
2802}
2803
2804/// If a value has only one user that is a CastInst, return it.
2805static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
2806 Value *UniqueCast = nullptr;
2807 for (User *U : Ptr->users()) {
2808 CastInst *CI = dyn_cast<CastInst>(U);
2809 if (CI && CI->getType() == Ty) {
2810 if (!UniqueCast)
2811 UniqueCast = CI;
2812 else
2813 return nullptr;
2814 }
2815 }
2816 return UniqueCast;
2817}
2818
2819/// Get the stride of a pointer access in a loop. Looks for symbolic
2820/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
2821static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
2822 auto *PtrTy = dyn_cast<PointerType>(Ptr->getType());
2823 if (!PtrTy || PtrTy->isAggregateType())
2824 return nullptr;
2825
2826 // Try to remove a gep instruction to make the pointer (actually the index at
2827 // this point) easier to analyze. If OrigPtr is equal to Ptr we are analyzing
2828 // the pointer, otherwise we are analyzing the index.
2829 Value *OrigPtr = Ptr;
2830
2831 // The size of the pointer access.
2832 int64_t PtrAccessSize = 1;
2833
2834 Ptr = stripGetElementPtr(Ptr, SE, Lp);
2835 const SCEV *V = SE->getSCEV(Ptr);
2836
2837 if (Ptr != OrigPtr)
2838 // Strip off casts.
2839 while (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(V))
2840 V = C->getOperand();
2841
2842 const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);
2843 if (!S)
2844 return nullptr;
2845
2846 // If the pointer is invariant then there is no stride and it makes no
2847 // sense to add it here.
2848 if (Lp != S->getLoop())
2849 return nullptr;
2850
2851 V = S->getStepRecurrence(*SE);
2852 if (!V)
2853 return nullptr;
2854
2855 // Strip off the size of access multiplication if we are still analyzing the
2856 // pointer.
2857 if (OrigPtr == Ptr) {
2858 if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
2859 if (M->getOperand(0)->getSCEVType() != scConstant)
2860 return nullptr;
2861
2862 const APInt &APStepVal = cast<SCEVConstant>(M->getOperand(0))->getAPInt();
2863
2864 // Huge step value - give up.
2865 if (APStepVal.getBitWidth() > 64)
2866 return nullptr;
2867
2868 int64_t StepVal = APStepVal.getSExtValue();
2869 if (PtrAccessSize != StepVal)
2870 return nullptr;
2871 V = M->getOperand(1);
2872 }
2873 }
2874
2875 // Note that the restrictions after this loop-invariant check are only
2876 // profitability restrictions.
2877 if (!SE->isLoopInvariant(V, Lp))
2878 return nullptr;
2879
2880 // Look for the loop invariant symbolic value.
2881 const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);
2882 if (!U) {
2883 const auto *C = dyn_cast<SCEVIntegralCastExpr>(V);
2884 if (!C)
2885 return nullptr;
2886 U = dyn_cast<SCEVUnknown>(C->getOperand());
2887 if (!U)
2888 return nullptr;
2889
2890 // Match legacy behavior - this is not needed for correctness
2891 if (!getUniqueCastUse(U->getValue(), Lp, V->getType()))
2892 return nullptr;
2893 }
2894
2895 return V;
2896}
2897
2898void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
2899 Value *Ptr = getLoadStorePointerOperand(MemAccess);
2900 if (!Ptr)
2901 return;
2902
2903 // Note: getStrideFromPointer is a *profitability* heuristic. We
2904 // could broaden the scope of values returned here - to anything
2905 // which happens to be loop invariant and contributes to the
2906 // computation of an interesting IV - but we chose not to as we
2907 // don't have a cost model here, and broadening the scope exposes
2908 // far too many unprofitable cases.
2909 const SCEV *StrideExpr = getStrideFromPointer(Ptr, PSE->getSE(), TheLoop);
2910 if (!StrideExpr)
2911 return;
2912
2913 LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
2914 "versioning:");
2915 LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n");
2916
2917 if (!SpeculateUnitStride) {
2918 LLVM_DEBUG(dbgs() << " Chose not to due to -laa-speculate-unit-stride\n");
2919 return;
2920 }
2921
2922 // Avoid adding the "Stride == 1" predicate when we know that
2923 // Stride >= Trip-Count. Such a predicate will effectively optimize a single
2924 // or zero iteration loop, as Trip-Count <= Stride == 1.
2925 //
2926 // TODO: We are currently not making a very informed decision on when it is
2927 // beneficial to apply stride versioning. It might make more sense that the
2928 // users of this analysis (such as the vectorizer) will trigger it, based on
2929 // their specific cost considerations; For example, in cases where stride
2930 // versioning does not help resolving memory accesses/dependences, the
2931 // vectorizer should evaluate the cost of the runtime test, and the benefit
2932 // of various possible stride specializations, considering the alternatives
2933 // of using gather/scatters (if available).
2934
2935 const SCEV *BETakenCount = PSE->getBackedgeTakenCount();
2936
2937 // Match the types so we can compare the stride and the BETakenCount.
2938 // The Stride can be positive/negative, so we sign extend Stride;
2939 // The backedgeTakenCount is non-negative, so we zero extend BETakenCount.
2940 const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
2941 uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(StrideExpr->getType());
2942 uint64_t BETypeSizeBits = DL.getTypeSizeInBits(BETakenCount->getType());
2943 const SCEV *CastedStride = StrideExpr;
2944 const SCEV *CastedBECount = BETakenCount;
2945 ScalarEvolution *SE = PSE->getSE();
2946 if (BETypeSizeBits >= StrideTypeSizeBits)
2947 CastedStride = SE->getNoopOrSignExtend(StrideExpr, BETakenCount->getType());
2948 else
2949 CastedBECount = SE->getZeroExtendExpr(BETakenCount, StrideExpr->getType());
2950 const SCEV *StrideMinusBETaken = SE->getMinusSCEV(CastedStride, CastedBECount);
2951 // Since TripCount == BackEdgeTakenCount + 1, checking:
2952 // "Stride >= TripCount" is equivalent to checking:
2953 // Stride - BETakenCount > 0
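// For example, with a backedge-taken count of 7 (trip count 8), a symbolic
// stride known to be 8 or larger would make the versioned Stride == 1 loop
// execute at most one iteration, so versioning it is pointless.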
2954 if (SE->isKnownPositive(StrideMinusBETaken)) {
2955 LLVM_DEBUG(
2956 dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
2957 "Stride==1 predicate will imply that the loop executes "
2958 "at most once.\n");
2959 return;
2960 }
2961 LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n");
2962
2963 // Strip back off the integer cast, and check that our result is a
2964 // SCEVUnknown as we expect.
2965 const SCEV *StrideBase = StrideExpr;
2966 if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(StrideBase))
2967 StrideBase = C->getOperand();
2968 SymbolicStrides[Ptr] = cast<SCEVUnknown>(StrideBase);
2969}
2970
2971LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
2972 const TargetLibraryInfo *TLI, AAResults *AA,
2973 DominatorTree *DT, LoopInfo *LI)
2974 : PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
2975 PtrRtChecking(nullptr),
2976 DepChecker(std::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L) {
2977 PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
2978 if (canAnalyzeLoop()) {
2979 analyzeLoop(AA, LI, TLI, DT);
2980 }
2981}
2982
2983void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
2984 if (CanVecMem) {
2985 OS.indent(Depth) << "Memory dependences are safe";
2986 const MemoryDepChecker &DC = getDepChecker();
2987 if (!DC.isSafeForAnyVectorWidth())
2988 OS << " with a maximum safe vector width of "
2989 << DC.getMaxSafeVectorWidthInBits() << " bits";
2990 if (PtrRtChecking->Need)
2991 OS << " with run-time checks";
2992 OS << "\n";
2993 }
2994
2995 if (HasConvergentOp)
2996 OS.indent(Depth) << "Has convergent operation in loop\n";
2997
2998 if (Report)
2999 OS.indent(Depth) << "Report: " << Report->getMsg() << "\n";
3000
3001 if (auto *Dependences = DepChecker->getDependences()) {
3002 OS.indent(Depth) << "Dependences:\n";
3003 for (const auto &Dep : *Dependences) {
3004 Dep.print(OS, Depth + 2, DepChecker->getMemoryInstructions());
3005 OS << "\n";
3006 }
3007 } else
3008 OS.indent(Depth) << "Too many dependences, not recorded\n";
3009
3010 // List the pairs of accesses that need run-time checks to prove independence.
3011 PtrRtChecking->print(OS, Depth);
3012 OS << "\n";
3013
3014 OS.indent(Depth) << "Non vectorizable stores to invariant address were "
3015 << (HasDependenceInvolvingLoopInvariantAddress ? "" : "not ")
3016 << "found in loop.\n";
3017
3018 OS.indent(Depth) << "SCEV assumptions:\n";
3019 PSE->getPredicate().print(OS, Depth);
3020
3021 OS << "\n";
3022
3023 OS.indent(Depth) << "Expressions re-written:\n";
3024 PSE->print(OS, Depth);
3025}
3026
3027const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) {
3028 auto I = LoopAccessInfoMap.insert({&L, nullptr});
3029
3030 if (I.second)
3031 I.first->second =
3032 std::make_unique<LoopAccessInfo>(&L, &SE, TLI, &AA, &DT, &LI);
3033
3034 return *I.first->second;
3035}
3036
3037bool LoopAccessInfoManager::invalidate(
3038 Function &F, const PreservedAnalyses &PA,
3039 FunctionAnalysisManager::Invalidator &Inv) {
3040 // Check whether our analysis is preserved.
3041 auto PAC = PA.getChecker<LoopAccessAnalysis>();
3042 if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Function>>())
3043 // If not, give up now.
3044 return true;
3045
3046 // Check whether the analyses we depend on became invalid for any reason.
3047 // Skip checking TargetLibraryAnalysis as it is immutable and can't become
3048 // invalid.
3049 return Inv.invalidate<AAManager>(F, PA) ||
3050 Inv.invalidate<ScalarEvolutionAnalysis>(F, PA) ||
3051 Inv.invalidate<LoopAnalysis>(F, PA) ||
3052 Inv.invalidate<DominatorTreeAnalysis>(F, PA);
3053}
3054
3055LoopAccessInfoManager LoopAccessAnalysis::run(Function &F,
3056 FunctionAnalysisManager &FAM) {
3057 auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(F);
3058 auto &AA = FAM.getResult<AAManager>(F);
3059 auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
3060 auto &LI = FAM.getResult<LoopAnalysis>(F);
3061 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
3062 return LoopAccessInfoManager(SE, AA, DT, LI, &TLI);
3063}
3064
3065AnalysisKey LoopAccessAnalysis::Key;
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file implements a class to represent arbitrary precision integral constant values and operations...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
Generic implementation of equivalence classes through the use Tarjan's efficient union-find algorithm...
#define Check(C,...)
#define DEBUG_TYPE
Hexagon Common GEP
IRTranslator LLVM IR MI
static cl::opt< unsigned > MaxDependences("max-dependences", cl::Hidden, cl::desc("Maximum number of dependences collected by " "loop-access analysis (default = 100)"), cl::init(100))
We collect dependences up to this threshold.
static cl::opt< bool > EnableForwardingConflictDetection("store-to-load-forwarding-conflict-detection", cl::Hidden, cl::desc("Enable conflict detection in loop-access analysis"), cl::init(true))
Enable store-to-load forwarding conflict detection.
static void findForkedSCEVs(ScalarEvolution *SE, const Loop *L, Value *Ptr, SmallVectorImpl< PointerIntPair< const SCEV *, 1, bool > > &ScevList, unsigned Depth)
static std::variant< MemoryDepChecker::Dependence::DepType, DepDistanceStrideAndSizeInfo > getDependenceDistanceStrideAndSize(const AccessAnalysis::MemAccessInfo &A, Instruction *AInst, const AccessAnalysis::MemAccessInfo &B, Instruction *BInst, const DenseMap< Value *, const SCEV * > &Strides, const DenseMap< Value *, SmallVector< const Value *, 16 > > &UnderlyingObjects, PredicatedScalarEvolution &PSE, const Loop *InnermostLoop)
static bool hasComputableBounds(PredicatedScalarEvolution &PSE, Value *Ptr, const SCEV *PtrScev, Loop *L, bool Assume)
Check whether a pointer can participate in a runtime bounds check.
static cl::opt< unsigned > MemoryCheckMergeThreshold("memory-check-merge-threshold", cl::Hidden, cl::desc("Maximum number of comparisons done when trying to merge " "runtime memory checks. (default = 100)"), cl::init(100))
The maximum iterations used to merge memory checks.
static bool isNoWrap(PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &Strides, Value *Ptr, Type *AccessTy, Loop *L)
Check whether a pointer address cannot wrap.
static const SCEV * getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp)
Get the stride of a pointer access in a loop.
static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep)
Find the operand of the GEP that should be checked for consecutive stores.
static cl::opt< unsigned, true > VectorizationInterleave("force-vector-interleave", cl::Hidden, cl::desc("Sets the vectorization interleave count. " "Zero is autoselect."), cl::location(VectorizerParams::VectorizationInterleave))
static bool isLoopVariantIndirectAddress(ArrayRef< const Value * > UnderlyingObjects, ScalarEvolution &SE, const Loop *L)
Returns true if any of the underlying objects has a loop varying address, i.e.
static Value * getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty)
If a value has only one user that is a CastInst, return it.
static cl::opt< bool, true > HoistRuntimeChecks("hoist-runtime-checks", cl::Hidden, cl::desc("Hoist inner loop runtime memory checks to outer loop if possible"), cl::location(VectorizerParams::HoistRuntimeChecks), cl::init(true))
static cl::opt< unsigned, true > VectorizationFactor("force-vector-width", cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect."), cl::location(VectorizerParams::VectorizationFactor))
static cl::opt< unsigned, true > RuntimeMemoryCheckThreshold("runtime-memory-check-threshold", cl::Hidden, cl::desc("When performing memory disambiguation checks at runtime do not " "generate more than this number of comparisons (default = 8)."), cl::location(VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(8))
static void visitPointers(Value *StartPtr, const Loop &InnermostLoop, function_ref< void(Value *)> AddPointer)
static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR, PredicatedScalarEvolution &PSE, const Loop *L)
Return true if an AddRec pointer Ptr is unsigned non-wrapping, i.e.
static Value * stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp)
If the argument is a GEP, then returns the operand identified by getGEPInductionOperand.
static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE, const SCEV &BackedgeTakenCount, const SCEV &Dist, uint64_t Stride, uint64_t TypeByteSize)
Given a dependence-distance Dist between two memory accesses, that have the same stride whose absolut...
static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride, uint64_t TypeByteSize)
Check the dependence for two accesses with the same stride Stride.
static const SCEV * getMinFromExprs(const SCEV *I, const SCEV *J, ScalarEvolution *SE)
Compare I and J and return the minimum.
static cl::opt< unsigned > MaxForkedSCEVDepth("max-forked-scev-depth", cl::Hidden, cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"), cl::init(5))
static cl::opt< bool > SpeculateUnitStride("laa-speculate-unit-stride", cl::Hidden, cl::desc("Speculate that non-constant strides are unit in LAA"), cl::init(true))
static SmallVector< PointerIntPair< const SCEV *, 1, bool > > findForkedPointer(PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap, Value *Ptr, const Loop *L)
static cl::opt< bool > EnableMemAccessVersioning("enable-mem-access-versioning", cl::init(true), cl::Hidden, cl::desc("Enable symbolic stride memory access versioning"))
This enables versioning on the strides of symbolically striding memory accesses in code like the foll...
This header provides classes for managing per-loop analyses.
This file provides utility analysis objects describing memory locations.
This header defines various interfaces for pass management in LLVM.
This file defines the PointerIntPair class.
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:76
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1491
APInt abs() const
Get the absolute value.
Definition: APInt.h:1737
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:307
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
bool isStrictlyPositive() const
Determine if this APInt Value is positive.
Definition: APInt.h:334
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
This templated class represents "all analyses that operate over <a particular IR unit>" (e....
Definition: Analysis.h:47
API to communicate dependencies between analyses during invalidation.
Definition: PassManager.h:360
bool invalidate(IRUnitT &IR, const PreservedAnalyses &PA)
Trigger the invalidation of some other analysis pass if not already handled and return whether it was...
Definition: PassManager.h:378
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:321
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:473
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:204
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:289
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:601
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
A debug info location.
Definition: DebugLoc.h:33
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
iterator end()
Definition: DenseMap.h:84
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
EquivalenceClasses - This represents a collection of equivalence classes and supports three efficient...
iterator findValue(const ElemTy &V) const
findValue - Return an iterator to the specified value.
iterator insert(const ElemTy &Data)
insert - Insert a new value into the union/find set, ignoring the request if the value already exists...
member_iterator member_end() const
typename std::set< ECValue, ECValueComparator >::const_iterator iterator
iterator* - Provides a way to iterate over all values in the set.
member_iterator member_begin(iterator I) const
member_iterator unionSets(const ElemTy &V1, const ElemTy &V2)
union - Merge the two equivalence sets for the specified values, inserting them if they do not alread...
const ElemTy & getLeaderValue(const ElemTy &V) const
getLeaderValue - Return the leader for the specified value that is in the set.
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:681
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:973
Type * getResultElementType() const
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:294
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:83
Class to represent integer types.
Definition: DerivedTypes.h:40
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
An instruction for reading from memory.
Definition: Instructions.h:184
Value * getPointerOperand()
Definition: Instructions.h:280
static constexpr LocationSize beforeOrAfterPointer()
Any location before or after the base pointer (but still within the underlying object).
This analysis provides dependence information for the memory accesses of a loop.
Result run(Function &F, FunctionAnalysisManager &AM)
bool invalidate(Function &F, const PreservedAnalyses &PA, FunctionAnalysisManager::Invalidator &Inv)
const LoopAccessInfo & getInfo(Loop &L)
Drive the analysis of memory accesses in the loop.
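A minimal usage sketch (not part of this file) of driving the analysis from a new-pass-manager client; the pass name LAAConsumerPass and the surrounding wiring are hypothetical assumptions, and only the accessors documented on this page are relied on.
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
// Hypothetical consumer pass; the LoopAccessAnalysis queries are the point
// of this sketch, not the pass boilerplate.
struct LAAConsumerPass : PassInfoMixin<LAAConsumerPass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
    LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
    LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
    for (Loop *L : LI.getLoopsInPreorder()) {
      if (!L->isInnermost())
        continue; // the analysis targets innermost loops
      // getInfo drives the analysis for this loop and caches the result.
      const LoopAccessInfo &LAI = LAIs.getInfo(*L);
      if (LAI.getDepChecker().isSafeForVectorization())
        LAI.print(dbgs()); // dump the dependences and runtime checks found
    }
    return PreservedAnalyses::all();
  }
};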
const MemoryDepChecker & getDepChecker() const
Return the Memory Dependence Checker, which can determine the loop-independent and loop-carried dependences b...
bool isInvariant(Value *V) const
Returns true if value V is loop invariant.
void print(raw_ostream &OS, unsigned Depth=0) const
Print the information about the memory accesses in the loop.
LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetLibraryInfo *TLI, AAResults *AA, DominatorTree *DT, LoopInfo *LI)
static bool blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, DominatorTree *DT)
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
BlockT * getHeader() const
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:172
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
bool isAnnotatedParallel() const
Returns true if the loop is annotated parallel.
Definition: LoopInfo.cpp:564
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:631
Metadata node.
Definition: Metadata.h:1067
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1426
Tracking metadata reference owned by Metadata.
Definition: Metadata.h:889
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
Checks memory dependences among accesses to the same underlying object to determine whether there vec...
ArrayRef< unsigned > getOrderForAccess(Value *Ptr, bool IsWrite) const
Return the program order indices for the access location (Ptr, IsWrite).
bool isSafeForAnyVectorWidth() const
Return true if the number of elements that are safe to operate on simultaneously is not bounded.
bool areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps, const DenseMap< Value *, const SCEV * > &Strides, const DenseMap< Value *, SmallVector< const Value *, 16 > > &UnderlyingObjects)
Check whether the dependencies between the accesses are safe.
const SmallVectorImpl< Instruction * > & getMemoryInstructions() const
The vector of memory access instructions.
const Loop * getInnermostLoop() const
uint64_t getMaxSafeVectorWidthInBits() const
Return the number of elements that are safe to operate on simultaneously, multiplied by the size of t...
bool isSafeForVectorization() const
No memory dependence was encountered that would inhibit vectorization.
const SmallVectorImpl< Dependence > * getDependences() const
Returns the memory dependences.
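A hedged sketch of walking the recorded dependences; LAI is assumed to be the LoopAccessInfo obtained as in the earlier sketch, and getDependences() may return null when more than max-dependences accesses were seen and recording was abandoned.
// All names other than the documented accessors are assumptions of this sketch.
const MemoryDepChecker &DepChecker = LAI.getDepChecker();
if (const SmallVectorImpl<MemoryDepChecker::Dependence> *Deps =
        DepChecker.getDependences()) {
  for (const MemoryDepChecker::Dependence &Dep : *Deps) {
    // The stored indices are mapped back to instructions via the LoopAccessInfo.
    Instruction *Src = Dep.getSource(LAI);
    Instruction *Dst = Dep.getDestination(LAI);
    if (Dep.isPossiblyBackward())
      dbgs() << "possibly backward dependence: " << *Src << " -> " << *Dst << "\n";
  }
}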
SmallVector< Instruction *, 4 > getInstructionsForAccess(Value *Ptr, bool isWrite) const
Find the set of instructions that read or write via Ptr.
VectorizationSafetyStatus
Type to keep track of the status of the dependence check.
bool shouldRetryWithRuntimeCheck() const
In some cases when the dependency check fails we can still vectorize the loop with a dynamic array ac...
void addAccess(StoreInst *SI)
Register the location (instructions are given increasing numbers) of a write access.
PointerIntPair< Value *, 1, bool > MemAccessInfo
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
LocationSize Size
The maximum size of the location, in address-units, or UnknownSize if the size is not known.
AAMDNodes AATags
The metadata nodes which describe the aliasing of the location (each member is null if that kind of ...
const Value * Ptr
The address of the start of the location.
Root of the metadata hierarchy.
Definition: Metadata.h:62
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
Diagnostic information for optimization analysis remarks.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
void addPredicate(const SCEVPredicate &Pred)
Adds a new predicate.
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
bool hasNoOverflow(Value *V, SCEVWrapPredicate::IncrementWrapFlags Flags)
Returns true if we've proved that V doesn't wrap by means of a SCEV predicate.
void setNoOverflow(Value *V, SCEVWrapPredicate::IncrementWrapFlags Flags)
Proves that V doesn't overflow by adding SCEV predicate.
void print(raw_ostream &OS, unsigned Depth) const
Print the SCEV mappings done by the Predicated Scalar Evolution.
const SCEVAddRecExpr * getAsAddRec(Value *V)
Attempts to produce an AddRecExpr for V by adding additional SCEV predicates.
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
PreservedAnalysisChecker getChecker() const
Build a checker for this PreservedAnalyses and the specified analysis type.
Definition: Analysis.h:264
Holds information about the memory runtime legality checks to verify that a group of pointers do not ...
bool Need
This flag indicates if we need to add the runtime check.
void reset()
Reset the state of the pointer runtime information.
unsigned getNumberOfChecks() const
Returns the number of run-time checks required according to needsChecking.
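A hedged sketch of gating a transform on the number of required checks; LAI is assumed as in the earlier sketches, and getRuntimePointerChecking() is assumed to be the LoopAccessInfo accessor that returns this structure.
const RuntimePointerChecking *RtPtrChecking = LAI.getRuntimePointerChecking();
if (RtPtrChecking->Need &&
    RtPtrChecking->getNumberOfChecks() >
        VectorizerParams::RuntimeMemoryCheckThreshold) {
  // Emitting this many comparisons is unlikely to pay off; a client would
  // typically give up on versioning the loop at this point.
}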
void printChecks(raw_ostream &OS, const SmallVectorImpl< RuntimePointerCheck > &Checks, unsigned Depth=0) const
Print Checks.
bool needsChecking(const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const
Decide if we need to add a check between two groups of pointers, according to needsChecking.
void print(raw_ostream &OS, unsigned Depth=0) const
Print the list of run-time memory checks necessary.
SmallVector< RuntimeCheckingPtrGroup, 2 > CheckingGroups
Holds a partitioning of pointers into "check groups".
void generateChecks(MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies)
Generate the checks and store them.
static bool arePointersInSamePartition(const SmallVectorImpl< int > &PtrToPartition, unsigned PtrIdx1, unsigned PtrIdx2)
Check if pointers are in the same partition.
SmallVector< PointerInfo, 2 > Pointers
Information about the pointers that may require checking.
void insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr, Type *AccessTy, bool WritePtr, unsigned DepSetId, unsigned ASId, PredicatedScalarEvolution &PSE, bool NeedsFreeze)
Insert a pointer and calculate the start and end SCEVs.
This node represents a polynomial recurrence on the trip count of the specified loop.
const SCEV * evaluateAtIteration(const SCEV *It, ScalarEvolution &SE) const
Return the value of this chain of recurrences at the specified iteration number.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents a constant integer value.
This is the base class for unary integral cast operator classes.
This node represents multiplication of some number of SCEVs.
NoWrapFlags getNoWrapFlags(NoWrapFlags Mask=NoWrapMask) const
virtual void print(raw_ostream &OS, unsigned Depth=0) const =0
Prints a textual representation of this predicate with an indentation of Depth.
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
This class represents an analyzed expression in the program.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
const SCEV * getUMaxExpr(const SCEV *LHS, const SCEV *RHS)
const SCEVPredicate * getEqualPredicate(const SCEV *LHS, const SCEV *RHS)
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
const SCEV * getPtrToIntExpr(const SCEV *Op, Type *Ty)
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
const SCEV * getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
const SCEV * getUMinExpr(const SCEV *LHS, const SCEV *RHS, bool Sequential=false)
const SCEV * getStoreSizeOfExpr(Type *IntTy, Type *StoreTy)
Return an expression for the store size of StoreTy that is type IntTy.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getSizeOfExpr(Type *IntTy, TypeSize Size)
Return an expression for a TypeSize.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
const SCEV * getTruncateOrSignExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:317
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:70
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition: Value.h:736
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
An efficient, type-erasing, non-owning reference to a callable.
TypeSize getSequentialElementStride(const DataLayout &DL) const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:573
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:470
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
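A hedged sketch of the distance query; PtrA, PtrB, ElemTy, DL, and SE are assumptions supplied by the caller's context.
std::optional<int> Diff =
    getPointersDiff(ElemTy, PtrA, ElemTy, PtrB, DL, SE,
                    /*StrictCheck=*/false, /*CheckType=*/true);
if (Diff && *Diff == 1) {
  // PtrB points exactly one element of ElemTy past PtrA.
}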
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
Definition: LoopInfo.cpp:1053
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1928
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
bool NullPointerIsDefined(const Function *F, unsigned AS=0)
Check whether null pointer dereferencing is considered undefined behavior for a given function or an ...
Definition: Function.cpp:2043
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
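A hedged sketch of querying the stride of a load's address; LI (a LoadInst *), PSE, and Lp are assumptions supplied by the caller, while the helpers used are the ones documented on this page.
Type *AccessTy = getLoadStoreType(LI);
Value *Ptr = LI->getPointerOperand();
std::optional<int64_t> Stride = getPtrStride(PSE, AccessTy, Ptr, Lp);
if (Stride && *Stride == 1) {
  // The access advances by exactly one element of AccessTy per iteration.
}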
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
const SCEV * replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &PtrToStride, Value *Ptr)
Return the SCEV corresponding to a pointer with the symbolic stride replaced with constant one,...
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
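A hedged sketch of the consecutiveness check; A and B are assumed to be LoadInst pointers in the same function, and SE a ScalarEvolution reference from the caller's context.
const DataLayout &DL = A->getModule()->getDataLayout();
if (isConsecutiveAccess(A, B, DL, SE, /*CheckType=*/true)) {
  // B accesses the element immediately after A; the pair could, for
  // example, be widened into a single wider access.
}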
bool isGuaranteedNotToBeUndefOrPoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Return true if this function can prove that V does not have undef bits and is never poison.
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1824
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
gep_type_iterator gep_type_begin(const User *GEP)
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition: Metadata.h:783
MDNode * TBAA
The tag for type-based alias analysis.
Definition: Metadata.h:777
MDNode * NoAlias
The tag specifying the noalias scope.
Definition: Metadata.h:786
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:26
Dependence between memory access instructions.
DepType Type
The type of the dependence.
bool isPossiblyBackward() const
May be a lexically backward dependence type (includes Unknown).
bool isForward() const
Lexically forward dependence.
bool isBackward() const
Lexically backward dependence.
void print(raw_ostream &OS, unsigned Depth, const SmallVectorImpl< Instruction * > &Instrs) const
Print the dependence.
Instruction * getDestination(const LoopAccessInfo &LAI) const
Return the destination instruction of the dependence.
Instruction * getSource(const LoopAccessInfo &LAI) const
Return the source instruction of the dependence.
DepType
The type of the dependence.
static const char * DepName[]
String version of the types.
static VectorizationSafetyStatus isSafeForVectorization(DepType Type)
Dependence types that don't prevent vectorization.
unsigned AddressSpace
Address space of the involved pointers.
bool addPointer(unsigned Index, RuntimePointerChecking &RtCheck)
Tries to add the pointer recorded in RtCheck at index Index to this pointer checking group.
bool NeedsFreeze
Whether the pointer needs to be frozen after expansion, e.g.
const SCEV * High
The SCEV expression which represents the upper bound of all the pointers in this group.
SmallVector< unsigned, 2 > Members
Indices of all the pointers that constitute this grouping.
RuntimeCheckingPtrGroup(unsigned Index, RuntimePointerChecking &RtCheck)
Create a new pointer checking group containing a single pointer, with index Index in RtCheck.
const SCEV * Low
The SCEV expression which represents the lower bound of all the pointers in this group.
bool IsWritePtr
Holds whether this pointer is used for writing to memory.
unsigned DependencySetId
Holds the id of the set of pointers that could be dependent because of a shared underlying object.
unsigned AliasSetId
Holds the id of the disjoint alias set to which this pointer belongs.
static const unsigned MaxVectorWidth
Maximum SIMD width.
static unsigned VectorizationFactor
VF as overridden by the user.
static unsigned RuntimeMemoryCheckThreshold
When performing memory disambiguation checks at runtime, do not make more than this number of comparisons.
static bool isInterleaveForced()
True if force-vector-interleave was specified by the user.
static unsigned VectorizationInterleave
Interleave factor as overridden by the user.
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1450