1 : //===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation --------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : // The implementation of the loop memory dependence analysis that was
11 : // originally developed for the loop vectorizer.
12 : //
13 : //===----------------------------------------------------------------------===//
14 :
15 : #include "llvm/Analysis/LoopAccessAnalysis.h"
16 : #include "llvm/ADT/APInt.h"
17 : #include "llvm/ADT/DenseMap.h"
18 : #include "llvm/ADT/DepthFirstIterator.h"
19 : #include "llvm/ADT/EquivalenceClasses.h"
20 : #include "llvm/ADT/PointerIntPair.h"
21 : #include "llvm/ADT/STLExtras.h"
22 : #include "llvm/ADT/SetVector.h"
23 : #include "llvm/ADT/SmallPtrSet.h"
24 : #include "llvm/ADT/SmallSet.h"
25 : #include "llvm/ADT/SmallVector.h"
26 : #include "llvm/ADT/iterator_range.h"
27 : #include "llvm/Analysis/AliasAnalysis.h"
28 : #include "llvm/Analysis/AliasSetTracker.h"
29 : #include "llvm/Analysis/LoopAnalysisManager.h"
30 : #include "llvm/Analysis/LoopInfo.h"
31 : #include "llvm/Analysis/MemoryLocation.h"
32 : #include "llvm/Analysis/OptimizationRemarkEmitter.h"
33 : #include "llvm/Analysis/ScalarEvolution.h"
34 : #include "llvm/Analysis/ScalarEvolutionExpander.h"
35 : #include "llvm/Analysis/ScalarEvolutionExpressions.h"
36 : #include "llvm/Analysis/TargetLibraryInfo.h"
37 : #include "llvm/Analysis/ValueTracking.h"
38 : #include "llvm/Analysis/VectorUtils.h"
39 : #include "llvm/IR/BasicBlock.h"
40 : #include "llvm/IR/Constants.h"
41 : #include "llvm/IR/DataLayout.h"
42 : #include "llvm/IR/DebugLoc.h"
43 : #include "llvm/IR/DerivedTypes.h"
44 : #include "llvm/IR/DiagnosticInfo.h"
45 : #include "llvm/IR/Dominators.h"
46 : #include "llvm/IR/Function.h"
47 : #include "llvm/IR/IRBuilder.h"
48 : #include "llvm/IR/InstrTypes.h"
49 : #include "llvm/IR/Instruction.h"
50 : #include "llvm/IR/Instructions.h"
51 : #include "llvm/IR/Operator.h"
52 : #include "llvm/IR/PassManager.h"
53 : #include "llvm/IR/Type.h"
54 : #include "llvm/IR/Value.h"
55 : #include "llvm/IR/ValueHandle.h"
56 : #include "llvm/Pass.h"
57 : #include "llvm/Support/Casting.h"
58 : #include "llvm/Support/CommandLine.h"
59 : #include "llvm/Support/Debug.h"
60 : #include "llvm/Support/ErrorHandling.h"
61 : #include "llvm/Support/raw_ostream.h"
62 : #include <algorithm>
63 : #include <cassert>
64 : #include <cstdint>
65 : #include <cstdlib>
66 : #include <iterator>
67 : #include <utility>
68 : #include <vector>
69 :
70 : using namespace llvm;
71 :
72 : #define DEBUG_TYPE "loop-accesses"
73 :
74 : static cl::opt<unsigned, true>
75 : VectorizationFactor("force-vector-width", cl::Hidden,
76 : cl::desc("Sets the SIMD width. Zero is autoselect."),
77 : cl::location(VectorizerParams::VectorizationFactor));
78 : unsigned VectorizerParams::VectorizationFactor;
79 :
80 : static cl::opt<unsigned, true>
81 : VectorizationInterleave("force-vector-interleave", cl::Hidden,
82 : cl::desc("Sets the vectorization interleave count. "
83 : "Zero is autoselect."),
84 : cl::location(
85 : VectorizerParams::VectorizationInterleave));
86 : unsigned VectorizerParams::VectorizationInterleave;
87 :
88 : static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold(
89 : "runtime-memory-check-threshold", cl::Hidden,
90 : cl::desc("When performing memory disambiguation checks at runtime do not "
91 : "generate more than this number of comparisons (default = 8)."),
92 : cl::location(VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(8));
93 : unsigned VectorizerParams::RuntimeMemoryCheckThreshold;
94 :
95 : /// The maximum number of comparisons allowed when merging runtime memory checks.
96 : static cl::opt<unsigned> MemoryCheckMergeThreshold(
97 : "memory-check-merge-threshold", cl::Hidden,
98 : cl::desc("Maximum number of comparisons done when trying to merge "
99 : "runtime memory checks. (default = 100)"),
100 : cl::init(100));
101 :
102 : /// Maximum SIMD width.
103 : const unsigned VectorizerParams::MaxVectorWidth = 64;
104 :
105 : /// We collect dependences up to this threshold.
106 : static cl::opt<unsigned>
107 : MaxDependences("max-dependences", cl::Hidden,
108 : cl::desc("Maximum number of dependences collected by "
109 : "loop-access analysis (default = 100)"),
110 : cl::init(100));
111 :
112 : /// This enables versioning on the strides of symbolically striding memory
113 : /// accesses in code like the following.
114 : /// for (i = 0; i < N; ++i)
115 : /// A[i * Stride1] += B[i * Stride2] ...
116 : ///
117 : /// Will be roughly translated to
118 : /// if (Stride1 == 1 && Stride2 == 1) {
119 : /// for (i = 0; i < N; i+=4)
120 : /// A[i:i+3] += ...
121 : /// } else
122 : /// ...
123 : static cl::opt<bool> EnableMemAccessVersioning(
124 : "enable-mem-access-versioning", cl::init(true), cl::Hidden,
125 : cl::desc("Enable symbolic stride memory access versioning"));
126 :
127 : /// Enable store-to-load forwarding conflict detection. This option can
128 : /// be disabled for correctness testing.
129 : static cl::opt<bool> EnableForwardingConflictDetection(
130 : "store-to-load-forwarding-conflict-detection", cl::Hidden,
131 : cl::desc("Enable conflict detection in loop-access analysis"),
132 : cl::init(true));
133 :
134 4831 : bool VectorizerParams::isInterleaveForced() {
135 4831 : return ::VectorizationInterleave.getNumOccurrences() > 0;
136 : }
137 :
138 43 : Value *llvm::stripIntegerCast(Value *V) {
139 : if (auto *CI = dyn_cast<CastInst>(V))
140 6 : if (CI->getOperand(0)->getType()->isIntegerTy())
141 : return CI->getOperand(0);
142 : return V;
143 : }
144 :
145 29549 : const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
146 : const ValueToValueMap &PtrToStride,
147 : Value *Ptr, Value *OrigPtr) {
148 29549 : const SCEV *OrigSCEV = PSE.getSCEV(Ptr);
149 :
150 : // If there is an entry in the map return the SCEV of the pointer with the
151 : // symbolic stride replaced by one.
152 : ValueToValueMap::const_iterator SI =
153 59098 : PtrToStride.find(OrigPtr ? OrigPtr : Ptr);
154 29549 : if (SI != PtrToStride.end()) {
155 43 : Value *StrideVal = SI->second;
156 :
157 : // Strip casts.
158 43 : StrideVal = stripIntegerCast(StrideVal);
159 :
160 43 : ScalarEvolution *SE = PSE.getSE();
161 43 : const auto *U = cast<SCEVUnknown>(SE->getSCEV(StrideVal));
162 : const auto *CT =
163 43 : static_cast<const SCEVConstant *>(SE->getOne(StrideVal->getType()));
164 :
165 43 : PSE.addPredicate(*SE->getEqualPredicate(U, CT));
166 43 : auto *Expr = PSE.getSCEV(Ptr);
167 :
168 : LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
169 : << " by: " << *Expr << "\n");
170 43 : return Expr;
171 : }
172 :
173 : // Otherwise, just return the SCEV of the original pointer.
174 : return OrigSCEV;
175 : }
176 :
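// Illustrative sketch only -- not part of the analysis. It shows how a client
// that has recorded a symbolic stride for a pointer (as the vectorizer's
// legality analysis does) could query the "stride == 1" form of its SCEV.
// The helper name and the map contents are hypothetical; the predicate that
// replaceSymbolicStrideSCEV adds to PSE is what loop versioning later turns
// into a run-time "Stride == 1" check.
static const SCEV *getStrideOneSCEVSketch(PredicatedScalarEvolution &PSE,
                                          Value *Ptr, Value *StrideVal) {
  ValueToValueMap SymbolicStrides;
  SymbolicStrides[Ptr] = StrideVal; // the stride value feeding Ptr's address
  // Returns Ptr's SCEV with the symbolic stride rewritten to one; PSE now
  // carries the corresponding "StrideVal == 1" predicate.
  return replaceSymbolicStrideSCEV(PSE, SymbolicStrides, Ptr);
}
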
177 : /// Calculate Start and End points of memory access.
178 : /// Let's assume A is the first access and B is a memory access on N-th loop
179 : /// iteration. Then B is calculated as:
180 : /// B = A + Step*N .
181 : /// Step value may be positive or negative.
182 : /// N is a calculated back-edge taken count:
183 : /// N = (TripCount > 0) ? RoundDown(TripCount - 1, VF) : 0
184 : /// Start and End points are calculated in the following way:
185 : /// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
186 : /// where SizeOfElt is the size of single memory access in bytes.
187 : ///
188 : /// There is no conflict when the intervals are disjoint:
189 : /// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
190 4271 : void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr,
191 : unsigned DepSetId, unsigned ASId,
192 : const ValueToValueMap &Strides,
193 : PredicatedScalarEvolution &PSE) {
194 : // Get the stride replaced scev.
195 4271 : const SCEV *Sc = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
196 4271 : ScalarEvolution *SE = PSE.getSE();
197 :
198 : const SCEV *ScStart;
199 : const SCEV *ScEnd;
200 :
201 4271 : if (SE->isLoopInvariant(Sc, Lp))
202 1858 : ScStart = ScEnd = Sc;
203 : else {
204 2413 : const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
205 : assert(AR && "Invalid addrec expression");
206 2413 : const SCEV *Ex = PSE.getBackedgeTakenCount();
207 :
208 2413 : ScStart = AR->getStart();
209 2413 : ScEnd = AR->evaluateAtIteration(Ex, *SE);
210 2413 : const SCEV *Step = AR->getStepRecurrence(*SE);
211 :
212 : // For expressions with negative step, the upper bound is ScStart and the
213 : // lower bound is ScEnd.
214 : if (const auto *CStep = dyn_cast<SCEVConstant>(Step)) {
215 2391 : if (CStep->getValue()->isNegative())
216 : std::swap(ScStart, ScEnd);
217 : } else {
218 : // Fallback case: the step is not constant, but we can still
219 : // get the upper and lower bounds of the interval by using min/max
220 : // expressions.
221 22 : ScStart = SE->getUMinExpr(ScStart, ScEnd);
222 44 : ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd);
223 : }
224 : // Add the size of the pointed element to ScEnd.
225 : unsigned EltSize =
226 2413 : Ptr->getType()->getPointerElementType()->getScalarSizeInBits() / 8;
227 2413 : const SCEV *EltSizeSCEV = SE->getConstant(ScEnd->getType(), EltSize);
228 2413 : ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);
229 : }
230 :
231 4271 : Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, Sc);
232 4271 : }
233 :
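// Worked example (illustration only): for a 4-byte access whose pointer SCEV
// is {%a,+,4}<%loop> and a backedge-taken count of (%n - 1), the bounds
// inserted above are
//   ScStart = %a
//   ScEnd   = (%a + 4 * (%n - 1)) + 4
// i.e. the address of the last access plus the element size, so the interval
// [ScStart, ScEnd) covers every byte the loop can touch through this pointer.
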
234 : SmallVector<RuntimePointerChecking::PointerCheck, 4>
235 680 : RuntimePointerChecking::generateChecks() const {
236 : SmallVector<PointerCheck, 4> Checks;
237 :
238 2625 : for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
239 4232 : for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
240 : const RuntimePointerChecking::CheckingPtrGroup &CGI = CheckingGroups[I];
241 : const RuntimePointerChecking::CheckingPtrGroup &CGJ = CheckingGroups[J];
242 :
243 2287 : if (needsChecking(CGI, CGJ))
244 1946 : Checks.push_back(std::make_pair(&CGI, &CGJ));
245 : }
246 : }
247 680 : return Checks;
248 : }
249 :
250 680 : void RuntimePointerChecking::generateChecks(
251 : MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
252 : assert(Checks.empty() && "Checks is not empty");
253 680 : groupChecks(DepCands, UseDependencies);
254 680 : Checks = generateChecks();
255 680 : }
256 :
257 2287 : bool RuntimePointerChecking::needsChecking(const CheckingPtrGroup &M,
258 : const CheckingPtrGroup &N) const {
259 2765 : for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I)
260 2947 : for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J)
261 7407 : if (needsChecking(M.Members[I], N.Members[J]))
262 : return true;
263 : return false;
264 : }
265 :
266 : /// Compare \p I and \p J and return the minimum.
267 : /// Return nullptr in case we couldn't find an answer.
268 3279 : static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
269 : ScalarEvolution *SE) {
270 3279 : const SCEV *Diff = SE->getMinusSCEV(J, I);
271 : const SCEVConstant *C = dyn_cast<const SCEVConstant>(Diff);
272 :
273 : if (!C)
274 : return nullptr;
275 3264 : if (C->getValue()->isNegative())
276 636 : return J;
277 : return I;
278 : }
279 :
280 1647 : bool RuntimePointerChecking::CheckingPtrGroup::addPointer(unsigned Index) {
281 1647 : const SCEV *Start = RtCheck.Pointers[Index].Start;
282 1647 : const SCEV *End = RtCheck.Pointers[Index].End;
283 :
284 : // Compare the starts and ends with the known minimum and maximum
285 : // of this set. We need to know how we compare against the min/max
286 : // of the set in order to be able to emit memchecks.
287 1647 : const SCEV *Min0 = getMinFromExprs(Start, Low, RtCheck.SE);
288 1647 : if (!Min0)
289 : return false;
290 :
291 1632 : const SCEV *Min1 = getMinFromExprs(End, High, RtCheck.SE);
292 1632 : if (!Min1)
293 : return false;
294 :
295 : // Update the low bound expression if we've found a new min value.
296 1632 : if (Min0 == Start)
297 1088 : Low = Start;
298 :
299 : // Update the high bound expression if we've found a new max value.
300 1632 : if (Min1 != End)
301 92 : High = End;
302 :
303 1632 : Members.push_back(Index);
304 1632 : return true;
305 : }
306 :
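// Worked example (illustration only): suppose a group currently spans
// [Low = %a, High = %a + 404) and addPointer is called for a pointer with
// Start = %a + 100 and End = %a + 504. Both differences are constant, so:
//   getMinFromExprs(Start, Low)  == %a       -> Min0 != Start, Low stays %a
//   getMinFromExprs(End, High)   == %a + 404 -> Min1 != End, High becomes
//                                               %a + 504
// The pointer is merged and the group now spans [%a, %a + 504).
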
307 680 : void RuntimePointerChecking::groupChecks(
308 : MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
309 : // We build the groups from dependency candidates equivalence classes
310 : // because:
311 : // - We know that pointers in the same equivalence class share
312 : // the same underlying object and therefore there is a chance
313 : // that we can compare pointers
314 : // - We wouldn't be able to merge two pointers for which we need
315 : // to emit a memcheck. The classes in DepCands are already
316 : // conveniently built such that no two pointers in the same
317 : // class need checking against each other.
318 :
319 : // We use the following (greedy) algorithm to construct the groups
320 : // For every pointer in the equivalence class:
321 : // For each existing group:
322 : // - if the difference between this pointer and the min/max bounds
323 : // of the group is a constant, then make the pointer part of the
324 : // group and update the min/max bounds of that group as required.
325 :
326 680 : CheckingGroups.clear();
327 :
328 : // If we need to check two pointers to the same underlying object
329 : // with a non-constant difference, we shouldn't perform any pointer
330 : // grouping with those pointers. This is because we can easily get
331 : // into cases where the resulting check would return false, even when
332 : // the accesses are safe.
333 : //
334 : // The following example shows this:
335 : // for (i = 0; i < 1000; ++i)
336 : // a[5000 + i * m] = a[i] + a[i + 9000]
337 : //
338 : // Here grouping gives a check of (5000, 5000 + 1000 * m) against
339 : // (0, 10000) which is always false. However, if m is 1, there is no
340 : // dependence. Not grouping the checks for a[i] and a[i + 9000] allows
341 : // us to perform an accurate check in this case.
342 : //
343 : // The above case requires that we have an UnknownDependence between
344 : // accesses to the same underlying object. This cannot happen unless
345 : // ShouldRetryWithRuntimeCheck is set, and therefore UseDependencies
346 : // is also false. In this case we will use the fallback path and create
347 : // separate checking groups for all pointers.
348 :
349 : // If we don't have the dependency partitions, construct a new
350 : // checking pointer group for each pointer. This is also required
351 : // for correctness, because in this case we can have checking between
352 : // pointers to the same underlying object.
353 680 : if (!UseDependencies) {
354 69 : for (unsigned I = 0; I < Pointers.size(); ++I)
355 98 : CheckingGroups.push_back(CheckingPtrGroup(I, *this));
356 20 : return;
357 : }
358 :
359 : unsigned TotalComparisons = 0;
360 :
361 : DenseMap<Value *, unsigned> PositionMap;
362 3318 : for (unsigned Index = 0; Index < Pointers.size(); ++Index)
363 2658 : PositionMap[Pointers[Index].PointerValue] = Index;
364 :
365 : // We need to keep track of what pointers we've already seen so we
366 : // don't process them twice.
367 660 : SmallSet<unsigned, 2> Seen;
368 :
369 : // Go through all equivalence classes, get the "pointer check groups"
370 : // and add them to the overall solution. We use the order in which accesses
371 : // appear in 'Pointers' to enforce determinism.
372 3318 : for (unsigned I = 0; I < Pointers.size(); ++I) {
373 : // We've seen this pointer before, and therefore already processed
374 : // its equivalence class.
375 2658 : if (Seen.count(I))
376 771 : continue;
377 :
378 : MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
379 1887 : Pointers[I].IsWritePtr);
380 :
381 1887 : SmallVector<CheckingPtrGroup, 2> Groups;
382 1887 : auto LeaderI = DepCands.findValue(DepCands.getLeaderValue(Access));
383 :
384 : // Because DepCands is constructed by visiting accesses in the order in
385 : // which they appear in alias sets (which is deterministic) and the
386 : // iteration order within an equivalence class member is only dependent on
387 : // the order in which unions and insertions are performed on the
388 : // equivalence class, the iteration order is deterministic.
389 : for (auto MI = DepCands.member_begin(LeaderI), ME = DepCands.member_end();
390 5415 : MI != ME; ++MI) {
391 7056 : unsigned Pointer = PositionMap[MI->getPointer()];
392 : bool Merged = false;
393 : // Mark this pointer as seen.
394 3528 : Seen.insert(Pointer);
395 :
396 : // Go through all the existing sets and see if we can find one
397 : // which can include this pointer.
398 3543 : for (CheckingPtrGroup &Group : Groups) {
399 : // Don't perform more than a certain amount of comparisons.
400 : // This should limit the cost of grouping the pointers to something
401 : // reasonable. If we do end up hitting this threshold, the algorithm
402 : // will create separate groups for all remaining pointers.
403 1647 : if (TotalComparisons > MemoryCheckMergeThreshold)
404 : break;
405 :
406 1647 : TotalComparisons++;
407 :
408 1647 : if (Group.addPointer(Pointer)) {
409 : Merged = true;
410 : break;
411 : }
412 : }
413 :
414 3528 : if (!Merged)
415 : // We couldn't add this pointer to any existing set or the threshold
416 : // for the number of comparisons has been reached. Create a new group
417 : // to hold the current pointer.
418 3792 : Groups.push_back(CheckingPtrGroup(Pointer, *this));
419 : }
420 :
421 : // We've computed the grouped checks for this partition.
422 : // Save the results and continue with the next one.
423 1887 : std::copy(Groups.begin(), Groups.end(), std::back_inserter(CheckingGroups));
424 : }
425 : }
426 :
427 60 : bool RuntimePointerChecking::arePointersInSamePartition(
428 : const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1,
429 : unsigned PtrIdx2) {
430 120 : return (PtrToPartition[PtrIdx1] != -1 &&
431 120 : PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]);
432 : }
433 :
434 2556 : bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
435 2556 : const PointerInfo &PointerI = Pointers[I];
436 2556 : const PointerInfo &PointerJ = Pointers[J];
437 :
438 : // No need to check if two readonly pointers intersect.
439 2556 : if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
440 : return false;
441 :
442 : // Only need to check pointers between two different dependency sets.
443 2024 : if (PointerI.DependencySetId == PointerJ.DependencySetId)
444 : return false;
445 :
446 : // Only need to check pointers in the same alias set.
447 2008 : if (PointerI.AliasSetId != PointerJ.AliasSetId)
448 2 : return false;
449 :
450 : return true;
451 : }
452 :
453 105 : void RuntimePointerChecking::printChecks(
454 : raw_ostream &OS, const SmallVectorImpl<PointerCheck> &Checks,
455 : unsigned Depth) const {
456 : unsigned N = 0;
457 186 : for (const auto &Check : Checks) {
458 81 : const auto &First = Check.first->Members, &Second = Check.second->Members;
459 :
460 81 : OS.indent(Depth) << "Check " << N++ << ":\n";
461 :
462 81 : OS.indent(Depth + 2) << "Comparing group (" << Check.first << "):\n";
463 191 : for (unsigned K = 0; K < First.size(); ++K)
464 110 : OS.indent(Depth + 2) << *Pointers[First[K]].PointerValue << "\n";
465 :
466 81 : OS.indent(Depth + 2) << "Against group (" << Check.second << "):\n";
467 169 : for (unsigned K = 0; K < Second.size(); ++K)
468 88 : OS.indent(Depth + 2) << *Pointers[Second[K]].PointerValue << "\n";
469 : }
470 105 : }
471 :
472 105 : void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const {
473 :
474 105 : OS.indent(Depth) << "Run-time memory checks:\n";
475 105 : printChecks(OS, Checks, Depth);
476 :
477 105 : OS.indent(Depth) << "Grouped accesses:\n";
478 206 : for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
479 : const auto &CG = CheckingGroups[I];
480 :
481 101 : OS.indent(Depth + 2) << "Group " << &CG << ":\n";
482 101 : OS.indent(Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High
483 101 : << ")\n";
484 228 : for (unsigned J = 0; J < CG.Members.size(); ++J) {
485 127 : OS.indent(Depth + 6) << "Member: " << *Pointers[CG.Members[J]].Expr
486 127 : << "\n";
487 : }
488 : }
489 105 : }
490 :
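// For illustration, a dump produced by print(OS, /*Depth=*/2) for a loop with
// two single-member pointer groups has roughly the following shape (the group
// addresses, IR names and SCEVs below are made up):
//
//   Run-time memory checks:
//   Check 0:
//     Comparing group (0x55d1c0):
//     %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
//     Against group (0x55d200):
//     %arrayidx4 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
//   Grouped accesses:
//     Group 0x55d1c0:
//       (Low: %a High: (%a + (4 * %n)))
//         Member: {%a,+,4}<%for.body>
//     Group 0x55d200:
//       (Low: %b High: (%b + (4 * %n)))
//         Member: {%b,+,4}<%for.body>
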
491 : namespace {
492 :
493 : /// Analyses memory accesses in a loop.
494 : ///
495 : /// Checks whether run time pointer checks are needed and builds sets for data
496 : /// dependence checking.
497 : class AccessAnalysis {
498 : public:
499 : /// Read or write access location.
500 : typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
501 : typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
502 :
503 1588 : AccessAnalysis(const DataLayout &Dl, Loop *TheLoop, AliasAnalysis *AA,
504 : LoopInfo *LI, MemoryDepChecker::DepCandidates &DA,
505 : PredicatedScalarEvolution &PSE)
506 1588 : : DL(Dl), TheLoop(TheLoop), AST(*AA), LI(LI), DepCands(DA),
507 1588 : IsRTCheckAnalysisNeeded(false), PSE(PSE) {}
508 :
509 : /// Register a load and whether it is only read from.
510 3903 : void addLoad(MemoryLocation &Loc, bool IsReadOnly) {
511 3903 : Value *Ptr = const_cast<Value*>(Loc.Ptr);
512 3903 : AST.add(Ptr, LocationSize::unknown(), Loc.AATags);
513 7806 : Accesses.insert(MemAccessInfo(Ptr, false));
514 3903 : if (IsReadOnly)
515 3659 : ReadOnlyPtr.insert(Ptr);
516 3903 : }
517 :
518 : /// Register a store.
519 3676 : void addStore(MemoryLocation &Loc) {
520 3676 : Value *Ptr = const_cast<Value*>(Loc.Ptr);
521 3676 : AST.add(Ptr, LocationSize::unknown(), Loc.AATags);
522 7352 : Accesses.insert(MemAccessInfo(Ptr, true));
523 3676 : }
524 :
525 : /// Check if we can emit a run-time no-alias check for \p Access.
526 : ///
527 : /// Returns true if we can emit a run-time no alias check for \p Access.
528 : /// If we can check this access, this also adds it to a dependence set and
529 : /// adds a run-time to check for it to \p RtCheck. If \p Assume is true,
530 : /// we will attempt to use additional run-time checks in order to get
531 : /// the bounds of the pointer.
532 : bool createCheckForAccess(RuntimePointerChecking &RtCheck,
533 : MemAccessInfo Access,
534 : const ValueToValueMap &Strides,
535 : DenseMap<Value *, unsigned> &DepSetId,
536 : Loop *TheLoop, unsigned &RunningDepId,
537 : unsigned ASId, bool ShouldCheckStride,
538 : bool Assume);
539 :
540 : /// Check whether we can check the pointers at runtime for
541 : /// non-intersection.
542 : ///
543 : /// Returns true if we need no check or if we do and we can generate them
544 : /// (i.e. the pointers have computable bounds).
545 : bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, ScalarEvolution *SE,
546 : Loop *TheLoop, const ValueToValueMap &Strides,
547 : bool ShouldCheckWrap = false);
548 :
549 : /// Goes over all memory accesses, checks whether an RT check is needed
550 : /// and builds sets of dependent accesses.
551 : void buildDependenceSets() {
552 1327 : processMemAccesses();
553 : }
554 :
555 : /// Initial processing of memory accesses determined that we need to
556 : /// perform dependency checking.
557 : ///
558 : /// Note that this can later be cleared if we retry memcheck analysis without
559 : /// dependency checking (i.e. ShouldRetryWithRuntimeCheck).
560 1029 : bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
561 :
562 : /// We decided that no dependence analysis would be used. Reset the state.
563 : void resetDepChecks(MemoryDepChecker &DepChecker) {
564 : CheckDeps.clear();
565 : DepChecker.clearDependences();
566 : }
567 :
568 : MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; }
569 :
570 : private:
571 : typedef SetVector<MemAccessInfo> PtrAccessSet;
572 :
573 : /// Go over all memory access and check whether runtime pointer checks
574 : /// are needed and build sets of dependency check candidates.
575 : void processMemAccesses();
576 :
577 : /// Set of all accesses.
578 : PtrAccessSet Accesses;
579 :
580 : const DataLayout &DL;
581 :
582 : /// The loop being checked.
583 : const Loop *TheLoop;
584 :
585 : /// List of accesses that need a further dependence check.
586 : MemAccessInfoList CheckDeps;
587 :
588 : /// Set of pointers that are read only.
589 : SmallPtrSet<Value*, 16> ReadOnlyPtr;
590 :
591 : /// An alias set tracker to partition the access set by underlying object and
592 : /// intrinsic property (such as TBAA metadata).
593 : AliasSetTracker AST;
594 :
595 : LoopInfo *LI;
596 :
597 : /// Sets of potentially dependent accesses - members of one set share an
598 : /// underlying pointer. The set "CheckDeps" identifies which sets really need a
599 : /// dependence check.
600 : MemoryDepChecker::DepCandidates &DepCands;
601 :
602 : /// Initial processing of memory accesses determined that we may need
603 : /// to add memchecks. Perform the analysis to determine the necessary checks.
604 : ///
605 : /// Note that this is different from isDependencyCheckNeeded. When we retry
606 : /// memcheck analysis without dependency checking
607 : /// (i.e. ShouldRetryWithRuntimeCheck), isDependencyCheckNeeded is cleared
608 : /// while this remains set if we have potentially dependent accesses.
609 : bool IsRTCheckAnalysisNeeded;
610 :
611 : /// The SCEV predicate containing all the SCEV-related assumptions.
612 : PredicatedScalarEvolution &PSE;
613 : };
614 :
615 : } // end anonymous namespace
616 :
617 : /// Check whether a pointer can participate in a runtime bounds check.
618 : /// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr
619 : /// by adding run-time checks (overflow checks) if necessary.
620 4650 : static bool hasComputableBounds(PredicatedScalarEvolution &PSE,
621 : const ValueToValueMap &Strides, Value *Ptr,
622 : Loop *L, bool Assume) {
623 4650 : const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
624 :
625 : // The bounds for a loop-invariant pointer are trivial.
626 4650 : if (PSE.getSE()->isLoopInvariant(PtrScev, L))
627 : return true;
628 :
629 : const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
630 :
631 2792 : if (!AR && Assume)
632 145 : AR = PSE.getAsAddRec(Ptr);
633 :
634 2792 : if (!AR)
635 : return false;
636 :
637 2419 : return AR->isAffine();
638 : }
639 :
640 : /// Check whether a pointer address cannot wrap.
641 51 : static bool isNoWrap(PredicatedScalarEvolution &PSE,
642 : const ValueToValueMap &Strides, Value *Ptr, Loop *L) {
643 51 : const SCEV *PtrScev = PSE.getSCEV(Ptr);
644 51 : if (PSE.getSE()->isLoopInvariant(PtrScev, L))
645 : return true;
646 :
647 48 : int64_t Stride = getPtrStride(PSE, Ptr, L, Strides);
648 48 : if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
649 44 : return true;
650 :
651 : return false;
652 : }
653 :
654 4650 : bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
655 : MemAccessInfo Access,
656 : const ValueToValueMap &StridesMap,
657 : DenseMap<Value *, unsigned> &DepSetId,
658 : Loop *TheLoop, unsigned &RunningDepId,
659 : unsigned ASId, bool ShouldCheckWrap,
660 : bool Assume) {
661 : Value *Ptr = Access.getPointer();
662 :
663 4650 : if (!hasComputableBounds(PSE, StridesMap, Ptr, TheLoop, Assume))
664 : return false;
665 :
666 : // When we run after a failing dependency check we have to make sure
667 : // we don't have wrapping pointers.
668 4273 : if (ShouldCheckWrap && !isNoWrap(PSE, StridesMap, Ptr, TheLoop)) {
669 4 : auto *Expr = PSE.getSCEV(Ptr);
670 4 : if (!Assume || !isa<SCEVAddRecExpr>(Expr))
671 : return false;
672 2 : PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
673 : }
674 :
675 : // The id of the dependence set.
676 : unsigned DepId;
677 :
678 4271 : if (isDependencyCheckNeeded()) {
679 4222 : Value *Leader = DepCands.getLeaderValue(Access).getPointer();
680 : unsigned &LeaderId = DepSetId[Leader];
681 4222 : if (!LeaderId)
682 2691 : LeaderId = RunningDepId++;
683 4222 : DepId = LeaderId;
684 : } else
685 : // Each access has its own dependence set.
686 49 : DepId = RunningDepId++;
687 :
688 : bool IsWrite = Access.getInt();
689 4271 : RtCheck.insert(TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap, PSE);
690 : LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
691 :
692 4271 : return true;
693 : }
694 :
695 1347 : bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
696 : ScalarEvolution *SE, Loop *TheLoop,
697 : const ValueToValueMap &StridesMap,
698 : bool ShouldCheckWrap) {
699 : // Find pointers with computable bounds. We are going to use this information
700 : // to place a runtime bound check.
701 : bool CanDoRT = true;
702 :
703 : bool NeedRTCheck = false;
704 1347 : if (!IsRTCheckAnalysisNeeded) return true;
705 :
706 : bool IsDepCheckNeeded = isDependencyCheckNeeded();
707 :
708 : // We assign a consecutive id to accesses from different alias sets.
709 : // Accesses between different groups don't need to be checked.
710 : unsigned ASId = 1;
711 2290 : for (auto &AS : AST) {
712 : int NumReadPtrChecks = 0;
713 : int NumWritePtrChecks = 0;
714 : bool CanDoAliasSetRT = true;
715 :
716 : // We assign consecutive ids to accesses from different dependence sets.
717 : // Accesses within the same set don't need a runtime check.
718 1261 : unsigned RunningDepId = 1;
719 : DenseMap<Value *, unsigned> DepSetId;
720 :
721 : SmallVector<MemAccessInfo, 4> Retries;
722 :
723 5758 : for (auto A : AS) {
724 : Value *Ptr = A.getValue();
725 4497 : bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
726 : MemAccessInfo Access(Ptr, IsWrite);
727 :
728 4497 : if (IsWrite)
729 3096 : ++NumWritePtrChecks;
730 : else
731 1401 : ++NumReadPtrChecks;
732 :
733 4497 : if (!createCheckForAccess(RtCheck, Access, StridesMap, DepSetId, TheLoop,
734 : RunningDepId, ASId, ShouldCheckWrap, false)) {
735 : LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:" << *Ptr << '\n');
736 251 : Retries.push_back(Access);
737 : CanDoAliasSetRT = false;
738 : }
739 : }
740 :
741 : // If we have at least two writes or one write and a read then we need to
742 : // check them. But there is no need for checks if there is only one
743 : // dependence set for this alias set.
744 : //
745 : // Note that this function computes CanDoRT and NeedRTCheck independently.
746 : // For example CanDoRT=false, NeedRTCheck=false means that we have a pointer
747 : // for which we couldn't find the bounds but we don't actually need to emit
748 : // any checks so it does not matter.
749 : bool NeedsAliasSetRTCheck = false;
750 1261 : if (!(IsDepCheckNeeded && CanDoAliasSetRT && RunningDepId == 2))
751 941 : NeedsAliasSetRTCheck = (NumWritePtrChecks >= 2 ||
752 542 : (NumReadPtrChecks >= 1 && NumWritePtrChecks >= 1));
753 :
754 : // We need to perform run-time alias checks, but some pointers had bounds
755 : // that couldn't be checked.
756 1261 : if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) {
757 : // Reset the CanDoAliasSetRT flag and retry all accesses that have failed.
758 : // We know that we need these checks, so we can now be more aggressive
759 : // and add further checks if required (overflow checks).
760 : CanDoAliasSetRT = true;
761 165 : for (auto Access : Retries)
762 153 : if (!createCheckForAccess(RtCheck, Access, StridesMap, DepSetId,
763 : TheLoop, RunningDepId, ASId,
764 : ShouldCheckWrap, /*Assume=*/true)) {
765 : CanDoAliasSetRT = false;
766 : break;
767 : }
768 : }
769 :
770 : CanDoRT &= CanDoAliasSetRT;
771 1261 : NeedRTCheck |= NeedsAliasSetRTCheck;
772 1261 : ++ASId;
773 : }
774 :
775 : // If the pointers that we would use for the bounds comparison have different
776 : // address spaces, assume the values aren't directly comparable, so we can't
777 : // use them for the runtime check. We also have to assume they could
778 : // overlap. In the future there should be metadata for whether address spaces
779 : // are disjoint.
780 1029 : unsigned NumPointers = RtCheck.Pointers.size();
781 5278 : for (unsigned i = 0; i < NumPointers; ++i) {
782 18810 : for (unsigned j = i + 1; j < NumPointers; ++j) {
783 : // Only need to check pointers between two different dependency sets.
784 14561 : if (RtCheck.Pointers[i].DependencySetId ==
785 29122 : RtCheck.Pointers[j].DependencySetId)
786 : continue;
787 : // Only need to check pointers in the same alias set.
788 11926 : if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId)
789 : continue;
790 :
791 : Value *PtrI = RtCheck.Pointers[i].PointerValue;
792 : Value *PtrJ = RtCheck.Pointers[j].PointerValue;
793 :
794 11920 : unsigned ASi = PtrI->getType()->getPointerAddressSpace();
795 11920 : unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
796 11920 : if (ASi != ASj) {
797 : LLVM_DEBUG(
798 : dbgs() << "LAA: Runtime check would require comparison between"
799 : " different address spaces\n");
800 : return false;
801 : }
802 : }
803 : }
804 :
805 1020 : if (NeedRTCheck && CanDoRT)
806 680 : RtCheck.generateChecks(DepCands, IsDepCheckNeeded);
807 :
808 : LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
809 : << " pointer comparisons.\n");
810 :
811 1020 : RtCheck.Need = NeedRTCheck;
812 :
813 1020 : bool CanDoRTIfNeeded = !NeedRTCheck || CanDoRT;
814 1020 : if (!CanDoRTIfNeeded)
815 : RtCheck.reset();
816 : return CanDoRTIfNeeded;
817 : }
818 :
819 1327 : void AccessAnalysis::processMemAccesses() {
820 : // We process the set twice: first we process read-write pointers, last we
821 : // process read-only pointers. This allows us to skip dependence tests for
822 : // read-only pointers.
823 :
824 : LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
825 : LLVM_DEBUG(dbgs() << " AST: "; AST.dump());
826 : LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
827 : LLVM_DEBUG({
828 : for (auto A : Accesses)
829 : dbgs() << "\t" << *A.getPointer() << " (" <<
830 : (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ?
831 : "read-only" : "read")) << ")\n";
832 : });
833 :
834 : // The AliasSetTracker has nicely partitioned our pointers by metadata
835 : // compatibility and potential for underlying-object overlap. As a result, we
836 : // only need to check for potential pointer dependencies within each alias
837 : // set.
838 3321 : for (auto &AS : AST) {
839 : // Note that both the alias-set tracker and the alias sets themselves use
840 : // linked lists internally and so the iteration order here is deterministic
841 : // (matching the original instruction order within each set).
842 :
843 : bool SetHasWrite = false;
844 :
845 : // Map of pointers to last access encountered.
846 : typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap;
847 : UnderlyingObjToAccessMap ObjToLastAccess;
848 :
849 : // Set of access to check after all writes have been processed.
850 1994 : PtrAccessSet DeferredAccesses;
851 :
852 : // Iterate over each alias set twice, once to process read/write pointers,
853 : // and then to process read-only pointers.
854 5982 : for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
855 3988 : bool UseDeferred = SetIteration > 0;
856 3988 : PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
857 :
858 14412 : for (auto AV : AS) {
859 : Value *Ptr = AV.getValue();
860 :
861 : // For a single memory access in AliasSetTracker, Accesses may contain
862 : // both read and write, and they both need to be handled for CheckDeps.
863 102662 : for (auto AC : S) {
864 92238 : if (AC.getPointer() != Ptr)
865 85248 : continue;
866 :
867 : bool IsWrite = AC.getInt();
868 :
869 : // If we're using the deferred access set, then it contains only
870 : // reads.
871 10429 : bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
872 10429 : if (UseDeferred && !IsReadOnlyPtr)
873 : continue;
874 : // Otherwise, the pointer must be in the PtrAccessSet, either as a
875 : // read or a write.
876 : assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
877 : S.count(MemAccessInfo(Ptr, false))) &&
878 : "Alias-set pointer not in the access set?");
879 :
880 : MemAccessInfo Access(Ptr, IsWrite);
881 10429 : DepCands.insert(Access);
882 :
883 : // Memorize read-only pointers for later processing and skip them in
884 : // the first round (they need to be checked after we have seen all
885 : // write pointers). Note: we also mark pointers that are not
886 : // consecutive as "read-only" pointers (so that we check
887 : // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
888 10429 : if (!UseDeferred && IsReadOnlyPtr) {
889 3439 : DeferredAccesses.insert(Access);
890 3439 : continue;
891 : }
892 :
893 : // If this is a write - check other reads and writes for conflicts. If
894 : // this is a read only check other writes for conflicts (but only if
895 : // there is no other write to the ptr - this is an optimization to
896 : // catch "a[i] = a[i] + " without having to do a dependence check).
897 6990 : if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
898 4934 : CheckDeps.push_back(Access);
899 4934 : IsRTCheckAnalysisNeeded = true;
900 : }
901 :
902 6990 : if (IsWrite)
903 : SetHasWrite = true;
904 :
905 : // Create sets of pointers connected by a shared alias set and
906 : // underlying object.
907 : typedef SmallVector<Value *, 16> ValueVector;
908 : ValueVector TempObjects;
909 :
910 6990 : GetUnderlyingObjects(Ptr, TempObjects, DL, LI);
911 : LLVM_DEBUG(dbgs()
912 : << "Underlying objects for pointer " << *Ptr << "\n");
913 14246 : for (Value *UnderlyingObj : TempObjects) {
914 : // nullptr never aliases; don't join sets for pointers that have "null"
915 : // in their UnderlyingObjects list.
916 7339 : if (isa<ConstantPointerNull>(UnderlyingObj) &&
917 83 : !NullPointerIsDefined(
918 166 : TheLoop->getHeader()->getParent(),
919 : UnderlyingObj->getType()->getPointerAddressSpace()))
920 83 : continue;
921 :
922 : UnderlyingObjToAccessMap::iterator Prev =
923 7173 : ObjToLastAccess.find(UnderlyingObj);
924 7173 : if (Prev != ObjToLastAccess.end())
925 3449 : DepCands.unionSets(Access, Prev->second);
926 :
927 7173 : ObjToLastAccess[UnderlyingObj] = Access;
928 : LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n");
929 : }
930 : }
931 : }
932 : }
933 : }
934 1327 : }
935 :
936 : static bool isInBoundsGep(Value *Ptr) {
937 : if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
938 8867 : return GEP->isInBounds();
939 : return false;
940 : }
941 :
942 : /// Return true if an AddRec pointer \p Ptr is unsigned non-wrapping,
943 : /// i.e. monotonically increasing/decreasing.
944 1349 : static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
945 : PredicatedScalarEvolution &PSE, const Loop *L) {
946 : // FIXME: This should probably only return true for NUW.
947 1349 : if (AR->getNoWrapFlags(SCEV::NoWrapMask))
948 : return true;
949 :
950 : // Scalar evolution does not propagate the non-wrapping flags to values that
951 : // are derived from a non-wrapping induction variable because non-wrapping
952 : // could be flow-sensitive.
953 : //
954 : // Look through the potentially overflowing instruction to try to prove
955 : // non-wrapping for the *specific* value of Ptr.
956 :
957 : // The arithmetic implied by an inbounds GEP can't overflow.
958 : auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
959 255 : if (!GEP || !GEP->isInBounds())
960 37 : return false;
961 :
962 : // Make sure there is only one non-const index and analyze that.
963 : Value *NonConstIndex = nullptr;
964 617 : for (Value *Index : make_range(GEP->idx_begin(), GEP->idx_end()))
965 380 : if (!isa<ConstantInt>(Index)) {
966 237 : if (NonConstIndex)
967 : return false;
968 : NonConstIndex = Index;
969 : }
970 237 : if (!NonConstIndex)
971 : // The recurrence is on the pointer, ignore for now.
972 : return false;
973 :
974 : // The index in GEP is signed. It is non-wrapping if it's derived from a NSW
975 : // AddRec using a NSW operation.
976 : if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(NonConstIndex))
977 220 : if (OBO->hasNoSignedWrap() &&
978 : // Assume a constant for the other operand so that the AddRec can be
979 : // easily found.
980 66 : isa<ConstantInt>(OBO->getOperand(1))) {
981 33 : auto *OpScev = PSE.getSCEV(OBO->getOperand(0));
982 :
983 : if (auto *OpAR = dyn_cast<SCEVAddRecExpr>(OpScev))
984 33 : return OpAR->getLoop() == L && OpAR->getNoWrapFlags(SCEV::FlagNSW);
985 : }
986 :
987 : return false;
988 : }
989 :
990 : /// Check whether the access through \p Ptr has a constant stride.
991 19572 : int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr,
992 : const Loop *Lp, const ValueToValueMap &StridesMap,
993 : bool Assume, bool ShouldCheckWrap) {
994 19572 : Type *Ty = Ptr->getType();
995 : assert(Ty->isPointerTy() && "Unexpected non-ptr");
996 :
997 : // Make sure that the pointer does not point to aggregate types.
998 : auto *PtrTy = cast<PointerType>(Ty);
999 19572 : if (PtrTy->getElementType()->isAggregateType()) {
1000 : LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a pointer to a scalar type"
1001 : << *Ptr << "\n");
1002 : return 0;
1003 : }
1004 :
1005 19572 : const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
1006 :
1007 : const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
1008 19572 : if (Assume && !AR)
1009 6609 : AR = PSE.getAsAddRec(Ptr);
1010 :
1011 19572 : if (!AR) {
1012 : LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr
1013 : << " SCEV: " << *PtrScev << "\n");
1014 : return 0;
1015 : }
1016 :
1017 : // The access function must stride over the innermost loop.
1018 11164 : if (Lp != AR->getLoop()) {
1019 : LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop "
1020 : << *Ptr << " SCEV: " << *AR << "\n");
1021 : return 0;
1022 : }
1023 :
1024 : // The address calculation must not wrap. Otherwise, a dependence could be
1025 : // inverted.
1026 : // An inbounds getelementptr that is an AddRec with a unit stride
1027 : // cannot wrap per definition. The unit stride requirement is checked later.
1028 : // A getelementptr without an inbounds attribute and unit stride would have
1029 : // to access the pointer value "0" which is undefined behavior in address
1030 : // space 0, therefore we can also vectorize this case.
1031 : bool IsInBoundsGEP = isInBoundsGep(Ptr);
1032 1534 : bool IsNoWrapAddRec = !ShouldCheckWrap ||
1033 12452 : PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW) ||
1034 1349 : isNoWrapAddRec(Ptr, AR, PSE, Lp);
1035 11140 : if (!IsNoWrapAddRec && !IsInBoundsGEP &&
1036 37 : NullPointerIsDefined(Lp->getHeader()->getParent(),
1037 : PtrTy->getAddressSpace())) {
1038 3 : if (Assume) {
1039 2 : PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
1040 : IsNoWrapAddRec = true;
1041 : LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap in the address space:\n"
1042 : << "LAA: Pointer: " << *Ptr << "\n"
1043 : << "LAA: SCEV: " << *AR << "\n"
1044 : << "LAA: Added an overflow assumption\n");
1045 : } else {
1046 : LLVM_DEBUG(
1047 : dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
1048 : << *Ptr << " SCEV: " << *AR << "\n");
1049 : return 0;
1050 : }
1051 : }
1052 :
1053 : // Check the step is constant.
1054 11102 : const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
1055 :
1056 : // Calculate the pointer stride and check if it is constant.
1057 : const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
1058 : if (!C) {
1059 : LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr
1060 : << " SCEV: " << *AR << "\n");
1061 : return 0;
1062 : }
1063 :
1064 11081 : auto &DL = Lp->getHeader()->getModule()->getDataLayout();
1065 11081 : int64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
1066 : const APInt &APStepVal = C->getAPInt();
1067 :
1068 : // Huge step value - give up.
1069 11081 : if (APStepVal.getBitWidth() > 64)
1070 : return 0;
1071 :
1072 : int64_t StepVal = APStepVal.getSExtValue();
1073 :
1074 : // Strided access.
1075 11081 : int64_t Stride = StepVal / Size;
1076 11081 : int64_t Rem = StepVal % Size;
1077 11081 : if (Rem)
1078 : return 0;
1079 :
1080 : // If the SCEV could wrap but we have an inbounds gep with a unit stride we
1081 : // know we can't "wrap around the address space". In case of address space
1082 : // zero we know that this won't happen without triggering undefined behavior.
1083 11081 : if (!IsNoWrapAddRec && Stride != 1 && Stride != -1 &&
1084 28 : (IsInBoundsGEP || !NullPointerIsDefined(Lp->getHeader()->getParent(),
1085 : PtrTy->getAddressSpace()))) {
1086 63 : if (Assume) {
1087 : // We can avoid this case by adding a run-time check.
1088 : LLVM_DEBUG(dbgs() << "LAA: Non unit strided pointer which is not either "
1089 : << "inbouds or in address space 0 may wrap:\n"
1090 : << "inbounds or in address space 0 may wrap:\n"
1091 : << "LAA: SCEV: " << *AR << "\n"
1092 : << "LAA: Added an overflow assumption\n");
1093 54 : PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
1094 : } else
1095 : return 0;
1096 : }
1097 :
1098 : return Stride;
1099 : }
1100 :
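// Illustrative sketch only (hypothetical helper, not used by the analysis):
// how a client typically interprets the result of getPtrStride. A return
// value of 0 means "no usable constant stride"; +1 and -1 are the
// consecutive forward/reverse cases the vectorizers care about most.
static bool isUnitStridePtrSketch(PredicatedScalarEvolution &PSE, Value *Ptr,
                                  const Loop *Lp,
                                  const ValueToValueMap &Strides,
                                  bool &Reverse) {
  int64_t Stride = getPtrStride(PSE, Ptr, Lp, Strides);
  Reverse = (Stride == -1); // unit stride, but walking downwards
  return Stride == 1 || Stride == -1;
}
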
1101 103785 : bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
1102 : ScalarEvolution &SE,
1103 : SmallVectorImpl<unsigned> &SortedIndices) {
1104 : assert(llvm::all_of(
1105 : VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
1106 : "Expected list of pointer operands.");
1107 : SmallVector<std::pair<int64_t, Value *>, 4> OffValPairs;
1108 : OffValPairs.reserve(VL.size());
1109 :
1110 : // Walk over the pointers, and map each of them to an offset relative to
1111 : // the first pointer in the array.
1112 103785 : Value *Ptr0 = VL[0];
1113 103785 : const SCEV *Scev0 = SE.getSCEV(Ptr0);
1114 103785 : Value *Obj0 = GetUnderlyingObject(Ptr0, DL);
1115 :
1116 103785 : llvm::SmallSet<int64_t, 4> Offsets;
1117 327625 : for (auto *Ptr : VL) {
1118 : // TODO: Outline this code as a special, more time consuming, version of
1119 : // computeConstantDifference() function.
1120 451832 : if (Ptr->getType()->getPointerAddressSpace() !=
1121 225916 : Ptr0->getType()->getPointerAddressSpace())
1122 2076 : return false;
1123 : // If a pointer refers to a different underlying object, bail - the
1124 : // pointers are by definition incomparable.
1125 225916 : Value *CurrObj = GetUnderlyingObject(Ptr, DL);
1126 225916 : if (CurrObj != Obj0)
1127 : return false;
1128 :
1129 223977 : const SCEV *Scev = SE.getSCEV(Ptr);
1130 223977 : const auto *Diff = dyn_cast<SCEVConstant>(SE.getMinusSCEV(Scev, Scev0));
1131 : // The pointers may not have a constant offset from each other, or SCEV
1132 : // may just not be smart enough to figure out they do. Regardless,
1133 : // there's nothing we can do.
1134 : if (!Diff)
1135 : return false;
1136 :
1137 : // Check if the pointer with the same offset is found.
1138 223842 : int64_t Offset = Diff->getAPInt().getSExtValue();
1139 223842 : if (!Offsets.insert(Offset).second)
1140 : return false;
1141 223840 : OffValPairs.emplace_back(Offset, Ptr);
1142 : }
1143 : SortedIndices.clear();
1144 101709 : SortedIndices.resize(VL.size());
1145 : std::iota(SortedIndices.begin(), SortedIndices.end(), 0);
1146 :
1147 : // Sort the memory accesses and keep the order of their uses in UseOrder.
1148 : std::stable_sort(SortedIndices.begin(), SortedIndices.end(),
1149 : [&OffValPairs](unsigned Left, unsigned Right) {
1150 447070 : return OffValPairs[Left].first < OffValPairs[Right].first;
1151 : });
1152 :
1153 : // Check if the order is consecutive already.
1154 101709 : if (llvm::all_of(SortedIndices, [&SortedIndices](const unsigned I) {
1155 0 : return I == SortedIndices[I];
1156 : }))
1157 : SortedIndices.clear();
1158 :
1159 : return true;
1160 : }
1161 :
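// Illustrative sketch only (hypothetical usage, mirroring how a client such
// as the SLP vectorizer consumes this API): sort a bundle of pointer operands
// by constant offset from the first one. An empty SortedIndices on success
// means the bundle was already in increasing-offset order.
static void visitPtrsInOffsetOrderSketch(ArrayRef<Value *> Bundle,
                                         const DataLayout &DL,
                                         ScalarEvolution &SE) {
  SmallVector<unsigned, 4> SortedIndices;
  if (!sortPtrAccesses(Bundle, DL, SE, SortedIndices))
    return; // different objects/address spaces, or no constant offsets
  if (SortedIndices.empty())
    return; // already sorted; the identity permutation is not materialized
  for (unsigned Idx : SortedIndices)
    (void)Bundle[Idx]; // visit the pointers in increasing-offset order
}
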
1162 : /// Take the address space operand from the Load/Store instruction.
1163 : /// Returns -1 if this is not a valid Load/Store instruction.
1164 896512 : static unsigned getAddressSpaceOperand(Value *I) {
1165 : if (LoadInst *L = dyn_cast<LoadInst>(I))
1166 12426 : return L->getPointerAddressSpace();
1167 : if (StoreInst *S = dyn_cast<StoreInst>(I))
1168 884086 : return S->getPointerAddressSpace();
1169 : return -1;
1170 : }
1171 :
1172 : /// Returns true if the memory operations \p A and \p B are consecutive.
1173 448256 : bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
1174 : ScalarEvolution &SE, bool CheckType) {
1175 : Value *PtrA = getLoadStorePointerOperand(A);
1176 : Value *PtrB = getLoadStorePointerOperand(B);
1177 448256 : unsigned ASA = getAddressSpaceOperand(A);
1178 448256 : unsigned ASB = getAddressSpaceOperand(B);
1179 :
1180 : // Check that the address spaces match and that the pointers are valid.
1181 448256 : if (!PtrA || !PtrB || (ASA != ASB))
1182 : return false;
1183 :
1184 : // Make sure that A and B are different pointers.
1185 448246 : if (PtrA == PtrB)
1186 : return false;
1187 :
1188 : // Make sure that A and B have the same type if required.
1189 442766 : if (CheckType && PtrA->getType() != PtrB->getType())
1190 : return false;
1191 :
1192 : unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
1193 419955 : Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
1194 : APInt Size(IdxWidth, DL.getTypeStoreSize(Ty));
1195 :
1196 : APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
1197 : PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1198 : PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1199 :
1200 : // OffsetDelta = OffsetB - OffsetA;
1201 419955 : const SCEV *OffsetSCEVA = SE.getConstant(OffsetA);
1202 419955 : const SCEV *OffsetSCEVB = SE.getConstant(OffsetB);
1203 419955 : const SCEV *OffsetDeltaSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
1204 : const SCEVConstant *OffsetDeltaC = dyn_cast<SCEVConstant>(OffsetDeltaSCEV);
1205 : const APInt &OffsetDelta = OffsetDeltaC->getAPInt();
1206 : // Check if they are based on the same pointer. That makes the offsets
1207 : // sufficient.
1208 419955 : if (PtrA == PtrB)
1209 : return OffsetDelta == Size;
1210 :
1211 : // Compute the base pointer delta needed to make the final delta
1212 : // equal to the size.
1213 : // BaseDelta = Size - OffsetDelta;
1214 9336 : const SCEV *SizeSCEV = SE.getConstant(Size);
1215 9336 : const SCEV *BaseDelta = SE.getMinusSCEV(SizeSCEV, OffsetDeltaSCEV);
1216 :
1217 : // Otherwise compute the distance with SCEV between the base pointers.
1218 9336 : const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
1219 9336 : const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
1220 9336 : const SCEV *X = SE.getAddExpr(PtrSCEVA, BaseDelta);
1221 9336 : return X == PtrSCEVB;
1222 : }
1223 :
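// Worked example (illustration only): for two adjacent i32 stores
//   store i32 %x, i32* %a
//   %gep1 = getelementptr inbounds i32, i32* %a, i64 1
//   store i32 %y, i32* %gep1
// stripAndAccumulateInBoundsConstantOffsets rewrites both pointers to %a with
// OffsetA = 0 and OffsetB = 4. The stripped bases are equal, so the accesses
// are consecutive iff OffsetB - OffsetA equals the 4-byte store size, which
// holds here.
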
1224 5077 : bool MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
1225 : switch (Type) {
1226 : case NoDep:
1227 : case Forward:
1228 : case BackwardVectorizable:
1229 : return true;
1230 :
1231 : case Unknown:
1232 : case ForwardButPreventsForwarding:
1233 : case Backward:
1234 : case BackwardVectorizableButPreventsForwarding:
1235 : return false;
1236 : }
1237 0 : llvm_unreachable("unexpected DepType!");
1238 : }
1239 :
1240 123 : bool MemoryDepChecker::Dependence::isBackward() const {
1241 123 : switch (Type) {
1242 : case NoDep:
1243 : case Forward:
1244 : case ForwardButPreventsForwarding:
1245 : case Unknown:
1246 : return false;
1247 :
1248 51 : case BackwardVectorizable:
1249 : case Backward:
1250 : case BackwardVectorizableButPreventsForwarding:
1251 51 : return true;
1252 : }
1253 0 : llvm_unreachable("unexpected DepType!");
1254 : }
1255 :
1256 23 : bool MemoryDepChecker::Dependence::isPossiblyBackward() const {
1257 23 : return isBackward() || Type == Unknown;
1258 : }
1259 :
1260 0 : bool MemoryDepChecker::Dependence::isForward() const {
1261 0 : switch (Type) {
1262 : case Forward:
1263 : case ForwardButPreventsForwarding:
1264 : return true;
1265 :
1266 : case NoDep:
1267 : case Unknown:
1268 : case BackwardVectorizable:
1269 : case Backward:
1270 : case BackwardVectorizableButPreventsForwarding:
1271 : return false;
1272 : }
1273 0 : llvm_unreachable("unexpected DepType!");
1274 : }
1275 :
1276 47 : bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
1277 : uint64_t TypeByteSize) {
1278 : // If loads occur at a distance that is not a multiple of a feasible vector
1279 : // factor, store-load forwarding does not take place.
1280 : // Positive dependences might cause trouble because vectorizing them might
1281 : // prevent store-load forwarding making vectorized code run a lot slower.
1282 : // a[i] = a[i-3] ^ a[i-8];
1283 : // The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and
1284 : // hence on your typical architecture store-load forwarding does not take
1285 : // place. Vectorizing in such cases does not make sense.
1286 : // Store-load forwarding distance.
1287 :
1288 : // After this many iterations store-to-load forwarding conflicts should not
1289 : // cause any slowdowns.
1290 47 : const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
1291 : // Maximum vector factor.
1292 : uint64_t MaxVFWithoutSLForwardIssues = std::min(
1293 47 : VectorizerParams::MaxVectorWidth * TypeByteSize, MaxSafeDepDistBytes);
1294 :
1295 : // Compute the smallest VF at which the store and load would be misaligned.
1296 96 : for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues;
1297 49 : VF *= 2) {
1298 : // If the number of vector iterations between the store and the load is
1299 : // small, we could incur conflicts.
1300 89 : if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
1301 40 : MaxVFWithoutSLForwardIssues = (VF >>= 1);
1302 40 : break;
1303 : }
1304 : }
1305 :
1306 47 : if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
1307 : LLVM_DEBUG(
1308 : dbgs() << "LAA: Distance " << Distance
1309 : << " that could cause a store-load forwarding conflict\n");
1310 : return true;
1311 : }
1312 :
1313 25 : if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
1314 : MaxVFWithoutSLForwardIssues !=
1315 : VectorizerParams::MaxVectorWidth * TypeByteSize)
1316 18 : MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
1317 : return false;
1318 : }
1319 :
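// Worked example (illustration only), using the a[i] = a[i-3] ^ a[i-8] loop
// above with 4-byte elements: the a[i-3] dependence gives Distance = 12 and
// TypeByteSize = 4, so the forwarding window is 8 * 4 = 32. At VF = 8 bytes
// (two elements), 12 % 8 != 0 and 12 / 8 < 32, so the store and load are
// already misaligned; MaxVFWithoutSLForwardIssues shrinks to 4, which is
// below 2 * TypeByteSize, and the dependence is reported as a potential
// store-to-load forwarding conflict.
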
1320 : /// Given a non-constant (unknown) dependence-distance \p Dist between two
1321 : /// memory accesses that have the same stride whose absolute value is given
1322 : /// in \p Stride, and that have the same type size \p TypeByteSize,
1323 : /// in a loop whose backedge-taken count is \p BackedgeTakenCount, check if it is
1324 : /// possible to prove statically that the dependence distance is larger
1325 : /// than the range that the accesses will travel through the execution of
1326 : /// the loop. If so, return true; false otherwise. This is useful for
1327 : /// example in loops such as the following (PR31098):
1328 : /// for (i = 0; i < D; ++i) {
1329 : /// = out[i];
1330 : /// out[i+D] =
1331 : /// }
1332 50 : static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
1333 : const SCEV &BackedgeTakenCount,
1334 : const SCEV &Dist, uint64_t Stride,
1335 : uint64_t TypeByteSize) {
1336 :
1337 : // If we can prove that
1338 : // (**) |Dist| > BackedgeTakenCount * Step
1339 : // where Step is the absolute stride of the memory accesses in bytes,
1340 : // then there is no dependence.
1341 : //
1342 : // Rationale:
1343 : // We basically want to check if the absolute distance (|Dist/Step|)
1344 : // is >= the loop iteration count (or > BackedgeTakenCount).
1345 : // This is equivalent to the Strong SIV Test (Practical Dependence Testing,
1346 : // Section 4.2.1); Note, that for vectorization it is sufficient to prove
1347 : // that the dependence distance is >= VF; This is checked elsewhere.
1348 : // But in some cases we can prune unknown dependence distances early, and
1349 : // even before selecting the VF, and without a runtime test, by comparing
1350 : // the distance against the loop iteration count. Since the vectorized code
1351 : // will be executed only if LoopCount >= VF, proving distance >= LoopCount
1352 : // also guarantees that distance >= VF.
1353 : //
1354 50 : const uint64_t ByteStride = Stride * TypeByteSize;
1355 50 : const SCEV *Step = SE.getConstant(BackedgeTakenCount.getType(), ByteStride);
1356 50 : const SCEV *Product = SE.getMulExpr(&BackedgeTakenCount, Step);
1357 :
1358 : const SCEV *CastedDist = &Dist;
1359 : const SCEV *CastedProduct = Product;
1360 50 : uint64_t DistTypeSize = DL.getTypeAllocSize(Dist.getType());
1361 50 : uint64_t ProductTypeSize = DL.getTypeAllocSize(Product->getType());
1362 :
1363 : // The dependence distance can be positive/negative, so we sign extend Dist;
1364 : // The multiplication of the absolute stride in bytes and the
1365 : // backedgeTakenCount is non-negative, so we zero extend Product.
1366 50 : if (DistTypeSize > ProductTypeSize)
1367 0 : CastedProduct = SE.getZeroExtendExpr(Product, Dist.getType());
1368 : else
1369 50 : CastedDist = SE.getNoopOrSignExtend(&Dist, Product->getType());
1370 :
1371 : // Is Dist - (BackedgeTakenCount * Step) > 0 ?
1372 : // (If so, then we have proven (**) because |Dist| >= Dist)
1373 50 : const SCEV *Minus = SE.getMinusSCEV(CastedDist, CastedProduct);
1374 50 : if (SE.isKnownPositive(Minus))
1375 : return true;
1376 :
1377 : // Second try: Is -Dist - (BackedgeTakenCount * Step) > 0 ?
1378 : // (If so, then we have proven (**) because |Dist| >= -1*Dist)
1379 34 : const SCEV *NegDist = SE.getNegativeSCEV(CastedDist);
1380 34 : Minus = SE.getMinusSCEV(NegDist, CastedProduct);
1381 34 : if (SE.isKnownPositive(Minus))
1382 8 : return true;
1383 :
1384 : return false;
1385 : }
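// Worked example for the PR31098 loop above, assuming 4-byte elements: the
// accesses are out[i] and out[i+D], so Dist = 4*D, ByteStride = 4, and
// BackedgeTakenCount = D - 1. Then Dist - BackedgeTakenCount * Step
// = 4*D - 4*(D - 1) = 4, which is known positive, so the dependence can be
// discarded without a runtime check.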
1386 :
1387 : /// Check the dependence for two accesses with the same stride \p Stride.
1388 : /// \p Distance is the positive distance and \p TypeByteSize is the type size
1389 : /// in bytes.
1390 : ///
1391 : /// \returns true if they are independent.
1392 : static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
1393 : uint64_t TypeByteSize) {
1394 : assert(Stride > 1 && "The stride must be greater than 1");
1395 : assert(TypeByteSize > 0 && "The type size in bytes must be non-zero");
1396 : assert(Distance > 0 && "The distance must be non-zero");
1397 :
1398 : // Skip if the distance is not a multiple of the type byte size.
1399 167 : if (Distance % TypeByteSize)
1400 : return false;
1401 :
1402 155 : uint64_t ScaledDist = Distance / TypeByteSize;
1403 :
1404 : // No dependence if the scaled distance is not a multiple of the stride.
1405 : // E.g.
1406 : // for (i = 0; i < 1024 ; i += 4)
1407 : // A[i+2] = A[i] + 1;
1408 : //
1409 : // Two accesses in memory (scaled distance is 2, stride is 4):
1410 : // | A[0] | | | | A[4] | | | |
1411 : // | | | A[2] | | | | A[6] | |
1412 : //
1413 : // E.g.
1414 : // for (i = 0; i < 1024 ; i += 3)
1415 : // A[i+4] = A[i] + 1;
1416 : //
1417 : // Two accesses in memory (scaled distance is 4, stride is 3):
1418 : // | A[0] | | | A[3] | | | A[6] | | |
1419 : // | | | | | A[4] | | | A[7] | |
1420 155 : return ScaledDist % Stride;
1421 : }
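// A minimal usage sketch of the helper above, illustrative only and therefore
// kept out of the build; the expected results follow directly from the
// "ScaledDist % Stride" test with a 4-byte element type.
#if 0
static void exerciseStridedIndependence() {
  // Distance 8 bytes, stride 4 elements: ScaledDist = 2, 2 % 4 != 0 -> independent.
  assert(areStridedAccessesIndependent(/*Distance=*/8, /*Stride=*/4,
                                       /*TypeByteSize=*/4));
  // Distance 16 bytes, stride 3 elements: ScaledDist = 4, 4 % 3 != 0 -> independent.
  assert(areStridedAccessesIndependent(/*Distance=*/16, /*Stride=*/3,
                                       /*TypeByteSize=*/4));
  // Distance 16 bytes, stride 4 elements: ScaledDist = 4, 4 % 4 == 0 -> not proven.
  assert(!areStridedAccessesIndependent(/*Distance=*/16, /*Stride=*/4,
                                        /*TypeByteSize=*/4));
}
#endif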
1422 :
1423 : MemoryDepChecker::Dependence::DepType
1424 5077 : MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
1425 : const MemAccessInfo &B, unsigned BIdx,
1426 : const ValueToValueMap &Strides) {
1427 : assert (AIdx < BIdx && "Must pass arguments in program order");
1428 :
1429 5077 : Value *APtr = A.getPointer();
1430 5077 : Value *BPtr = B.getPointer();
1431 : bool AIsWrite = A.getInt();
1432 : bool BIsWrite = B.getInt();
1433 :
1434 : // Two reads are independent.
1435 5077 : if (!AIsWrite && !BIsWrite)
1436 : return Dependence::NoDep;
1437 :
1438 : // We cannot check pointers in different address spaces.
1439 7616 : if (APtr->getType()->getPointerAddressSpace() !=
1440 3808 : BPtr->getType()->getPointerAddressSpace())
1441 : return Dependence::Unknown;
1442 :
1443 3808 : int64_t StrideAPtr = getPtrStride(PSE, APtr, InnermostLoop, Strides, true);
1444 3808 : int64_t StrideBPtr = getPtrStride(PSE, BPtr, InnermostLoop, Strides, true);
1445 :
1446 3808 : const SCEV *Src = PSE.getSCEV(APtr);
1447 3808 : const SCEV *Sink = PSE.getSCEV(BPtr);
1448 :
1449 : // If the induction step is negative we have to invert source and sink of the
1450 : // dependence.
1451 3808 : if (StrideAPtr < 0) {
1452 : std::swap(APtr, BPtr);
1453 : std::swap(Src, Sink);
1454 : std::swap(AIsWrite, BIsWrite);
1455 : std::swap(AIdx, BIdx);
1456 : std::swap(StrideAPtr, StrideBPtr);
1457 : }
1458 :
1459 3808 : const SCEV *Dist = PSE.getSE()->getMinusSCEV(Sink, Src);
1460 :
1461 : LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
1462 : << "(Induction step: " << StrideAPtr << ")\n");
1463 : LLVM_DEBUG(dbgs() << "LAA: Distance for " << *InstMap[AIdx] << " to "
1464 : << *InstMap[BIdx] << ": " << *Dist << "\n");
1465 :
1466 : // Need accesses with constant stride. We don't want to vectorize
1467 : // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
1468 : // the address space.
1469 3808 : if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){
1470 : LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
1471 : return Dependence::Unknown;
1472 : }
1473 :
1474 542 : Type *ATy = APtr->getType()->getPointerElementType();
1475 542 : Type *BTy = BPtr->getType()->getPointerElementType();
1476 542 : auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
1477 542 : uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
1478 542 : uint64_t Stride = std::abs(StrideAPtr);
1479 : const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
1480 : if (!C) {
1481 100 : if (TypeByteSize == DL.getTypeAllocSize(BTy) &&
1482 50 : isSafeDependenceDistance(DL, *(PSE.getSE()),
1483 50 : *(PSE.getBackedgeTakenCount()), *Dist, Stride,
1484 : TypeByteSize))
1485 : return Dependence::NoDep;
1486 :
1487 : LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
1488 26 : ShouldRetryWithRuntimeCheck = true;
1489 26 : return Dependence::Unknown;
1490 : }
1491 :
1492 : const APInt &Val = C->getAPInt();
1493 : int64_t Distance = Val.getSExtValue();
1494 :
1495 : // Attempt to prove strided accesses independent.
1496 647 : if (std::abs(Distance) > 0 && Stride > 1 && ATy == BTy &&
1497 167 : areStridedAccessesIndependent(std::abs(Distance), Stride, TypeByteSize)) {
1498 : LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
1499 : return Dependence::NoDep;
1500 : }
1501 :
1502 : // Negative distances are not plausible dependencies.
1503 359 : if (Val.isNegative()) {
1504 94 : bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
1505 46 : if (IsTrueDataDependence && EnableForwardingConflictDetection &&
1506 146 : (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) ||
1507 : ATy != BTy)) {
1508 : LLVM_DEBUG(dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
1509 : return Dependence::ForwardButPreventsForwarding;
1510 : }
1511 :
1512 : LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n");
1513 77 : return Dependence::Forward;
1514 : }
1515 :
1516 : // Write to the same location with the same size.
1517 : // Could be improved to assert type sizes are the same (i32 == float, etc).
1518 265 : if (Val == 0) {
1519 113 : if (ATy == BTy)
1520 : return Dependence::Forward;
1521 : LLVM_DEBUG(
1522 : dbgs() << "LAA: Zero dependence difference but different types\n");
1523 0 : return Dependence::Unknown;
1524 : }
1525 :
1526 : assert(Val.isStrictlyPositive() && "Expect a positive value");
1527 :
1528 152 : if (ATy != BTy) {
1529 : LLVM_DEBUG(
1530 : dbgs()
1531 : << "LAA: ReadWrite-Write positive dependency with different types\n");
1532 : return Dependence::Unknown;
1533 : }
1534 :
1535 : // Bail out early if passed-in parameters make vectorization infeasible.
1536 150 : unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
1537 150 : VectorizerParams::VectorizationFactor : 1);
1538 150 : unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
1539 150 : VectorizerParams::VectorizationInterleave : 1);
1540 : // The minimum number of iterations for a vectorized/unrolled version.
1541 150 : unsigned MinNumIter = std::max(ForcedFactor * ForcedUnroll, 2U);
1542 :
1543 : // It's not vectorizable if the distance is smaller than the minimum distance
1544 : // needed for a vectorized/unrolled version. Each vectorized iteration except
1545 : // the last needs a distance of TypeByteSize * Stride; the last iteration
1546 : // needs TypeByteSize (no need to add the final gap distance).
1547 : //
1548 : // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
1549 : // foo(int *A) {
1550 : // int *B = (int *)((char *)A + 14);
1551 : // for (i = 0 ; i < 1024 ; i += 2)
1552 : // B[i] = A[i] + 1;
1553 : // }
1554 : //
1555 : // Two accesses in memory (stride is 2):
1556 : // | A[0] | | A[2] | | A[4] | | A[6] | |
1557 : // | B[0] | | B[2] | | B[4] |
1558 : //
1559 : // Distance needed for vectorizing iterations except the last iteration:
1560 : // 4 * 2 * (MinNumIter - 1). Distance needed for the last iteration: 4.
1561 : // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
1562 : //
1563 : // If MinNumIter is 2, it is vectorizable as the minimum distance needed is
1564 : // 12, which is less than the distance of 14.
1565 : //
1566 : // If MinNumIter is 4 (say, if a user forces the vectorization factor to be 4),
1567 : // the minimum distance needed is 28, which is greater than the distance. It is
1568 : // not safe to do vectorization.
1569 150 : uint64_t MinDistanceNeeded =
1570 150 : TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize;
1571 150 : if (MinDistanceNeeded > static_cast<uint64_t>(Distance)) {
1572 : LLVM_DEBUG(dbgs() << "LAA: Failure because of positive distance "
1573 : << Distance << '\n');
1574 : return Dependence::Backward;
1575 : }
1576 :
1577 : // Unsafe if the minimum distance needed is greater than max safe distance.
1578 47 : if (MinDistanceNeeded > MaxSafeDepDistBytes) {
1579 : LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
1580 : << MinDistanceNeeded << " size in bytes");
1581 : return Dependence::Backward;
1582 : }
1583 :
1584 : // Positive distance bigger than max vectorization factor.
1585 : // FIXME: Should use the max factor instead of the max distance in bytes,
1586 : // which cannot handle different types.
1587 : // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
1588 : // void foo (int *A, char *B) {
1589 : // for (unsigned i = 0; i < 1024; i++) {
1590 : // A[i+2] = A[i] + 1;
1591 : // B[i+2] = B[i] + 1;
1592 : // }
1593 : // }
1594 : //
1595 : // This case is currently unsafe according to the max safe distance. If we
1596 : // analyze the two accesses on array B first, the max safe dependence distance
1597 : // becomes 2 bytes. Then, when we analyze the accesses on array A, the minimum
1598 : // distance needed is 8 bytes, which exceeds 2 and thus forbids vectorization,
1599 : // even though both A and B could actually be vectorized with a VF of 2.
1600 47 : MaxSafeDepDistBytes =
1601 47 : std::min(static_cast<uint64_t>(Distance), MaxSafeDepDistBytes);
1602 :
1603 47 : bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
1604 50 : if (IsTrueDataDependence && EnableForwardingConflictDetection &&
1605 24 : couldPreventStoreLoadForward(Distance, TypeByteSize))
1606 : return Dependence::BackwardVectorizableButPreventsForwarding;
1607 :
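  // For example, with MaxSafeDepDistBytes = 16, TypeByteSize = 4 and
  // Stride = 1, at most MaxVF = 4 elements fit below the dependence distance,
  // i.e. a maximum safe register width of 4 * 4 * 8 = 128 bits.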
1608 42 : uint64_t MaxVF = MaxSafeDepDistBytes / (TypeByteSize * Stride);
1609 : LLVM_DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue()
1610 : << " with max VF = " << MaxVF << '\n');
1611 42 : uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
1612 42 : MaxSafeRegisterWidth = std::min(MaxSafeRegisterWidth, MaxVFInBits);
1613 42 : return Dependence::BackwardVectorizable;
1614 : }
1615 :
1616 872 : bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
1617 : MemAccessInfoList &CheckDeps,
1618 : const ValueToValueMap &Strides) {
1619 :
1620 872 : MaxSafeDepDistBytes = -1;
1621 : SmallPtrSet<MemAccessInfo, 8> Visited;
1622 3606 : for (MemAccessInfo CurAccess : CheckDeps) {
1623 2746 : if (Visited.count(CurAccess))
1624 : continue;
1625 :
1626 : // Get the relevant memory access set.
1627 : EquivalenceClasses<MemAccessInfo>::iterator I =
1628 1672 : AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
1629 :
1630 : // Check accesses within this set.
1631 : EquivalenceClasses<MemAccessInfo>::member_iterator AI =
1632 : AccessSets.member_begin(I);
1633 : EquivalenceClasses<MemAccessInfo>::member_iterator AE =
1634 : AccessSets.member_end();
1635 :
1636 : // Check every access pair.
1637 5002 : while (AI != AE) {
1638 3342 : Visited.insert(*AI);
1639 : EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI);
1640 6971 : while (OI != AE) {
1641 : // Check every accessing instruction pair in program order.
1642 3641 : for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
1643 7620 : I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
1644 : for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
1645 9056 : I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
1646 5077 : auto A = std::make_pair(&*AI, *I1);
1647 5077 : auto B = std::make_pair(&*OI, *I2);
1648 :
1649 : assert(*I1 != *I2);
1650 5077 : if (*I1 > *I2)
1651 : std::swap(A, B);
1652 :
1653 : Dependence::DepType Type =
1654 5077 : isDependent(*A.first, A.second, *B.first, B.second, Strides);
1655 5077 : SafeForVectorization &= Dependence::isSafeForVectorization(Type);
1656 :
1657 : // Gather dependences unless we accumulated MaxDependences
1658 : // dependences. In that case return as soon as we find the first
1659 : // unsafe dependence. This puts a limit on this quadratic
1660 : // algorithm.
1661 5077 : if (RecordDependences) {
1662 5077 : if (Type != Dependence::NoDep)
1663 7302 : Dependences.push_back(Dependence(A.second, B.second, Type));
1664 :
1665 10154 : if (Dependences.size() >= MaxDependences) {
1666 12 : RecordDependences = false;
1667 : Dependences.clear();
1668 : LLVM_DEBUG(dbgs()
1669 : << "Too many dependences, stopped recording\n");
1670 : }
1671 : }
1672 5077 : if (!RecordDependences && !SafeForVectorization)
1673 : return false;
1674 : }
1675 : ++OI;
1676 : }
1677 : AI++;
1678 : }
1679 : }
1680 :
1681 : LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
1682 860 : return SafeForVectorization;
1683 : }
1684 :
1685 : SmallVector<Instruction *, 4>
1686 112 : MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool isWrite) const {
1687 : MemAccessInfo Access(Ptr, isWrite);
1688 112 : auto &IndexVector = Accesses.find(Access)->second;
1689 :
1690 : SmallVector<Instruction *, 4> Insts;
1691 : transform(IndexVector,
1692 : std::back_inserter(Insts),
1693 224 : [&](unsigned Idx) { return this->InstMap[Idx]; });
1694 112 : return Insts;
1695 : }
1696 :
1697 : const char *MemoryDepChecker::Dependence::DepName[] = {
1698 : "NoDep", "Unknown", "Forward", "ForwardButPreventsForwarding", "Backward",
1699 : "BackwardVectorizable", "BackwardVectorizableButPreventsForwarding"};
1700 :
1701 78 : void MemoryDepChecker::Dependence::print(
1702 : raw_ostream &OS, unsigned Depth,
1703 : const SmallVectorImpl<Instruction *> &Instrs) const {
1704 78 : OS.indent(Depth) << DepName[Type] << ":\n";
1705 78 : OS.indent(Depth + 2) << *Instrs[Source] << " -> \n";
1706 78 : OS.indent(Depth + 2) << *Instrs[Destination] << "\n";
1707 78 : }
1708 :
1709 5128 : bool LoopAccessInfo::canAnalyzeLoop() {
1710 : // We need to have a loop header.
1711 : LLVM_DEBUG(dbgs() << "LAA: Found a loop in "
1712 : << TheLoop->getHeader()->getParent()->getName() << ": "
1713 : << TheLoop->getHeader()->getName() << '\n');
1714 :
1715 : // We can only analyze innermost loops.
1716 10256 : if (!TheLoop->empty()) {
1717 : LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
1718 11 : recordAnalysis("NotInnerMostLoop") << "loop is not the innermost loop";
1719 11 : return false;
1720 : }
1721 :
1722 : // We must have a single backedge.
1723 5117 : if (TheLoop->getNumBackEdges() != 1) {
1724 : LLVM_DEBUG(
1725 : dbgs() << "LAA: loop control flow is not understood by analyzer\n");
1726 : recordAnalysis("CFGNotUnderstood")
1727 0 : << "loop control flow is not understood by analyzer";
1728 0 : return false;
1729 : }
1730 :
1731 : // We must have a single exiting block.
1732 5117 : if (!TheLoop->getExitingBlock()) {
1733 : LLVM_DEBUG(
1734 : dbgs() << "LAA: loop control flow is not understood by analyzer\n");
1735 : recordAnalysis("CFGNotUnderstood")
1736 1592 : << "loop control flow is not understood by analyzer";
1737 1592 : return false;
1738 : }
1739 :
1740 : // We only handle bottom-tested loops, i.e. loops in which the condition is
1741 : // checked at the end of each iteration. With that we can assume that all
1742 : // instructions in the loop are executed the same number of times.
1743 3525 : if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
1744 : LLVM_DEBUG(
1745 : dbgs() << "LAA: loop control flow is not understood by analyzer\n");
1746 : recordAnalysis("CFGNotUnderstood")
1747 127 : << "loop control flow is not understood by analyzer";
1748 127 : return false;
1749 : }
1750 :
1751 : // ScalarEvolution needs to be able to find the exit count.
1752 3398 : const SCEV *ExitCount = PSE->getBackedgeTakenCount();
1753 3398 : if (ExitCount == PSE->getSE()->getCouldNotCompute()) {
1754 : recordAnalysis("CantComputeNumberOfIterations")
1755 1108 : << "could not determine number of loop iterations";
1756 : LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
1757 1108 : return false;
1758 : }
1759 :
1760 : return true;
1761 : }
1762 :
1763 2290 : void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
1764 : const TargetLibraryInfo *TLI,
1765 : DominatorTree *DT) {
1766 : typedef SmallPtrSet<Value*, 16> ValueSet;
1767 :
1768 : // Holds the Load and Store instructions.
1769 : SmallVector<LoadInst *, 16> Loads;
1770 : SmallVector<StoreInst *, 16> Stores;
1771 :
1772 : // Holds all the different accesses in the loop.
1773 : unsigned NumReads = 0;
1774 : unsigned NumReadWrites = 0;
1775 :
1776 : PtrRtChecking->Pointers.clear();
1777 2290 : PtrRtChecking->Need = false;
1778 :
1779 2290 : const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
1780 :
1781 : // For each block.
1782 5221 : for (BasicBlock *BB : TheLoop->blocks()) {
1783 : // Scan the BB and collect legal loads and stores.
1784 40425 : for (Instruction &I : *BB) {
1785 : // If this is a load, save it. If this instruction can read from memory
1786 : // but is not a load, then we quit. Notice that we don't handle function
1787 : // calls that read or write.
1788 37494 : if (I.mayReadFromMemory()) {
1789 : // Many math library functions read the rounding mode. We will only
1790 : // vectorize a loop if it contains known function calls that don't set
1791 : // the flag. Therefore, it is safe to ignore this read from memory.
1792 : auto *Call = dyn_cast<CallInst>(&I);
1793 385 : if (Call && getVectorIntrinsicIDForCall(Call, TLI))
1794 5240 : continue;
1795 :
1796 : // If the function has an explicit vectorized counterpart, we can safely
1797 : // assume that it can be vectorized.
1798 5569 : if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
1799 308 : TLI->isFunctionVectorizable(Call->getCalledFunction()->getName()))
1800 : continue;
1801 :
1802 5563 : auto *Ld = dyn_cast<LoadInst>(&I);
1803 5567 : if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
1804 : recordAnalysis("NonSimpleLoad", Ld)
1805 714 : << "read with atomic ordering or volatile read";
1806 : LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
1807 357 : CanVecMem = false;
1808 357 : return;
1809 : }
1810 5206 : NumLoads++;
1811 5206 : Loads.push_back(Ld);
1812 5206 : DepChecker->addAccess(Ld);
1813 5206 : if (EnableMemAccessVersioning)
1814 5198 : collectStridedAccess(Ld);
1815 5206 : continue;
1816 : }
1817 :
1818 : // Save 'store' instructions. Abort if other instructions write to memory.
1819 31897 : if (I.mayWriteToMemory()) {
1820 4260 : auto *St = dyn_cast<StoreInst>(&I);
1821 4260 : if (!St) {
1822 : recordAnalysis("CantVectorizeInstruction", St)
1823 0 : << "instruction cannot be vectorized";
1824 0 : CanVecMem = false;
1825 0 : return;
1826 : }
1827 0 : if (!St->isSimple() && !IsAnnotatedParallel) {
1828 : recordAnalysis("NonSimpleStore", St)
1829 0 : << "write with atomic ordering or volatile write";
1830 : LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
1831 0 : CanVecMem = false;
1832 0 : return;
1833 : }
1834 4260 : NumStores++;
1835 4260 : Stores.push_back(St);
1836 4260 : DepChecker->addAccess(St);
1837 4260 : if (EnableMemAccessVersioning)
1838 4256 : collectStridedAccess(St);
1839 : }
1840 : } // Next instr.
1841 : } // Next block.
1842 :
1843 : // Now we have two lists that hold the loads and the stores.
1844 : // Next, we find the pointers that they use.
1845 :
1846 : // Check if we see any stores. If there are no stores, then we don't
1847 : // care if the pointers are *restrict*.
1848 1933 : if (!Stores.size()) {
1849 : LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
1850 345 : CanVecMem = true;
1851 345 : return;
1852 : }
1853 :
1854 : MemoryDepChecker::DepCandidates DependentAccesses;
1855 : AccessAnalysis Accesses(TheLoop->getHeader()->getModule()->getDataLayout(),
1856 2778 : TheLoop, AA, LI, DependentAccesses, *PSE);
1857 :
1858 : // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
1859 : // multiple times on the same object. If the ptr is accessed twice, once
1860 : // for read and once for write, it will only appear once (on the write
1861 : // list). This is okay, since we are going to check for conflicts between
1862 : // writes and between reads and writes, but not between reads and reads.
1863 : ValueSet Seen;
1864 :
1865 : // Record uniform store addresses to identify if we have multiple stores
1866 : // to the same address.
1867 : ValueSet UniformStores;
1868 :
1869 5501 : for (StoreInst *ST : Stores) {
1870 : Value *Ptr = ST->getPointerOperand();
1871 :
1872 3913 : if (isUniform(Ptr))
1873 1962 : HasMultipleStoresToLoopInvariantAddress |=
1874 1962 : !UniformStores.insert(Ptr).second;
1875 :
1876 : // If we did *not* see this pointer before, insert it to the read-write
1877 : // If we did *not* see this pointer before, insert it into the read-write
1878 3913 : if (Seen.insert(Ptr).second) {
1879 3676 : ++NumReadWrites;
1880 :
1881 3676 : MemoryLocation Loc = MemoryLocation::get(ST);
1882 : // The TBAA metadata could have a control dependency on the predication
1883 : // condition, so we cannot rely on it when determining whether or not we
1884 : // need runtime pointer checks.
1885 3676 : if (blockNeedsPredication(ST->getParent(), TheLoop, DT))
1886 475 : Loc.AATags.TBAA = nullptr;
1887 :
1888 3676 : Accesses.addStore(Loc);
1889 : }
1890 : }
1891 :
1892 1588 : if (IsAnnotatedParallel) {
1893 : LLVM_DEBUG(
1894 : dbgs() << "LAA: A loop annotated parallel, ignore memory dependency "
1895 : << "checks.\n");
1896 9 : CanVecMem = true;
1897 9 : return;
1898 : }
1899 :
1900 5482 : for (LoadInst *LD : Loads) {
1901 : Value *Ptr = LD->getPointerOperand();
1902 : // If we did *not* see this pointer before, insert it into the
1903 : // read list. If we *did* see it before, then it is already in
1904 : // the read-write list. This allows us to vectorize expressions
1905 : // such as A[i] += x, because the address of A[i] is a read-write
1906 : // pointer. This only works if the index of A[i] is consecutive.
1907 : // If the index is unknown (for example A[B[i]]), then we may
1908 : // read a few words, modify, and write a few words, and some of the
1909 : // words may be written to the same address.
1910 : bool IsReadOnlyPtr = false;
1911 6008 : if (Seen.insert(Ptr).second ||
1912 4210 : !getPtrStride(*PSE, Ptr, TheLoop, SymbolicStrides)) {
1913 3659 : ++NumReads;
1914 : IsReadOnlyPtr = true;
1915 : }
1916 :
1917 3903 : MemoryLocation Loc = MemoryLocation::get(LD);
1918 : // The TBAA metadata could have a control dependency on the predication
1919 : // condition, so we cannot rely on it when determining whether or not we
1920 : // need runtime pointer checks.
1921 3903 : if (blockNeedsPredication(LD->getParent(), TheLoop, DT))
1922 477 : Loc.AATags.TBAA = nullptr;
1923 :
1924 3903 : Accesses.addLoad(Loc, IsReadOnlyPtr);
1925 : }
1926 :
1927 : // If we write (or read-write) to a single destination and there are no
1928 : // other reads in this loop, then it is safe to vectorize.
1929 1579 : if (NumReadWrites == 1 && NumReads == 0) {
1930 : LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
1931 252 : CanVecMem = true;
1932 252 : return;
1933 : }
1934 :
1935 : // Build dependence sets and check whether we need a runtime pointer bounds
1936 : // check.
1937 : Accesses.buildDependenceSets();
1938 :
1939 : // Find pointers with computable bounds. We are going to use this information
1940 : // to place a runtime bound check.
1941 2654 : bool CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(*PtrRtChecking, PSE->getSE(),
1942 1327 : TheLoop, SymbolicStrides);
1943 1327 : if (!CanDoRTIfNeeded) {
1944 137 : recordAnalysis("CantIdentifyArrayBounds") << "cannot identify array bounds";
1945 : LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
1946 : << "the array bounds.\n");
1947 137 : CanVecMem = false;
1948 137 : return;
1949 : }
1950 :
1951 : LLVM_DEBUG(
1952 : dbgs() << "LAA: We can perform a memory runtime check if needed.\n");
1953 :
1954 1190 : CanVecMem = true;
1955 1190 : if (Accesses.isDependencyCheckNeeded()) {
1956 : LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
1957 872 : CanVecMem = DepChecker->areDepsSafe(
1958 : DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides);
1959 872 : MaxSafeDepDistBytes = DepChecker->getMaxSafeDepDistBytes();
1960 :
1961 872 : if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) {
1962 : LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
1963 :
1964 : // Clear the dependency checks. We assume they are not needed.
1965 : Accesses.resetDepChecks(*DepChecker);
1966 :
1967 : PtrRtChecking->reset();
1968 20 : PtrRtChecking->Need = true;
1969 :
1970 20 : auto *SE = PSE->getSE();
1971 40 : CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(*PtrRtChecking, SE, TheLoop,
1972 : SymbolicStrides, true);
1973 :
1974 : // Check that we found the bounds for the pointer.
1975 20 : if (!CanDoRTIfNeeded) {
1976 : recordAnalysis("CantCheckMemDepsAtRunTime")
1977 0 : << "cannot check memory dependencies at runtime";
1978 : LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
1979 0 : CanVecMem = false;
1980 0 : return;
1981 : }
1982 :
1983 20 : CanVecMem = true;
1984 : }
1985 : }
1986 :
1987 1190 : if (CanVecMem)
1988 : LLVM_DEBUG(
1989 : dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
1990 : << (PtrRtChecking->Need ? "" : " don't")
1991 : << " need runtime memory checks.\n");
1992 : else {
1993 : recordAnalysis("UnsafeMemDep")
1994 : << "unsafe dependent memory operations in loop. Use "
1995 : "#pragma loop distribute(enable) to allow loop distribution "
1996 : "to attempt to isolate the offending operations into a separate "
1997 688 : "loop";
1998 : LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");
1999 : }
2000 : }
2001 :
2002 35834 : bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop,
2003 : DominatorTree *DT) {
2004 : assert(TheLoop->contains(BB) && "Unknown block used");
2005 :
2006 : // Blocks that do not dominate the latch need predication.
2007 35834 : BasicBlock* Latch = TheLoop->getLoopLatch();
2008 35834 : return !DT->dominates(BB, Latch);
2009 : }
2010 :
2011 3676 : OptimizationRemarkAnalysis &LoopAccessInfo::recordAnalysis(StringRef RemarkName,
2012 : Instruction *I) {
2013 : assert(!Report && "Multiple reports generated");
2014 :
2015 3676 : Value *CodeRegion = TheLoop->getHeader();
2016 3676 : DebugLoc DL = TheLoop->getStartLoc();
2017 :
2018 3676 : if (I) {
2019 4 : CodeRegion = I->getParent();
2020 : // If there is no debug location attached to the instruction, fall back to
2021 : // using the loop's.
2022 4 : if (I->getDebugLoc())
2023 : DL = I->getDebugLoc();
2024 : }
2025 :
2026 7352 : Report = make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, RemarkName, DL,
2027 : CodeRegion);
2028 3676 : return *Report;
2029 : }
2030 :
2031 8233 : bool LoopAccessInfo::isUniform(Value *V) const {
2032 8233 : auto *SE = PSE->getSE();
2033 : // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
2034 : // never considered uniform.
2035 : // TODO: Is this really what we want? Even without FP SCEV, we may want some
2036 : // trivially loop-invariant FP values to be considered uniform.
2037 8233 : if (!SE->isSCEVable(V->getType()))
2038 : return false;
2039 7706 : return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
2040 : }
2041 :
2042 : // FIXME: this function is currently a duplicate of the one in
2043 : // LoopVectorize.cpp.
2044 : static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
2045 : Instruction *Loc) {
2046 1436 : if (FirstInst)
2047 : return FirstInst;
2048 : if (Instruction *I = dyn_cast<Instruction>(V))
2049 227 : return I->getParent() == Loc->getParent() ? I : nullptr;
2050 : return nullptr;
2051 : }
2052 :
2053 : namespace {
2054 :
2055 : /// IR Values for the lower and upper bounds of a pointer evolution. We
2056 : /// need to use value-handles because SCEV expansion can invalidate previously
2057 : /// expanded values. Thus expansion of a pointer can invalidate the bounds for
2058 : /// a previous one.
2059 : struct PointerBounds {
2060 : TrackingVH<Value> Start;
2061 : TrackingVH<Value> End;
2062 : };
2063 :
2064 : } // end anonymous namespace
2065 :
2066 : /// Expand code for the lower and upper bound of the pointer group \p CG
2067 : /// in \p TheLoop. \return the values for the bounds.
2068 : static PointerBounds
2069 718 : expandBounds(const RuntimePointerChecking::CheckingPtrGroup *CG, Loop *TheLoop,
2070 : Instruction *Loc, SCEVExpander &Exp, ScalarEvolution *SE,
2071 : const RuntimePointerChecking &PtrRtChecking) {
2072 718 : Value *Ptr = PtrRtChecking.Pointers[CG->Members[0]].PointerValue;
2073 718 : const SCEV *Sc = SE->getSCEV(Ptr);
2074 :
2075 718 : unsigned AS = Ptr->getType()->getPointerAddressSpace();
2076 718 : LLVMContext &Ctx = Loc->getContext();
2077 :
2078 : // Use this type for pointer arithmetic.
2079 718 : Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
2080 :
2081 718 : if (SE->isLoopInvariant(Sc, TheLoop)) {
2082 : LLVM_DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:"
2083 : << *Ptr << "\n");
2084 : // Ptr could be in the loop body. If so, expand a new one at the correct
2085 : // location.
2086 : Instruction *Inst = dyn_cast<Instruction>(Ptr);
2087 8 : Value *NewPtr = (Inst && TheLoop->contains(Inst))
2088 4 : ? Exp.expandCodeFor(Sc, PtrArithTy, Loc)
2089 : : Ptr;
2090 : // We must return a half-open range, which means incrementing Sc.
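    // E.g. a loop-invariant pointer P is checked against the half-open byte
    // range [P, P + 1): Start expands to P and End to P + 1 in the i8*
    // arithmetic type used here.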
2091 29 : const SCEV *ScPlusOne = SE->getAddExpr(Sc, SE->getOne(PtrArithTy));
2092 29 : Value *NewPtrPlusOne = Exp.expandCodeFor(ScPlusOne, PtrArithTy, Loc);
2093 : return {NewPtr, NewPtrPlusOne};
2094 : } else {
2095 : Value *Start = nullptr, *End = nullptr;
2096 : LLVM_DEBUG(dbgs() << "LAA: Adding RT check for range:\n");
2097 689 : Start = Exp.expandCodeFor(CG->Low, PtrArithTy, Loc);
2098 689 : End = Exp.expandCodeFor(CG->High, PtrArithTy, Loc);
2099 : LLVM_DEBUG(dbgs() << "Start: " << *CG->Low << " End: " << *CG->High
2100 : << "\n");
2101 : return {Start, End};
2102 : }
2103 : }
2104 :
2105 : /// Turns a collection of checks into a collection of expanded upper and
2106 : /// lower bounds for both pointers in the check.
2107 238 : static SmallVector<std::pair<PointerBounds, PointerBounds>, 4> expandBounds(
2108 : const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &PointerChecks,
2109 : Loop *L, Instruction *Loc, ScalarEvolution *SE, SCEVExpander &Exp,
2110 : const RuntimePointerChecking &PtrRtChecking) {
2111 : SmallVector<std::pair<PointerBounds, PointerBounds>, 4> ChecksWithBounds;
2112 :
2113 : // Here we're relying on the SCEV Expander's cache to only emit code for the
2114 : // same bounds once.
2115 : transform(
2116 : PointerChecks, std::back_inserter(ChecksWithBounds),
2117 : [&](const RuntimePointerChecking::PointerCheck &Check) {
2118 : PointerBounds
2119 : First = expandBounds(Check.first, L, Loc, Exp, SE, PtrRtChecking),
2120 : Second = expandBounds(Check.second, L, Loc, Exp, SE, PtrRtChecking);
2121 : return std::make_pair(First, Second);
2122 238 : });
2123 :
2124 238 : return ChecksWithBounds;
2125 : }
2126 :
2127 238 : std::pair<Instruction *, Instruction *> LoopAccessInfo::addRuntimeChecks(
2128 : Instruction *Loc,
2129 : const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &PointerChecks)
2130 : const {
2131 238 : const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
2132 238 : auto *SE = PSE->getSE();
2133 476 : SCEVExpander Exp(*SE, DL, "induction");
2134 : auto ExpandedChecks =
2135 476 : expandBounds(PointerChecks, TheLoop, Loc, SE, Exp, *PtrRtChecking);
2136 :
2137 238 : LLVMContext &Ctx = Loc->getContext();
2138 : Instruction *FirstInst = nullptr;
2139 238 : IRBuilder<> ChkBuilder(Loc);
2140 : // Our instructions might fold to a constant.
2141 : Value *MemoryRuntimeCheck = nullptr;
2142 :
2143 597 : for (const auto &Check : ExpandedChecks) {
2144 : const PointerBounds &A = Check.first, &B = Check.second;
2145 : // Check if two pointers (A and B) conflict where conflict is computed as:
2146 : // start(A) < end(B) && start(B) < end(A)
2147 359 : unsigned AS0 = A.Start->getType()->getPointerAddressSpace();
2148 359 : unsigned AS1 = B.Start->getType()->getPointerAddressSpace();
2149 :
2150 : assert((AS0 == B.End->getType()->getPointerAddressSpace()) &&
2151 : (AS1 == A.End->getType()->getPointerAddressSpace()) &&
2152 : "Trying to bounds check pointers with different address spaces");
2153 :
2154 359 : Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
2155 359 : Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
2156 :
2157 359 : Value *Start0 = ChkBuilder.CreateBitCast(A.Start, PtrArithTy0, "bc");
2158 359 : Value *Start1 = ChkBuilder.CreateBitCast(B.Start, PtrArithTy1, "bc");
2159 359 : Value *End0 = ChkBuilder.CreateBitCast(A.End, PtrArithTy1, "bc");
2160 359 : Value *End1 = ChkBuilder.CreateBitCast(B.End, PtrArithTy0, "bc");
2161 :
2162 : // [A|B].Start points to the first accessed byte under base [A|B].
2163 : // [A|B].End points to the last accessed byte, plus one.
2164 : // There is no conflict when the intervals are disjoint:
2165 : // NoConflict = (B.Start >= A.End) || (A.Start >= B.End)
2166 : //
2167 : // bound0 = (B.Start < A.End)
2168 : // bound1 = (A.Start < B.End)
2169 : // IsConflict = bound0 & bound1
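    // E.g. the byte ranges A = [0x100, 0x110) and B = [0x108, 0x118)
    // conflict: bound0 = (0x108 < 0x110) and bound1 = (0x100 < 0x118) are
    // both true. Disjoint ranges such as [0x100, 0x108) and [0x108, 0x110)
    // leave one bound false, so no conflict is reported.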
2170 359 : Value *Cmp0 = ChkBuilder.CreateICmpULT(Start0, End1, "bound0");
2171 : FirstInst = getFirstInst(FirstInst, Cmp0, Loc);
2172 359 : Value *Cmp1 = ChkBuilder.CreateICmpULT(Start1, End0, "bound1");
2173 : FirstInst = getFirstInst(FirstInst, Cmp1, Loc);
2174 359 : Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
2175 : FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
2176 359 : if (MemoryRuntimeCheck) {
2177 : IsConflict =
2178 132 : ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
2179 : FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
2180 : }
2181 : MemoryRuntimeCheck = IsConflict;
2182 : }
2183 :
2184 238 : if (!MemoryRuntimeCheck)
2185 11 : return std::make_pair(nullptr, nullptr);
2186 :
2187 : // We have to do this trickery because the IRBuilder might fold the check to a
2188 : // constant expression, in which case there is no Instruction anchored in
2189 : // the block.
2190 227 : Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
2191 227 : ConstantInt::getTrue(Ctx));
2192 227 : ChkBuilder.Insert(Check, "memcheck.conflict");
2193 : FirstInst = getFirstInst(FirstInst, Check, Loc);
2194 : return std::make_pair(FirstInst, Check);
2195 : }
2196 :
2197 : std::pair<Instruction *, Instruction *>
2198 906 : LoopAccessInfo::addRuntimeChecks(Instruction *Loc) const {
2199 906 : if (!PtrRtChecking->Need)
2200 708 : return std::make_pair(nullptr, nullptr);
2201 :
2202 198 : return addRuntimeChecks(Loc, PtrRtChecking->getChecks());
2203 : }
2204 :
2205 9454 : void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
2206 : Value *Ptr = nullptr;
2207 : if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
2208 : Ptr = LI->getPointerOperand();
2209 : else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess))
2210 : Ptr = SI->getPointerOperand();
2211 : else
2212 : return;
2213 :
2214 18908 : Value *Stride = getStrideFromPointer(Ptr, PSE->getSE(), TheLoop);
2215 9454 : if (!Stride)
2216 : return;
2217 :
2218 : LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
2219 : "versioning:");
2220 : LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
2221 :
2222 : // Avoid adding the "Stride == 1" predicate when we know that
2223 : // Stride >= Trip-Count. Such a predicate would only cover a single- or
2224 : // zero-iteration loop, as Trip-Count <= Stride == 1.
2225 : //
2226 : // TODO: We are currently not making a very informed decision on when it is
2227 : // beneficial to apply stride versioning. It might make more sense that the
2228 : // users of this analysis (such as the vectorizer) will trigger it, based on
2229 : // their specific cost considerations; For example, in cases where stride
2230 : // versioning does not help resolving memory accesses/dependences, the
2231 : // vectorizer should evaluate the cost of the runtime test, and the benefit
2232 : // of various possible stride specializations, considering the alternatives
2233 : // of using gather/scatters (if available).
2234 :
2235 15 : const SCEV *StrideExpr = PSE->getSCEV(Stride);
2236 15 : const SCEV *BETakenCount = PSE->getBackedgeTakenCount();
2237 :
2238 : // Match the types so we can compare the stride and the BETakenCount.
2239 : // The Stride can be positive/negative, so we sign extend Stride;
2240 : // The backdgeTakenCount is non-negative, so we zero extend BETakenCount.
2241 15 : const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
2242 15 : uint64_t StrideTypeSize = DL.getTypeAllocSize(StrideExpr->getType());
2243 15 : uint64_t BETypeSize = DL.getTypeAllocSize(BETakenCount->getType());
2244 : const SCEV *CastedStride = StrideExpr;
2245 : const SCEV *CastedBECount = BETakenCount;
2246 15 : ScalarEvolution *SE = PSE->getSE();
2247 15 : if (BETypeSize >= StrideTypeSize)
2248 13 : CastedStride = SE->getNoopOrSignExtend(StrideExpr, BETakenCount->getType());
2249 : else
2250 2 : CastedBECount = SE->getZeroExtendExpr(BETakenCount, StrideExpr->getType());
2251 15 : const SCEV *StrideMinusBETaken = SE->getMinusSCEV(CastedStride, CastedBECount);
2252 : // Since TripCount == BackEdgeTakenCount + 1, checking:
2253 : // "Stride >= TripCount" is equivalent to checking:
2254 : // Stride - BETakenCount > 0
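  // E.g. if the loop is known to execute at most Stride iterations
  // (TripCount <= Stride, i.e. BETakenCount <= Stride - 1), then
  // Stride - BETakenCount >= 1 is known positive, and a "Stride == 1"
  // predicate could only hold for a loop that runs at most once, so
  // versioning is skipped.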
2255 15 : if (SE->isKnownPositive(StrideMinusBETaken)) {
2256 : LLVM_DEBUG(
2257 : dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
2258 : "Stride==1 predicate will imply that the loop executes "
2259 : "at most once.\n");
2260 : return;
2261 : }
2262 : LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.");
2263 :
2264 13 : SymbolicStrides[Ptr] = Stride;
2265 13 : StrideSet.insert(Stride);
2266 : }
2267 :
2268 5128 : LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
2269 : const TargetLibraryInfo *TLI, AliasAnalysis *AA,
2270 5128 : DominatorTree *DT, LoopInfo *LI)
2271 : : PSE(llvm::make_unique<PredicatedScalarEvolution>(*SE, *L)),
2272 : PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
2273 : DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
2274 : NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
2275 5128 : HasMultipleStoresToLoopInvariantAddress(false) {
2276 5128 : if (canAnalyzeLoop())
2277 2290 : analyzeLoop(AA, LI, TLI, DT);
2278 5128 : }
2279 :
2280 105 : void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
2281 105 : if (CanVecMem) {
2282 66 : OS.indent(Depth) << "Memory dependences are safe";
2283 66 : if (MaxSafeDepDistBytes != -1ULL)
2284 12 : OS << " with a maximum dependence distance of " << MaxSafeDepDistBytes
2285 12 : << " bytes";
2286 66 : if (PtrRtChecking->Need)
2287 26 : OS << " with run-time checks";
2288 66 : OS << "\n";
2289 : }
2290 :
2291 105 : if (Report)
2292 117 : OS.indent(Depth) << "Report: " << Report->getMsg() << "\n";
2293 :
2294 : if (auto *Dependences = DepChecker->getDependences()) {
2295 105 : OS.indent(Depth) << "Dependences:\n";
2296 183 : for (auto &Dep : *Dependences) {
2297 78 : Dep.print(OS, Depth + 2, DepChecker->getMemoryInstructions());
2298 78 : OS << "\n";
2299 : }
2300 : } else
2301 0 : OS.indent(Depth) << "Too many dependences, not recorded\n";
2302 :
2303 : // List the pairs of accesses that need run-time checks to prove independence.
2304 105 : PtrRtChecking->print(OS, Depth);
2305 105 : OS << "\n";
2306 :
2307 105 : OS.indent(Depth) << "Multiple stores to invariant address were "
2308 208 : << (HasMultipleStoresToLoopInvariantAddress ? "" : "not ")
2309 105 : << "found in loop.\n";
2310 :
2311 105 : OS.indent(Depth) << "SCEV assumptions:\n";
2312 105 : PSE->getUnionPredicate().print(OS, Depth);
2313 :
2314 105 : OS << "\n";
2315 :
2316 105 : OS.indent(Depth) << "Expressions re-written:\n";
2317 105 : PSE->print(OS, Depth);
2318 105 : }
2319 :
2320 4547 : const LoopAccessInfo &LoopAccessLegacyAnalysis::getInfo(Loop *L) {
2321 4547 : auto &LAI = LoopAccessInfoMap[L];
2322 :
2323 4547 : if (!LAI)
2324 9094 : LAI = llvm::make_unique<LoopAccessInfo>(L, SE, TLI, AA, DT, LI);
2325 :
2326 4547 : return *LAI.get();
2327 : }
2328 :
2329 48 : void LoopAccessLegacyAnalysis::print(raw_ostream &OS, const Module *M) const {
2330 : LoopAccessLegacyAnalysis &LAA = *const_cast<LoopAccessLegacyAnalysis *>(this);
2331 :
2332 97 : for (Loop *TopLevelLoop : *LI)
2333 152 : for (Loop *L : depth_first(TopLevelLoop)) {
2334 54 : OS.indent(2) << L->getHeader()->getName() << ":\n";
2335 54 : auto &LAI = LAA.getInfo(L);
2336 54 : LAI.print(OS, 4);
2337 : }
2338 48 : }
2339 :
2340 41254 : bool LoopAccessLegacyAnalysis::runOnFunction(Function &F) {
2341 41254 : SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2342 41254 : auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2343 41254 : TLI = TLIP ? &TLIP->getTLI() : nullptr;
2344 41254 : AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2345 41254 : DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2346 41254 : LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2347 :
2348 41254 : return false;
2349 : }
2350 :
2351 4548 : void LoopAccessLegacyAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
2352 : AU.addRequired<ScalarEvolutionWrapperPass>();
2353 : AU.addRequired<AAResultsWrapperPass>();
2354 : AU.addRequired<DominatorTreeWrapperPass>();
2355 : AU.addRequired<LoopInfoWrapperPass>();
2356 :
2357 : AU.setPreservesAll();
2358 4548 : }
2359 :
2360 : char LoopAccessLegacyAnalysis::ID = 0;
2361 : static const char laa_name[] = "Loop Access Analysis";
2362 : #define LAA_NAME "loop-accesses"
2363 :
2364 33326 : INITIALIZE_PASS_BEGIN(LoopAccessLegacyAnalysis, LAA_NAME, laa_name, false, true)
2365 33326 : INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
2366 33326 : INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
2367 33326 : INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
2368 33326 : INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
2369 233370 : INITIALIZE_PASS_END(LoopAccessLegacyAnalysis, LAA_NAME, laa_name, false, true)
2370 :
2371 : AnalysisKey LoopAccessAnalysis::Key;
2372 :
2373 108 : LoopAccessInfo LoopAccessAnalysis::run(Loop &L, LoopAnalysisManager &AM,
2374 : LoopStandardAnalysisResults &AR) {
2375 108 : return LoopAccessInfo(&L, &AR.SE, &AR.TLI, &AR.AA, &AR.DT, &AR.LI);
2376 : }
2377 :
2378 : namespace llvm {
2379 :
2380 0 : Pass *createLAAPass() {
2381 0 : return new LoopAccessLegacyAnalysis();
2382 : }
2383 :
2384 : } // end namespace llvm
|