File: | lib/Transforms/Vectorize/LoadStoreVectorizer.cpp |
Warning: | line 1159, column 7 Value stored to 'Alignment' is never read |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | //===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This pass merges loads/stores to/from sequential memory addresses into vector |
10 | // loads/stores. Although there's nothing GPU-specific in here, this pass is |
11 | // motivated by the microarchitectural quirks of nVidia and AMD GPUs. |
12 | // |
13 | // (For simplicity below we talk about loads only, but everything also applies |
14 | // to stores.) |
15 | // |
16 | // This pass is intended to be run late in the pipeline, after other |
17 | // vectorization opportunities have been exploited. So the assumption here is |
18 | // that immediately following our new vector load we'll need to extract out the |
19 | // individual elements of the load, so we can operate on them individually. |
20 | // |
21 | // On CPUs this transformation is usually not beneficial, because extracting the |
22 | // elements of a vector register is expensive on most architectures. It's |
23 | // usually better just to load each element individually into its own scalar |
24 | // register. |
25 | // |
26 | // However, nVidia and AMD GPUs don't have proper vector registers. Instead, a |
27 | // "vector load" loads directly into a series of scalar registers. In effect, |
28 | // extracting the elements of the vector is free. It's therefore always |
29 | // beneficial to vectorize a sequence of loads on these architectures. |
30 | // |
31 | // Vectorizing (perhaps a better name might be "coalescing") loads can have |
32 | // large performance impacts on GPU kernels, and opportunities for vectorizing |
33 | // are common in GPU code. This pass tries very hard to find such |
34 | // opportunities; its runtime is quadratic in the number of loads in a BB. |
35 | // |
36 | // Some CPU architectures, such as ARM, have instructions that load into |
37 | // multiple scalar registers, similar to a GPU vectorized load. In theory ARM |
38 | // could use this pass (with some modifications), but currently it implements |
39 | // its own pass to do something similar to what we do here. |
40 | |
41 | #include "llvm/ADT/APInt.h" |
42 | #include "llvm/ADT/ArrayRef.h" |
43 | #include "llvm/ADT/MapVector.h" |
44 | #include "llvm/ADT/PostOrderIterator.h" |
45 | #include "llvm/ADT/STLExtras.h" |
46 | #include "llvm/ADT/SmallPtrSet.h" |
47 | #include "llvm/ADT/SmallVector.h" |
48 | #include "llvm/ADT/Statistic.h" |
49 | #include "llvm/ADT/iterator_range.h" |
50 | #include "llvm/Analysis/AliasAnalysis.h" |
51 | #include "llvm/Analysis/MemoryLocation.h" |
52 | #include "llvm/Analysis/OrderedBasicBlock.h" |
53 | #include "llvm/Analysis/ScalarEvolution.h" |
54 | #include "llvm/Analysis/TargetTransformInfo.h" |
55 | #include "llvm/Transforms/Utils/Local.h" |
56 | #include "llvm/Analysis/ValueTracking.h" |
57 | #include "llvm/Analysis/VectorUtils.h" |
58 | #include "llvm/IR/Attributes.h" |
59 | #include "llvm/IR/BasicBlock.h" |
60 | #include "llvm/IR/Constants.h" |
61 | #include "llvm/IR/DataLayout.h" |
62 | #include "llvm/IR/DerivedTypes.h" |
63 | #include "llvm/IR/Dominators.h" |
64 | #include "llvm/IR/Function.h" |
65 | #include "llvm/IR/IRBuilder.h" |
66 | #include "llvm/IR/InstrTypes.h" |
67 | #include "llvm/IR/Instruction.h" |
68 | #include "llvm/IR/Instructions.h" |
69 | #include "llvm/IR/IntrinsicInst.h" |
70 | #include "llvm/IR/Module.h" |
71 | #include "llvm/IR/Type.h" |
72 | #include "llvm/IR/User.h" |
73 | #include "llvm/IR/Value.h" |
74 | #include "llvm/Pass.h" |
75 | #include "llvm/Support/Casting.h" |
76 | #include "llvm/Support/Debug.h" |
77 | #include "llvm/Support/KnownBits.h" |
78 | #include "llvm/Support/MathExtras.h" |
79 | #include "llvm/Support/raw_ostream.h" |
80 | #include "llvm/Transforms/Vectorize.h" |
81 | #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" |
82 | #include <algorithm> |
83 | #include <cassert> |
84 | #include <cstdlib> |
85 | #include <tuple> |
86 | #include <utility> |
87 | |
88 | using namespace llvm; |
89 | |
90 | #define DEBUG_TYPE"load-store-vectorizer" "load-store-vectorizer" |
91 | |
92 | STATISTIC(NumVectorInstructions, "Number of vector accesses generated")static llvm::Statistic NumVectorInstructions = {"load-store-vectorizer" , "NumVectorInstructions", "Number of vector accesses generated" , {0}, {false}}; |
93 | STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized")static llvm::Statistic NumScalarsVectorized = {"load-store-vectorizer" , "NumScalarsVectorized", "Number of scalar accesses vectorized" , {0}, {false}}; |
94 | |
95 | // FIXME: Assuming stack alignment of 4 is always good enough |
96 | static const unsigned StackAdjustedAlignment = 4; |
97 | |
98 | namespace { |
99 | |
100 | /// ChainID is an arbitrary token that is allowed to be different only for the |
101 | /// accesses that are guaranteed to be considered non-consecutive by |
102 | /// Vectorizer::isConsecutiveAccess. It's used for grouping instructions |
103 | /// together and reducing the number of instructions the main search operates on |
104 | /// at a time, i.e. this is to reduce compile time and nothing else as the main |
105 | /// search has O(n^2) time complexity. The underlying type of ChainID should not |
106 | /// be relied upon. |
107 | using ChainID = const Value *; |
108 | using InstrList = SmallVector<Instruction *, 8>; |
109 | using InstrListMap = MapVector<ChainID, InstrList>; |
110 | |
111 | class Vectorizer { |
112 | Function &F; |
113 | AliasAnalysis &AA; |
114 | DominatorTree &DT; |
115 | ScalarEvolution &SE; |
116 | TargetTransformInfo &TTI; |
117 | const DataLayout &DL; |
118 | IRBuilder<> Builder; |
119 | |
120 | public: |
121 | Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT, |
122 | ScalarEvolution &SE, TargetTransformInfo &TTI) |
123 | : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI), |
124 | DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {} |
125 | |
126 | bool run(); |
127 | |
128 | private: |
129 | unsigned getPointerAddressSpace(Value *I); |
130 | |
131 | unsigned getAlignment(LoadInst *LI) const { |
132 | unsigned Align = LI->getAlignment(); |
133 | if (Align != 0) |
134 | return Align; |
135 | |
136 | return DL.getABITypeAlignment(LI->getType()); |
137 | } |
138 | |
139 | unsigned getAlignment(StoreInst *SI) const { |
140 | unsigned Align = SI->getAlignment(); |
141 | if (Align != 0) |
142 | return Align; |
143 | |
144 | return DL.getABITypeAlignment(SI->getValueOperand()->getType()); |
145 | } |
146 | |
147 | static const unsigned MaxDepth = 3; |
148 | |
149 | bool isConsecutiveAccess(Value *A, Value *B); |
150 | bool areConsecutivePointers(Value *PtrA, Value *PtrB, const APInt &PtrDelta, |
151 | unsigned Depth = 0) const; |
152 | bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta, |
153 | unsigned Depth) const; |
154 | bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta, |
155 | unsigned Depth) const; |
156 | |
157 | /// After vectorization, reorder the instructions that I depends on |
158 | /// (the instructions defining its operands), to ensure they dominate I. |
159 | void reorder(Instruction *I); |
160 | |
161 | /// Returns the first and the last instructions in Chain. |
162 | std::pair<BasicBlock::iterator, BasicBlock::iterator> |
163 | getBoundaryInstrs(ArrayRef<Instruction *> Chain); |
164 | |
165 | /// Erases the original instructions after vectorizing. |
166 | void eraseInstructions(ArrayRef<Instruction *> Chain); |
167 | |
168 | /// "Legalize" the vector type that would be produced by combining \p |
169 | /// ElementSizeBits elements in \p Chain. Break into two pieces such that the |
170 | /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is |
171 | /// expected to have more than 4 elements. |
172 | std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>> |
173 | splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits); |
174 | |
175 | /// Finds the largest prefix of Chain that's vectorizable, checking for |
176 | /// intervening instructions which may affect the memory accessed by the |
177 | /// instructions within Chain. |
178 | /// |
179 | /// The elements of \p Chain must be all loads or all stores and must be in |
180 | /// address order. |
181 | ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain); |
182 | |
183 | /// Collects load and store instructions to vectorize. |
184 | std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB); |
185 | |
186 | /// Processes the collected instructions, the \p Map. The values of \p Map |
187 | /// should be all loads or all stores. |
188 | bool vectorizeChains(InstrListMap &Map); |
189 | |
190 | /// Finds the load/stores to consecutive memory addresses and vectorizes them. |
191 | bool vectorizeInstructions(ArrayRef<Instruction *> Instrs); |
192 | |
193 | /// Vectorizes the load instructions in Chain. |
194 | bool |
195 | vectorizeLoadChain(ArrayRef<Instruction *> Chain, |
196 | SmallPtrSet<Instruction *, 16> *InstructionsProcessed); |
197 | |
198 | /// Vectorizes the store instructions in Chain. |
199 | bool |
200 | vectorizeStoreChain(ArrayRef<Instruction *> Chain, |
201 | SmallPtrSet<Instruction *, 16> *InstructionsProcessed); |
202 | |
203 | /// Check if this load/store access is misaligned accesses. |
204 | bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace, |
205 | unsigned Alignment); |
206 | }; |
207 | |
208 | class LoadStoreVectorizerLegacyPass : public FunctionPass { |
209 | public: |
210 | static char ID; |
211 | |
212 | LoadStoreVectorizerLegacyPass() : FunctionPass(ID) { |
213 | initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry()); |
214 | } |
215 | |
216 | bool runOnFunction(Function &F) override; |
217 | |
218 | StringRef getPassName() const override { |
219 | return "GPU Load and Store Vectorizer"; |
220 | } |
221 | |
222 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
223 | AU.addRequired<AAResultsWrapperPass>(); |
224 | AU.addRequired<ScalarEvolutionWrapperPass>(); |
225 | AU.addRequired<DominatorTreeWrapperPass>(); |
226 | AU.addRequired<TargetTransformInfoWrapperPass>(); |
227 | AU.setPreservesCFG(); |
228 | } |
229 | }; |
230 | |
231 | } // end anonymous namespace |
232 | |
233 | char LoadStoreVectorizerLegacyPass::ID = 0; |
234 | |
235 | INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,static void *initializeLoadStoreVectorizerLegacyPassPassOnce( PassRegistry &Registry) { |
236 | "Vectorize load and Store instructions", false, false)static void *initializeLoadStoreVectorizerLegacyPassPassOnce( PassRegistry &Registry) { |
237 | INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)initializeSCEVAAWrapperPassPass(Registry); |
238 | INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)initializeDominatorTreeWrapperPassPass(Registry); |
239 | INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)initializeAAResultsWrapperPassPass(Registry); |
240 | INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)initializeGlobalsAAWrapperPassPass(Registry); |
241 | INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)initializeTargetTransformInfoWrapperPassPass(Registry); |
242 | INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,PassInfo *PI = new PassInfo( "Vectorize load and store instructions" , "load-store-vectorizer", &LoadStoreVectorizerLegacyPass ::ID, PassInfo::NormalCtor_t(callDefaultCtor<LoadStoreVectorizerLegacyPass >), false, false); Registry.registerPass(*PI, true); return PI; } static llvm::once_flag InitializeLoadStoreVectorizerLegacyPassPassFlag ; void llvm::initializeLoadStoreVectorizerLegacyPassPass(PassRegistry &Registry) { llvm::call_once(InitializeLoadStoreVectorizerLegacyPassPassFlag , initializeLoadStoreVectorizerLegacyPassPassOnce, std::ref(Registry )); } |
243 | "Vectorize load and store instructions", false, false)PassInfo *PI = new PassInfo( "Vectorize load and store instructions" , "load-store-vectorizer", &LoadStoreVectorizerLegacyPass ::ID, PassInfo::NormalCtor_t(callDefaultCtor<LoadStoreVectorizerLegacyPass >), false, false); Registry.registerPass(*PI, true); return PI; } static llvm::once_flag InitializeLoadStoreVectorizerLegacyPassPassFlag ; void llvm::initializeLoadStoreVectorizerLegacyPassPass(PassRegistry &Registry) { llvm::call_once(InitializeLoadStoreVectorizerLegacyPassPassFlag , initializeLoadStoreVectorizerLegacyPassPassOnce, std::ref(Registry )); } |
244 | |
245 | Pass *llvm::createLoadStoreVectorizerPass() { |
246 | return new LoadStoreVectorizerLegacyPass(); |
247 | } |
248 | |
249 | bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) { |
250 | // Don't vectorize when the attribute NoImplicitFloat is used. |
251 | if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat)) |
252 | return false; |
253 | |
254 | AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); |
255 | DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
256 | ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); |
257 | TargetTransformInfo &TTI = |
258 | getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); |
259 | |
260 | Vectorizer V(F, AA, DT, SE, TTI); |
261 | return V.run(); |
262 | } |
263 | |
264 | PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { |
265 | // Don't vectorize when the attribute NoImplicitFloat is used. |
266 | if (F.hasFnAttribute(Attribute::NoImplicitFloat)) |
267 | return PreservedAnalyses::all(); |
268 | |
269 | AliasAnalysis &AA = AM.getResult<AAManager>(F); |
270 | DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); |
271 | ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F); |
272 | TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); |
273 | |
274 | Vectorizer V(F, AA, DT, SE, TTI); |
275 | bool Changed = V.run(); |
276 | PreservedAnalyses PA; |
277 | PA.preserveSet<CFGAnalyses>(); |
278 | return Changed ? PA : PreservedAnalyses::all(); |
279 | } |
280 | |
281 | // The real propagateMetadata expects a SmallVector<Value*>, but we deal in |
282 | // vectors of Instructions. |
283 | static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) { |
284 | SmallVector<Value *, 8> VL(IL.begin(), IL.end()); |
285 | propagateMetadata(I, VL); |
286 | } |
287 | |
288 | // Vectorizer Implementation |
289 | bool Vectorizer::run() { |
290 | bool Changed = false; |
291 | |
292 | // Scan the blocks in the function in post order. |
293 | for (BasicBlock *BB : post_order(&F)) { |
294 | InstrListMap LoadRefs, StoreRefs; |
295 | std::tie(LoadRefs, StoreRefs) = collectInstructions(BB); |
296 | Changed |= vectorizeChains(LoadRefs); |
297 | Changed |= vectorizeChains(StoreRefs); |
298 | } |
299 | |
300 | return Changed; |
301 | } |
302 | |
303 | unsigned Vectorizer::getPointerAddressSpace(Value *I) { |
304 | if (LoadInst *L = dyn_cast<LoadInst>(I)) |
305 | return L->getPointerAddressSpace(); |
306 | if (StoreInst *S = dyn_cast<StoreInst>(I)) |
307 | return S->getPointerAddressSpace(); |
308 | return -1; |
309 | } |
310 | |
311 | // FIXME: Merge with llvm::isConsecutiveAccess |
312 | bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { |
313 | Value *PtrA = getLoadStorePointerOperand(A); |
314 | Value *PtrB = getLoadStorePointerOperand(B); |
315 | unsigned ASA = getPointerAddressSpace(A); |
316 | unsigned ASB = getPointerAddressSpace(B); |
317 | |
318 | // Check that the address spaces match and that the pointers are valid. |
319 | if (!PtrA || !PtrB || (ASA != ASB)) |
320 | return false; |
321 | |
322 | // Make sure that A and B are different pointers of the same size type. |
323 | Type *PtrATy = PtrA->getType()->getPointerElementType(); |
324 | Type *PtrBTy = PtrB->getType()->getPointerElementType(); |
325 | if (PtrA == PtrB || |
326 | PtrATy->isVectorTy() != PtrBTy->isVectorTy() || |
327 | DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) || |
328 | DL.getTypeStoreSize(PtrATy->getScalarType()) != |
329 | DL.getTypeStoreSize(PtrBTy->getScalarType())) |
330 | return false; |
331 | |
332 | unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); |
333 | APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy)); |
334 | |
335 | return areConsecutivePointers(PtrA, PtrB, Size); |
336 | } |
337 | |
338 | bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, |
339 | const APInt &PtrDelta, |
340 | unsigned Depth) const { |
341 | unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType()); |
342 | APInt OffsetA(PtrBitWidth, 0); |
343 | APInt OffsetB(PtrBitWidth, 0); |
344 | PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); |
345 | PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); |
346 | |
347 | APInt OffsetDelta = OffsetB - OffsetA; |
348 | |
349 | // Check if they are based on the same pointer. That makes the offsets |
350 | // sufficient. |
351 | if (PtrA == PtrB) |
352 | return OffsetDelta == PtrDelta; |
353 | |
354 | // Compute the necessary base pointer delta to have the necessary final delta |
355 | // equal to the pointer delta requested. |
356 | APInt BaseDelta = PtrDelta - OffsetDelta; |
357 | |
358 | // Compute the distance with SCEV between the base pointers. |
359 | const SCEV *PtrSCEVA = SE.getSCEV(PtrA); |
360 | const SCEV *PtrSCEVB = SE.getSCEV(PtrB); |
361 | const SCEV *C = SE.getConstant(BaseDelta); |
362 | const SCEV *X = SE.getAddExpr(PtrSCEVA, C); |
363 | if (X == PtrSCEVB) |
364 | return true; |
365 | |
366 | // The above check will not catch the cases where one of the pointers is |
367 | // factorized but the other one is not, such as (C + (S * (A + B))) vs |
368 | // (AS + BS). Get the minus scev. That will allow re-combining the expresions |
369 | // and getting the simplified difference. |
370 | const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA); |
371 | if (C == Dist) |
372 | return true; |
373 | |
374 | // Sometimes even this doesn't work, because SCEV can't always see through |
375 | // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking |
376 | // things the hard way. |
377 | return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth); |
378 | } |
379 | |
380 | bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, |
381 | APInt PtrDelta, |
382 | unsigned Depth) const { |
383 | auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA); |
384 | auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB); |
385 | if (!GEPA || !GEPB) |
386 | return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth); |
387 | |
388 | // Look through GEPs after checking they're the same except for the last |
389 | // index. |
390 | if (GEPA->getNumOperands() != GEPB->getNumOperands() || |
391 | GEPA->getPointerOperand() != GEPB->getPointerOperand()) |
392 | return false; |
393 | gep_type_iterator GTIA = gep_type_begin(GEPA); |
394 | gep_type_iterator GTIB = gep_type_begin(GEPB); |
395 | for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) { |
396 | if (GTIA.getOperand() != GTIB.getOperand()) |
397 | return false; |
398 | ++GTIA; |
399 | ++GTIB; |
400 | } |
401 | |
402 | Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand()); |
403 | Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand()); |
404 | if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() || |
405 | OpA->getType() != OpB->getType()) |
406 | return false; |
407 | |
408 | if (PtrDelta.isNegative()) { |
409 | if (PtrDelta.isMinSignedValue()) |
410 | return false; |
411 | PtrDelta.negate(); |
412 | std::swap(OpA, OpB); |
413 | } |
414 | uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType()); |
415 | if (PtrDelta.urem(Stride) != 0) |
416 | return false; |
417 | unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits(); |
418 | APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth); |
419 | |
420 | // Only look through a ZExt/SExt. |
421 | if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA)) |
422 | return false; |
423 | |
424 | bool Signed = isa<SExtInst>(OpA); |
425 | |
426 | // At this point A could be a function parameter, i.e. not an instruction |
427 | Value *ValA = OpA->getOperand(0); |
428 | OpB = dyn_cast<Instruction>(OpB->getOperand(0)); |
429 | if (!OpB || ValA->getType() != OpB->getType()) |
430 | return false; |
431 | |
432 | // Now we need to prove that adding IdxDiff to ValA won't overflow. |
433 | bool Safe = false; |
434 | // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to |
435 | // ValA, we're okay. |
436 | if (OpB->getOpcode() == Instruction::Add && |
437 | isa<ConstantInt>(OpB->getOperand(1)) && |
438 | IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) { |
439 | if (Signed) |
440 | Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap(); |
441 | else |
442 | Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap(); |
443 | } |
444 | |
445 | unsigned BitWidth = ValA->getType()->getScalarSizeInBits(); |
446 | |
447 | // Second attempt: |
448 | // If all set bits of IdxDiff or any higher order bit other than the sign bit |
449 | // are known to be zero in ValA, we can add Diff to it while guaranteeing no |
450 | // overflow of any sort. |
451 | if (!Safe) { |
452 | OpA = dyn_cast<Instruction>(ValA); |
453 | if (!OpA) |
454 | return false; |
455 | KnownBits Known(BitWidth); |
456 | computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT); |
457 | APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth()); |
458 | if (Signed) |
459 | BitsAllowedToBeSet.clearBit(BitWidth - 1); |
460 | if (BitsAllowedToBeSet.ult(IdxDiff)) |
461 | return false; |
462 | } |
463 | |
464 | const SCEV *OffsetSCEVA = SE.getSCEV(ValA); |
465 | const SCEV *OffsetSCEVB = SE.getSCEV(OpB); |
466 | const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth)); |
467 | const SCEV *X = SE.getAddExpr(OffsetSCEVA, C); |
468 | return X == OffsetSCEVB; |
469 | } |
470 | |
471 | bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB, |
472 | const APInt &PtrDelta, |
473 | unsigned Depth) const { |
474 | if (Depth++ == MaxDepth) |
475 | return false; |
476 | |
477 | if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) { |
478 | if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) { |
479 | return SelectA->getCondition() == SelectB->getCondition() && |
480 | areConsecutivePointers(SelectA->getTrueValue(), |
481 | SelectB->getTrueValue(), PtrDelta, Depth) && |
482 | areConsecutivePointers(SelectA->getFalseValue(), |
483 | SelectB->getFalseValue(), PtrDelta, Depth); |
484 | } |
485 | } |
486 | return false; |
487 | } |
488 | |
489 | void Vectorizer::reorder(Instruction *I) { |
490 | OrderedBasicBlock OBB(I->getParent()); |
491 | SmallPtrSet<Instruction *, 16> InstructionsToMove; |
492 | SmallVector<Instruction *, 16> Worklist; |
493 | |
494 | Worklist.push_back(I); |
495 | while (!Worklist.empty()) { |
496 | Instruction *IW = Worklist.pop_back_val(); |
497 | int NumOperands = IW->getNumOperands(); |
498 | for (int i = 0; i < NumOperands; i++) { |
499 | Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i)); |
500 | if (!IM || IM->getOpcode() == Instruction::PHI) |
501 | continue; |
502 | |
503 | // If IM is in another BB, no need to move it, because this pass only |
504 | // vectorizes instructions within one BB. |
505 | if (IM->getParent() != I->getParent()) |
506 | continue; |
507 | |
508 | if (!OBB.dominates(IM, I)) { |
509 | InstructionsToMove.insert(IM); |
510 | Worklist.push_back(IM); |
511 | } |
512 | } |
513 | } |
514 | |
515 | // All instructions to move should follow I. Start from I, not from begin(). |
516 | for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E; |
517 | ++BBI) { |
518 | if (!InstructionsToMove.count(&*BBI)) |
519 | continue; |
520 | Instruction *IM = &*BBI; |
521 | --BBI; |
522 | IM->removeFromParent(); |
523 | IM->insertBefore(I); |
524 | } |
525 | } |
526 | |
527 | std::pair<BasicBlock::iterator, BasicBlock::iterator> |
528 | Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) { |
529 | Instruction *C0 = Chain[0]; |
530 | BasicBlock::iterator FirstInstr = C0->getIterator(); |
531 | BasicBlock::iterator LastInstr = C0->getIterator(); |
532 | |
533 | BasicBlock *BB = C0->getParent(); |
534 | unsigned NumFound = 0; |
535 | for (Instruction &I : *BB) { |
536 | if (!is_contained(Chain, &I)) |
537 | continue; |
538 | |
539 | ++NumFound; |
540 | if (NumFound == 1) { |
541 | FirstInstr = I.getIterator(); |
542 | } |
543 | if (NumFound == Chain.size()) { |
544 | LastInstr = I.getIterator(); |
545 | break; |
546 | } |
547 | } |
548 | |
549 | // Range is [first, last). |
550 | return std::make_pair(FirstInstr, ++LastInstr); |
551 | } |
552 | |
553 | void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) { |
554 | SmallVector<Instruction *, 16> Instrs; |
555 | for (Instruction *I : Chain) { |
556 | Value *PtrOperand = getLoadStorePointerOperand(I); |
557 | assert(PtrOperand && "Instruction must have a pointer operand.")((PtrOperand && "Instruction must have a pointer operand." ) ? static_cast<void> (0) : __assert_fail ("PtrOperand && \"Instruction must have a pointer operand.\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 557, __PRETTY_FUNCTION__)); |
558 | Instrs.push_back(I); |
559 | if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand)) |
560 | Instrs.push_back(GEP); |
561 | } |
562 | |
563 | // Erase instructions. |
564 | for (Instruction *I : Instrs) |
565 | if (I->use_empty()) |
566 | I->eraseFromParent(); |
567 | } |
568 | |
569 | std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>> |
570 | Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain, |
571 | unsigned ElementSizeBits) { |
572 | unsigned ElementSizeBytes = ElementSizeBits / 8; |
573 | unsigned SizeBytes = ElementSizeBytes * Chain.size(); |
574 | unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes; |
575 | if (NumLeft == Chain.size()) { |
576 | if ((NumLeft & 1) == 0) |
577 | NumLeft /= 2; // Split even in half |
578 | else |
579 | --NumLeft; // Split off last element |
580 | } else if (NumLeft == 0) |
581 | NumLeft = 1; |
582 | return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft)); |
583 | } |
584 | |
585 | ArrayRef<Instruction *> |
586 | Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { |
587 | // These are in BB order, unlike Chain, which is in address order. |
588 | SmallVector<Instruction *, 16> MemoryInstrs; |
589 | SmallVector<Instruction *, 16> ChainInstrs; |
590 | |
591 | bool IsLoadChain = isa<LoadInst>(Chain[0]); |
592 | LLVM_DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { for (Instruction *I : Chain) { if (IsLoadChain) ((isa<LoadInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<LoadInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 596, __PRETTY_FUNCTION__)); else ((isa<StoreInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<StoreInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 599, __PRETTY_FUNCTION__)); } }; } } while (false) |
593 | for (Instruction *I : Chain) {do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { for (Instruction *I : Chain) { if (IsLoadChain) ((isa<LoadInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<LoadInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 596, __PRETTY_FUNCTION__)); else ((isa<StoreInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<StoreInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 599, __PRETTY_FUNCTION__)); } }; } } while (false) |
594 | if (IsLoadChain)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { for (Instruction *I : Chain) { if (IsLoadChain) ((isa<LoadInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<LoadInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 596, __PRETTY_FUNCTION__)); else ((isa<StoreInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<StoreInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 599, __PRETTY_FUNCTION__)); } }; } } while (false) |
595 | assert(isa<LoadInst>(I) &&do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { for (Instruction *I : Chain) { if (IsLoadChain) ((isa<LoadInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<LoadInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 596, __PRETTY_FUNCTION__)); else ((isa<StoreInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<StoreInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 599, __PRETTY_FUNCTION__)); } }; } } while (false) |
596 | "All elements of Chain must be loads, or all must be stores.");do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { for (Instruction *I : Chain) { if (IsLoadChain) ((isa<LoadInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<LoadInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 596, __PRETTY_FUNCTION__)); else ((isa<StoreInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<StoreInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 599, __PRETTY_FUNCTION__)); } }; } } while (false) |
597 | elsedo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { for (Instruction *I : Chain) { if (IsLoadChain) ((isa<LoadInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<LoadInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 596, __PRETTY_FUNCTION__)); else ((isa<StoreInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<StoreInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 599, __PRETTY_FUNCTION__)); } }; } } while (false) |
598 | assert(isa<StoreInst>(I) &&do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { for (Instruction *I : Chain) { if (IsLoadChain) ((isa<LoadInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<LoadInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 596, __PRETTY_FUNCTION__)); else ((isa<StoreInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<StoreInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 599, __PRETTY_FUNCTION__)); } }; } } while (false) |
599 | "All elements of Chain must be loads, or all must be stores.");do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { for (Instruction *I : Chain) { if (IsLoadChain) ((isa<LoadInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<LoadInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 596, __PRETTY_FUNCTION__)); else ((isa<StoreInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<StoreInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 599, __PRETTY_FUNCTION__)); } }; } } while (false) |
600 | }do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { for (Instruction *I : Chain) { if (IsLoadChain) ((isa<LoadInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<LoadInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 596, __PRETTY_FUNCTION__)); else ((isa<StoreInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<StoreInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 599, __PRETTY_FUNCTION__)); } }; } } while (false) |
601 | })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { for (Instruction *I : Chain) { if (IsLoadChain) ((isa<LoadInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<LoadInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 596, __PRETTY_FUNCTION__)); else ((isa<StoreInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<StoreInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 599, __PRETTY_FUNCTION__)); } }; } } while (false); |
602 | |
603 | for (Instruction &I : make_range(getBoundaryInstrs(Chain))) { |
604 | if (isa<LoadInst>(I) || isa<StoreInst>(I)) { |
605 | if (!is_contained(Chain, &I)) |
606 | MemoryInstrs.push_back(&I); |
607 | else |
608 | ChainInstrs.push_back(&I); |
609 | } else if (isa<IntrinsicInst>(&I) && |
610 | cast<IntrinsicInst>(&I)->getIntrinsicID() == |
611 | Intrinsic::sideeffect) { |
612 | // Ignore llvm.sideeffect calls. |
613 | } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) { |
614 | LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << Ido { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Found may-write/throw operation: " << I << '\n'; } } while (false) |
615 | << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Found may-write/throw operation: " << I << '\n'; } } while (false); |
616 | break; |
617 | } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) { |
618 | LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << Ido { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Found may-read/write/throw operation: " << I << '\n'; } } while (false) |
619 | << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Found may-read/write/throw operation: " << I << '\n'; } } while (false); |
620 | break; |
621 | } |
622 | } |
623 | |
624 | OrderedBasicBlock OBB(Chain[0]->getParent()); |
625 | |
626 | // Loop until we find an instruction in ChainInstrs that we can't vectorize. |
627 | unsigned ChainInstrIdx = 0; |
628 | Instruction *BarrierMemoryInstr = nullptr; |
629 | |
630 | for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) { |
631 | Instruction *ChainInstr = ChainInstrs[ChainInstrIdx]; |
632 | |
633 | // If a barrier memory instruction was found, chain instructions that follow |
634 | // will not be added to the valid prefix. |
635 | if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr)) |
636 | break; |
637 | |
638 | // Check (in BB order) if any instruction prevents ChainInstr from being |
639 | // vectorized. Find and store the first such "conflicting" instruction. |
640 | for (Instruction *MemInstr : MemoryInstrs) { |
641 | // If a barrier memory instruction was found, do not check past it. |
642 | if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr)) |
643 | break; |
644 | |
645 | auto *MemLoad = dyn_cast<LoadInst>(MemInstr); |
646 | auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr); |
647 | if (MemLoad && ChainLoad) |
648 | continue; |
649 | |
650 | // We can ignore the alias if the we have a load store pair and the load |
651 | // is known to be invariant. The load cannot be clobbered by the store. |
652 | auto IsInvariantLoad = [](const LoadInst *LI) -> bool { |
653 | return LI->getMetadata(LLVMContext::MD_invariant_load); |
654 | }; |
655 | |
656 | // We can ignore the alias as long as the load comes before the store, |
657 | // because that means we won't be moving the load past the store to |
658 | // vectorize it (the vectorized load is inserted at the location of the |
659 | // first load in the chain). |
660 | if (isa<StoreInst>(MemInstr) && ChainLoad && |
661 | (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr))) |
662 | continue; |
663 | |
664 | // Same case, but in reverse. |
665 | if (MemLoad && isa<StoreInst>(ChainInstr) && |
666 | (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr))) |
667 | continue; |
668 | |
669 | if (!AA.isNoAlias(MemoryLocation::get(MemInstr), |
670 | MemoryLocation::get(ChainInstr))) { |
671 | LLVM_DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Found alias:\n" " Aliasing instruction and pointer:\n" << " " << *MemInstr << '\n' << " " << *getLoadStorePointerOperand (MemInstr) << '\n' << " Aliased instruction and pointer:\n" << " " << *ChainInstr << '\n' << " " << *getLoadStorePointerOperand(ChainInstr) << '\n' ; }; } } while (false) |
672 | dbgs() << "LSV: Found alias:\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Found alias:\n" " Aliasing instruction and pointer:\n" << " " << *MemInstr << '\n' << " " << *getLoadStorePointerOperand (MemInstr) << '\n' << " Aliased instruction and pointer:\n" << " " << *ChainInstr << '\n' << " " << *getLoadStorePointerOperand(ChainInstr) << '\n' ; }; } } while (false) |
673 | " Aliasing instruction and pointer:\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Found alias:\n" " Aliasing instruction and pointer:\n" << " " << *MemInstr << '\n' << " " << *getLoadStorePointerOperand (MemInstr) << '\n' << " Aliased instruction and pointer:\n" << " " << *ChainInstr << '\n' << " " << *getLoadStorePointerOperand(ChainInstr) << '\n' ; }; } } while (false) |
674 | << " " << *MemInstr << '\n'do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Found alias:\n" " Aliasing instruction and pointer:\n" << " " << *MemInstr << '\n' << " " << *getLoadStorePointerOperand (MemInstr) << '\n' << " Aliased instruction and pointer:\n" << " " << *ChainInstr << '\n' << " " << *getLoadStorePointerOperand(ChainInstr) << '\n' ; }; } } while (false) |
675 | << " " << *getLoadStorePointerOperand(MemInstr) << '\n'do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Found alias:\n" " Aliasing instruction and pointer:\n" << " " << *MemInstr << '\n' << " " << *getLoadStorePointerOperand (MemInstr) << '\n' << " Aliased instruction and pointer:\n" << " " << *ChainInstr << '\n' << " " << *getLoadStorePointerOperand(ChainInstr) << '\n' ; }; } } while (false) |
676 | << " Aliased instruction and pointer:\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Found alias:\n" " Aliasing instruction and pointer:\n" << " " << *MemInstr << '\n' << " " << *getLoadStorePointerOperand (MemInstr) << '\n' << " Aliased instruction and pointer:\n" << " " << *ChainInstr << '\n' << " " << *getLoadStorePointerOperand(ChainInstr) << '\n' ; }; } } while (false) |
677 | << " " << *ChainInstr << '\n'do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Found alias:\n" " Aliasing instruction and pointer:\n" << " " << *MemInstr << '\n' << " " << *getLoadStorePointerOperand (MemInstr) << '\n' << " Aliased instruction and pointer:\n" << " " << *ChainInstr << '\n' << " " << *getLoadStorePointerOperand(ChainInstr) << '\n' ; }; } } while (false) |
678 | << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Found alias:\n" " Aliasing instruction and pointer:\n" << " " << *MemInstr << '\n' << " " << *getLoadStorePointerOperand (MemInstr) << '\n' << " Aliased instruction and pointer:\n" << " " << *ChainInstr << '\n' << " " << *getLoadStorePointerOperand(ChainInstr) << '\n' ; }; } } while (false) |
679 | })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Found alias:\n" " Aliasing instruction and pointer:\n" << " " << *MemInstr << '\n' << " " << *getLoadStorePointerOperand (MemInstr) << '\n' << " Aliased instruction and pointer:\n" << " " << *ChainInstr << '\n' << " " << *getLoadStorePointerOperand(ChainInstr) << '\n' ; }; } } while (false); |
680 | // Save this aliasing memory instruction as a barrier, but allow other |
681 | // instructions that precede the barrier to be vectorized with this one. |
682 | BarrierMemoryInstr = MemInstr; |
683 | break; |
684 | } |
685 | } |
686 | // Continue the search only for store chains, since vectorizing stores that |
687 | // precede an aliasing load is valid. Conversely, vectorizing loads is valid |
688 | // up to an aliasing store, but should not pull loads from further down in |
689 | // the basic block. |
690 | if (IsLoadChain && BarrierMemoryInstr) { |
691 | // The BarrierMemoryInstr is a store that precedes ChainInstr. |
692 | assert(OBB.dominates(BarrierMemoryInstr, ChainInstr))((OBB.dominates(BarrierMemoryInstr, ChainInstr)) ? static_cast <void> (0) : __assert_fail ("OBB.dominates(BarrierMemoryInstr, ChainInstr)" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 692, __PRETTY_FUNCTION__)); |
693 | break; |
694 | } |
695 | } |
696 | |
697 | // Find the largest prefix of Chain whose elements are all in |
698 | // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of |
699 | // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB |
700 | // order.) |
701 | SmallPtrSet<Instruction *, 8> VectorizableChainInstrs( |
702 | ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx); |
703 | unsigned ChainIdx = 0; |
704 | for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) { |
705 | if (!VectorizableChainInstrs.count(Chain[ChainIdx])) |
706 | break; |
707 | } |
708 | return Chain.slice(0, ChainIdx); |
709 | } |
710 | |
711 | static ChainID getChainID(const Value *Ptr, const DataLayout &DL) { |
712 | const Value *ObjPtr = GetUnderlyingObject(Ptr, DL); |
713 | if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) { |
714 | // The select's themselves are distinct instructions even if they share the |
715 | // same condition and evaluate to consecutive pointers for true and false |
716 | // values of the condition. Therefore using the select's themselves for |
717 | // grouping instructions would put consecutive accesses into different lists |
718 | // and they won't be even checked for being consecutive, and won't be |
719 | // vectorized. |
720 | return Sel->getCondition(); |
721 | } |
722 | return ObjPtr; |
723 | } |
724 | |
725 | std::pair<InstrListMap, InstrListMap> |
726 | Vectorizer::collectInstructions(BasicBlock *BB) { |
727 | InstrListMap LoadRefs; |
728 | InstrListMap StoreRefs; |
729 | |
730 | for (Instruction &I : *BB) { |
731 | if (!I.mayReadOrWriteMemory()) |
732 | continue; |
733 | |
734 | if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { |
735 | if (!LI->isSimple()) |
736 | continue; |
737 | |
738 | // Skip if it's not legal. |
739 | if (!TTI.isLegalToVectorizeLoad(LI)) |
740 | continue; |
741 | |
742 | Type *Ty = LI->getType(); |
743 | if (!VectorType::isValidElementType(Ty->getScalarType())) |
744 | continue; |
745 | |
746 | // Skip weird non-byte sizes. They probably aren't worth the effort of |
747 | // handling correctly. |
748 | unsigned TySize = DL.getTypeSizeInBits(Ty); |
749 | if ((TySize % 8) != 0) |
750 | continue; |
751 | |
752 | // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain |
753 | // functions are currently using an integer type for the vectorized |
754 | // load/store, and does not support casting between the integer type and a |
755 | // vector of pointers (e.g. i64 to <2 x i16*>) |
756 | if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) |
757 | continue; |
758 | |
759 | Value *Ptr = LI->getPointerOperand(); |
760 | unsigned AS = Ptr->getType()->getPointerAddressSpace(); |
761 | unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); |
762 | |
763 | unsigned VF = VecRegSize / TySize; |
764 | VectorType *VecTy = dyn_cast<VectorType>(Ty); |
765 | |
766 | // No point in looking at these if they're too big to vectorize. |
767 | if (TySize > VecRegSize / 2 || |
768 | (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) |
769 | continue; |
770 | |
771 | // Make sure all the users of a vector are constant-index extracts. |
772 | if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) { |
773 | const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U); |
774 | return EEI && isa<ConstantInt>(EEI->getOperand(1)); |
775 | })) |
776 | continue; |
777 | |
778 | // Save the load locations. |
779 | const ChainID ID = getChainID(Ptr, DL); |
780 | LoadRefs[ID].push_back(LI); |
781 | } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { |
782 | if (!SI->isSimple()) |
783 | continue; |
784 | |
785 | // Skip if it's not legal. |
786 | if (!TTI.isLegalToVectorizeStore(SI)) |
787 | continue; |
788 | |
789 | Type *Ty = SI->getValueOperand()->getType(); |
790 | if (!VectorType::isValidElementType(Ty->getScalarType())) |
791 | continue; |
792 | |
793 | // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain |
794 | // functions are currently using an integer type for the vectorized |
795 | // load/store, and does not support casting between the integer type and a |
796 | // vector of pointers (e.g. i64 to <2 x i16*>) |
797 | if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) |
798 | continue; |
799 | |
800 | // Skip weird non-byte sizes. They probably aren't worth the effort of |
801 | // handling correctly. |
802 | unsigned TySize = DL.getTypeSizeInBits(Ty); |
803 | if ((TySize % 8) != 0) |
804 | continue; |
805 | |
806 | Value *Ptr = SI->getPointerOperand(); |
807 | unsigned AS = Ptr->getType()->getPointerAddressSpace(); |
808 | unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); |
809 | |
810 | unsigned VF = VecRegSize / TySize; |
811 | VectorType *VecTy = dyn_cast<VectorType>(Ty); |
812 | |
813 | // No point in looking at these if they're too big to vectorize. |
814 | if (TySize > VecRegSize / 2 || |
815 | (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) |
816 | continue; |
817 | |
818 | if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) { |
819 | const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U); |
820 | return EEI && isa<ConstantInt>(EEI->getOperand(1)); |
821 | })) |
822 | continue; |
823 | |
824 | // Save store location. |
825 | const ChainID ID = getChainID(Ptr, DL); |
826 | StoreRefs[ID].push_back(SI); |
827 | } |
828 | } |
829 | |
830 | return {LoadRefs, StoreRefs}; |
831 | } |
832 | |
833 | bool Vectorizer::vectorizeChains(InstrListMap &Map) { |
834 | bool Changed = false; |
835 | |
836 | for (const std::pair<ChainID, InstrList> &Chain : Map) { |
837 | unsigned Size = Chain.second.size(); |
838 | if (Size < 2) |
839 | continue; |
840 | |
841 | LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n"; } } while (false); |
842 | |
843 | // Process the stores in chunks of 64. |
844 | for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) { |
845 | unsigned Len = std::min<unsigned>(CE - CI, 64); |
846 | ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len); |
847 | Changed |= vectorizeInstructions(Chunk); |
848 | } |
849 | } |
850 | |
851 | return Changed; |
852 | } |
853 | |
854 | bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) { |
855 | LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n"; } } while (false) |
856 | << " instructions.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n"; } } while (false); |
857 | SmallVector<int, 16> Heads, Tails; |
858 | int ConsecutiveChain[64]; |
859 | |
860 | // Do a quadratic search on all of the given loads/stores and find all of the |
861 | // pairs of loads/stores that follow each other. |
862 | for (int i = 0, e = Instrs.size(); i < e; ++i) { |
863 | ConsecutiveChain[i] = -1; |
864 | for (int j = e - 1; j >= 0; --j) { |
865 | if (i == j) |
866 | continue; |
867 | |
868 | if (isConsecutiveAccess(Instrs[i], Instrs[j])) { |
869 | if (ConsecutiveChain[i] != -1) { |
870 | int CurDistance = std::abs(ConsecutiveChain[i] - i); |
871 | int NewDistance = std::abs(ConsecutiveChain[i] - j); |
872 | if (j < i || NewDistance > CurDistance) |
873 | continue; // Should not insert. |
874 | } |
875 | |
876 | Tails.push_back(j); |
877 | Heads.push_back(i); |
878 | ConsecutiveChain[i] = j; |
879 | } |
880 | } |
881 | } |
882 | |
883 | bool Changed = false; |
884 | SmallPtrSet<Instruction *, 16> InstructionsProcessed; |
885 | |
886 | for (int Head : Heads) { |
887 | if (InstructionsProcessed.count(Instrs[Head])) |
888 | continue; |
889 | bool LongerChainExists = false; |
890 | for (unsigned TIt = 0; TIt < Tails.size(); TIt++) |
891 | if (Head == Tails[TIt] && |
892 | !InstructionsProcessed.count(Instrs[Heads[TIt]])) { |
893 | LongerChainExists = true; |
894 | break; |
895 | } |
896 | if (LongerChainExists) |
897 | continue; |
898 | |
899 | // We found an instr that starts a chain. Now follow the chain and try to |
900 | // vectorize it. |
901 | SmallVector<Instruction *, 16> Operands; |
902 | int I = Head; |
903 | while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) { |
904 | if (InstructionsProcessed.count(Instrs[I])) |
905 | break; |
906 | |
907 | Operands.push_back(Instrs[I]); |
908 | I = ConsecutiveChain[I]; |
909 | } |
910 | |
911 | bool Vectorized = false; |
912 | if (isa<LoadInst>(*Operands.begin())) |
913 | Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed); |
914 | else |
915 | Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed); |
916 | |
917 | Changed |= Vectorized; |
918 | } |
919 | |
920 | return Changed; |
921 | } |
922 | |
923 | bool Vectorizer::vectorizeStoreChain( |
924 | ArrayRef<Instruction *> Chain, |
925 | SmallPtrSet<Instruction *, 16> *InstructionsProcessed) { |
926 | StoreInst *S0 = cast<StoreInst>(Chain[0]); |
927 | |
928 | // If the vector has an int element, default to int for the whole store. |
929 | Type *StoreTy = nullptr; |
930 | for (Instruction *I : Chain) { |
931 | StoreTy = cast<StoreInst>(I)->getValueOperand()->getType(); |
932 | if (StoreTy->isIntOrIntVectorTy()) |
933 | break; |
934 | |
935 | if (StoreTy->isPtrOrPtrVectorTy()) { |
936 | StoreTy = Type::getIntNTy(F.getParent()->getContext(), |
937 | DL.getTypeSizeInBits(StoreTy)); |
938 | break; |
939 | } |
940 | } |
941 | assert(StoreTy && "Failed to find store type")((StoreTy && "Failed to find store type") ? static_cast <void> (0) : __assert_fail ("StoreTy && \"Failed to find store type\"" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 941, __PRETTY_FUNCTION__)); |
942 | |
943 | unsigned Sz = DL.getTypeSizeInBits(StoreTy); |
944 | unsigned AS = S0->getPointerAddressSpace(); |
945 | unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); |
946 | unsigned VF = VecRegSize / Sz; |
947 | unsigned ChainSize = Chain.size(); |
948 | unsigned Alignment = getAlignment(S0); |
949 | |
950 | if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { |
951 | InstructionsProcessed->insert(Chain.begin(), Chain.end()); |
952 | return false; |
953 | } |
954 | |
955 | ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain); |
956 | if (NewChain.empty()) { |
957 | // No vectorization possible. |
958 | InstructionsProcessed->insert(Chain.begin(), Chain.end()); |
959 | return false; |
960 | } |
961 | if (NewChain.size() == 1) { |
962 | // Failed after the first instruction. Discard it and try the smaller chain. |
963 | InstructionsProcessed->insert(NewChain.front()); |
964 | return false; |
965 | } |
966 | |
967 | // Update Chain to the valid vectorizable subchain. |
968 | Chain = NewChain; |
969 | ChainSize = Chain.size(); |
970 | |
971 | // Check if it's legal to vectorize this chain. If not, split the chain and |
972 | // try again. |
973 | unsigned EltSzInBytes = Sz / 8; |
974 | unsigned SzInBytes = EltSzInBytes * ChainSize; |
975 | |
976 | VectorType *VecTy; |
977 | VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy); |
978 | if (VecStoreTy) |
979 | VecTy = VectorType::get(StoreTy->getScalarType(), |
980 | Chain.size() * VecStoreTy->getNumElements()); |
981 | else |
982 | VecTy = VectorType::get(StoreTy, Chain.size()); |
983 | |
984 | // If it's more than the max vector size or the target has a better |
985 | // vector factor, break it into two pieces. |
986 | unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy); |
987 | if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) { |
988 | LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Chain doesn't match with the vector factor." " Creating two separate arrays.\n"; } } while (false) |
989 | " Creating two separate arrays.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Chain doesn't match with the vector factor." " Creating two separate arrays.\n"; } } while (false); |
990 | return vectorizeStoreChain(Chain.slice(0, TargetVF), |
991 | InstructionsProcessed) | |
992 | vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed); |
993 | } |
994 | |
995 | LLVM_DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Stores to vectorize:\n" ; for (Instruction *I : Chain) dbgs() << " " << * I << "\n"; }; } } while (false) |
996 | dbgs() << "LSV: Stores to vectorize:\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Stores to vectorize:\n" ; for (Instruction *I : Chain) dbgs() << " " << * I << "\n"; }; } } while (false) |
997 | for (Instruction *I : Chain)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Stores to vectorize:\n" ; for (Instruction *I : Chain) dbgs() << " " << * I << "\n"; }; } } while (false) |
998 | dbgs() << " " << *I << "\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Stores to vectorize:\n" ; for (Instruction *I : Chain) dbgs() << " " << * I << "\n"; }; } } while (false) |
999 | })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Stores to vectorize:\n" ; for (Instruction *I : Chain) dbgs() << " " << * I << "\n"; }; } } while (false); |
1000 | |
1001 | // We won't try again to vectorize the elements of the chain, regardless of |
1002 | // whether we succeed below. |
1003 | InstructionsProcessed->insert(Chain.begin(), Chain.end()); |
1004 | |
1005 | // If the store is going to be misaligned, don't vectorize it. |
1006 | if (accessIsMisaligned(SzInBytes, AS, Alignment)) { |
1007 | if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) { |
1008 | auto Chains = splitOddVectorElts(Chain, Sz); |
1009 | return vectorizeStoreChain(Chains.first, InstructionsProcessed) | |
1010 | vectorizeStoreChain(Chains.second, InstructionsProcessed); |
1011 | } |
1012 | |
1013 | unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(), |
1014 | StackAdjustedAlignment, |
1015 | DL, S0, nullptr, &DT); |
1016 | if (NewAlign != 0) |
1017 | Alignment = NewAlign; |
1018 | } |
1019 | |
1020 | if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) { |
1021 | auto Chains = splitOddVectorElts(Chain, Sz); |
1022 | return vectorizeStoreChain(Chains.first, InstructionsProcessed) | |
1023 | vectorizeStoreChain(Chains.second, InstructionsProcessed); |
1024 | } |
1025 | |
1026 | BasicBlock::iterator First, Last; |
1027 | std::tie(First, Last) = getBoundaryInstrs(Chain); |
1028 | Builder.SetInsertPoint(&*Last); |
1029 | |
1030 | Value *Vec = UndefValue::get(VecTy); |
1031 | |
1032 | if (VecStoreTy) { |
1033 | unsigned VecWidth = VecStoreTy->getNumElements(); |
1034 | for (unsigned I = 0, E = Chain.size(); I != E; ++I) { |
1035 | StoreInst *Store = cast<StoreInst>(Chain[I]); |
1036 | for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) { |
1037 | unsigned NewIdx = J + I * VecWidth; |
1038 | Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(), |
1039 | Builder.getInt32(J)); |
1040 | if (Extract->getType() != StoreTy->getScalarType()) |
1041 | Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType()); |
1042 | |
1043 | Value *Insert = |
1044 | Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx)); |
1045 | Vec = Insert; |
1046 | } |
1047 | } |
1048 | } else { |
1049 | for (unsigned I = 0, E = Chain.size(); I != E; ++I) { |
1050 | StoreInst *Store = cast<StoreInst>(Chain[I]); |
1051 | Value *Extract = Store->getValueOperand(); |
1052 | if (Extract->getType() != StoreTy->getScalarType()) |
1053 | Extract = |
1054 | Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType()); |
1055 | |
1056 | Value *Insert = |
1057 | Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I)); |
1058 | Vec = Insert; |
1059 | } |
1060 | } |
1061 | |
1062 | StoreInst *SI = Builder.CreateAlignedStore( |
1063 | Vec, |
1064 | Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)), |
1065 | Alignment); |
1066 | propagateMetadata(SI, Chain); |
1067 | |
1068 | eraseInstructions(Chain); |
1069 | ++NumVectorInstructions; |
1070 | NumScalarsVectorized += Chain.size(); |
1071 | return true; |
1072 | } |
1073 | |
1074 | bool Vectorizer::vectorizeLoadChain( |
1075 | ArrayRef<Instruction *> Chain, |
1076 | SmallPtrSet<Instruction *, 16> *InstructionsProcessed) { |
1077 | LoadInst *L0 = cast<LoadInst>(Chain[0]); |
1078 | |
1079 | // If the vector has an int element, default to int for the whole load. |
1080 | Type *LoadTy; |
1081 | for (const auto &V : Chain) { |
1082 | LoadTy = cast<LoadInst>(V)->getType(); |
1083 | if (LoadTy->isIntOrIntVectorTy()) |
1084 | break; |
1085 | |
1086 | if (LoadTy->isPtrOrPtrVectorTy()) { |
1087 | LoadTy = Type::getIntNTy(F.getParent()->getContext(), |
1088 | DL.getTypeSizeInBits(LoadTy)); |
1089 | break; |
1090 | } |
1091 | } |
1092 | |
1093 | unsigned Sz = DL.getTypeSizeInBits(LoadTy); |
1094 | unsigned AS = L0->getPointerAddressSpace(); |
1095 | unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); |
1096 | unsigned VF = VecRegSize / Sz; |
1097 | unsigned ChainSize = Chain.size(); |
1098 | unsigned Alignment = getAlignment(L0); |
1099 | |
1100 | if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { |
1101 | InstructionsProcessed->insert(Chain.begin(), Chain.end()); |
1102 | return false; |
1103 | } |
1104 | |
1105 | ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain); |
1106 | if (NewChain.empty()) { |
1107 | // No vectorization possible. |
1108 | InstructionsProcessed->insert(Chain.begin(), Chain.end()); |
1109 | return false; |
1110 | } |
1111 | if (NewChain.size() == 1) { |
1112 | // Failed after the first instruction. Discard it and try the smaller chain. |
1113 | InstructionsProcessed->insert(NewChain.front()); |
1114 | return false; |
1115 | } |
1116 | |
1117 | // Update Chain to the valid vectorizable subchain. |
1118 | Chain = NewChain; |
1119 | ChainSize = Chain.size(); |
1120 | |
1121 | // Check if it's legal to vectorize this chain. If not, split the chain and |
1122 | // try again. |
1123 | unsigned EltSzInBytes = Sz / 8; |
1124 | unsigned SzInBytes = EltSzInBytes * ChainSize; |
1125 | VectorType *VecTy; |
1126 | VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy); |
1127 | if (VecLoadTy) |
1128 | VecTy = VectorType::get(LoadTy->getScalarType(), |
1129 | Chain.size() * VecLoadTy->getNumElements()); |
1130 | else |
1131 | VecTy = VectorType::get(LoadTy, Chain.size()); |
1132 | |
1133 | // If it's more than the max vector size or the target has a better |
1134 | // vector factor, break it into two pieces. |
1135 | unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy); |
1136 | if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) { |
1137 | LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Chain doesn't match with the vector factor." " Creating two separate arrays.\n"; } } while (false) |
1138 | " Creating two separate arrays.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Chain doesn't match with the vector factor." " Creating two separate arrays.\n"; } } while (false); |
1139 | return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) | |
1140 | vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed); |
1141 | } |
1142 | |
1143 | // We won't try again to vectorize the elements of the chain, regardless of |
1144 | // whether we succeed below. |
1145 | InstructionsProcessed->insert(Chain.begin(), Chain.end()); |
1146 | |
1147 | // If the load is going to be misaligned, don't vectorize it. |
1148 | if (accessIsMisaligned(SzInBytes, AS, Alignment)) { |
1149 | if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) { |
1150 | auto Chains = splitOddVectorElts(Chain, Sz); |
1151 | return vectorizeLoadChain(Chains.first, InstructionsProcessed) | |
1152 | vectorizeLoadChain(Chains.second, InstructionsProcessed); |
1153 | } |
1154 | |
1155 | unsigned NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(), |
1156 | StackAdjustedAlignment, |
1157 | DL, L0, nullptr, &DT); |
1158 | if (NewAlign != 0) |
1159 | Alignment = NewAlign; |
Value stored to 'Alignment' is never read | |
1160 | |
1161 | Alignment = NewAlign; |
1162 | } |
1163 | |
1164 | if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) { |
1165 | auto Chains = splitOddVectorElts(Chain, Sz); |
1166 | return vectorizeLoadChain(Chains.first, InstructionsProcessed) | |
1167 | vectorizeLoadChain(Chains.second, InstructionsProcessed); |
1168 | } |
1169 | |
1170 | LLVM_DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Loads to vectorize:\n" ; for (Instruction *I : Chain) I->dump(); }; } } while (false ) |
1171 | dbgs() << "LSV: Loads to vectorize:\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Loads to vectorize:\n" ; for (Instruction *I : Chain) I->dump(); }; } } while (false ) |
1172 | for (Instruction *I : Chain)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Loads to vectorize:\n" ; for (Instruction *I : Chain) I->dump(); }; } } while (false ) |
1173 | I->dump();do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Loads to vectorize:\n" ; for (Instruction *I : Chain) I->dump(); }; } } while (false ) |
1174 | })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Loads to vectorize:\n" ; for (Instruction *I : Chain) I->dump(); }; } } while (false ); |
1175 | |
1176 | // getVectorizablePrefix already computed getBoundaryInstrs. The value of |
1177 | // Last may have changed since then, but the value of First won't have. If it |
1178 | // matters, we could compute getBoundaryInstrs only once and reuse it here. |
1179 | BasicBlock::iterator First, Last; |
1180 | std::tie(First, Last) = getBoundaryInstrs(Chain); |
1181 | Builder.SetInsertPoint(&*First); |
1182 | |
1183 | Value *Bitcast = |
1184 | Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS)); |
1185 | LoadInst *LI = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment); |
1186 | propagateMetadata(LI, Chain); |
1187 | |
1188 | if (VecLoadTy) { |
1189 | SmallVector<Instruction *, 16> InstrsToErase; |
1190 | |
1191 | unsigned VecWidth = VecLoadTy->getNumElements(); |
1192 | for (unsigned I = 0, E = Chain.size(); I != E; ++I) { |
1193 | for (auto Use : Chain[I]->users()) { |
1194 | // All users of vector loads are ExtractElement instructions with |
1195 | // constant indices, otherwise we would have bailed before now. |
1196 | Instruction *UI = cast<Instruction>(Use); |
1197 | unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue(); |
1198 | unsigned NewIdx = Idx + I * VecWidth; |
1199 | Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx), |
1200 | UI->getName()); |
1201 | if (V->getType() != UI->getType()) |
1202 | V = Builder.CreateBitCast(V, UI->getType()); |
1203 | |
1204 | // Replace the old instruction. |
1205 | UI->replaceAllUsesWith(V); |
1206 | InstrsToErase.push_back(UI); |
1207 | } |
1208 | } |
1209 | |
1210 | // Bitcast might not be an Instruction, if the value being loaded is a |
1211 | // constant. In that case, no need to reorder anything. |
1212 | if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast)) |
1213 | reorder(BitcastInst); |
1214 | |
1215 | for (auto I : InstrsToErase) |
1216 | I->eraseFromParent(); |
1217 | } else { |
1218 | for (unsigned I = 0, E = Chain.size(); I != E; ++I) { |
1219 | Value *CV = Chain[I]; |
1220 | Value *V = |
1221 | Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName()); |
1222 | if (V->getType() != CV->getType()) { |
1223 | V = Builder.CreateBitOrPointerCast(V, CV->getType()); |
1224 | } |
1225 | |
1226 | // Replace the old instruction. |
1227 | CV->replaceAllUsesWith(V); |
1228 | } |
1229 | |
1230 | if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast)) |
1231 | reorder(BitcastInst); |
1232 | } |
1233 | |
1234 | eraseInstructions(Chain); |
1235 | |
1236 | ++NumVectorInstructions; |
1237 | NumScalarsVectorized += Chain.size(); |
1238 | return true; |
1239 | } |
1240 | |
1241 | bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace, |
1242 | unsigned Alignment) { |
1243 | if (Alignment % SzInBytes == 0) |
1244 | return false; |
1245 | |
1246 | bool Fast = false; |
1247 | bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(), |
1248 | SzInBytes * 8, AddressSpace, |
1249 | Alignment, &Fast); |
1250 | LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allowsdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Target said misaligned is allowed? " << Allows << " and fast? " << Fast << "\n";; } } while (false) |
1251 | << " and fast? " << Fast << "\n";)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Target said misaligned is allowed? " << Allows << " and fast? " << Fast << "\n";; } } while (false); |
1252 | return !Allows || !Fast; |
1253 | } |