File: | lib/Transforms/Vectorize/LoadStoreVectorizer.cpp |
Warning: | line 1108, column 17 1st function call argument is an uninitialized value |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | //===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | // | |||
9 | // This pass merges loads/stores to/from sequential memory addresses into vector | |||
10 | // loads/stores. Although there's nothing GPU-specific in here, this pass is | |||
11 | // motivated by the microarchitectural quirks of nVidia and AMD GPUs. | |||
12 | // | |||
13 | // (For simplicity below we talk about loads only, but everything also applies | |||
14 | // to stores.) | |||
15 | // | |||
16 | // This pass is intended to be run late in the pipeline, after other | |||
17 | // vectorization opportunities have been exploited. So the assumption here is | |||
18 | // that immediately following our new vector load we'll need to extract out the | |||
19 | // individual elements of the load, so we can operate on them individually. | |||
20 | // | |||
21 | // On CPUs this transformation is usually not beneficial, because extracting the | |||
22 | // elements of a vector register is expensive on most architectures. It's | |||
23 | // usually better just to load each element individually into its own scalar | |||
24 | // register. | |||
25 | // | |||
26 | // However, nVidia and AMD GPUs don't have proper vector registers. Instead, a | |||
27 | // "vector load" loads directly into a series of scalar registers. In effect, | |||
28 | // extracting the elements of the vector is free. It's therefore always | |||
29 | // beneficial to vectorize a sequence of loads on these architectures. | |||
30 | // | |||
31 | // Vectorizing (perhaps a better name might be "coalescing") loads can have | |||
32 | // large performance impacts on GPU kernels, and opportunities for vectorizing | |||
33 | // are common in GPU code. This pass tries very hard to find such | |||
34 | // opportunities; its runtime is quadratic in the number of loads in a BB. | |||
35 | // | |||
36 | // Some CPU architectures, such as ARM, have instructions that load into | |||
37 | // multiple scalar registers, similar to a GPU vectorized load. In theory ARM | |||
38 | // could use this pass (with some modifications), but currently it implements | |||
39 | // its own pass to do something similar to what we do here. | |||
40 | ||||
41 | #include "llvm/ADT/APInt.h" | |||
42 | #include "llvm/ADT/ArrayRef.h" | |||
43 | #include "llvm/ADT/MapVector.h" | |||
44 | #include "llvm/ADT/PostOrderIterator.h" | |||
45 | #include "llvm/ADT/STLExtras.h" | |||
46 | #include "llvm/ADT/SmallPtrSet.h" | |||
47 | #include "llvm/ADT/SmallVector.h" | |||
48 | #include "llvm/ADT/Statistic.h" | |||
49 | #include "llvm/ADT/iterator_range.h" | |||
50 | #include "llvm/Analysis/AliasAnalysis.h" | |||
51 | #include "llvm/Analysis/MemoryLocation.h" | |||
52 | #include "llvm/Analysis/OrderedBasicBlock.h" | |||
53 | #include "llvm/Analysis/ScalarEvolution.h" | |||
54 | #include "llvm/Analysis/TargetTransformInfo.h" | |||
55 | #include "llvm/Transforms/Utils/Local.h" | |||
56 | #include "llvm/Analysis/ValueTracking.h" | |||
57 | #include "llvm/Analysis/VectorUtils.h" | |||
58 | #include "llvm/IR/Attributes.h" | |||
59 | #include "llvm/IR/BasicBlock.h" | |||
60 | #include "llvm/IR/Constants.h" | |||
61 | #include "llvm/IR/DataLayout.h" | |||
62 | #include "llvm/IR/DerivedTypes.h" | |||
63 | #include "llvm/IR/Dominators.h" | |||
64 | #include "llvm/IR/Function.h" | |||
65 | #include "llvm/IR/IRBuilder.h" | |||
66 | #include "llvm/IR/InstrTypes.h" | |||
67 | #include "llvm/IR/Instruction.h" | |||
68 | #include "llvm/IR/Instructions.h" | |||
69 | #include "llvm/IR/IntrinsicInst.h" | |||
70 | #include "llvm/IR/Module.h" | |||
71 | #include "llvm/IR/Type.h" | |||
72 | #include "llvm/IR/User.h" | |||
73 | #include "llvm/IR/Value.h" | |||
74 | #include "llvm/Pass.h" | |||
75 | #include "llvm/Support/Casting.h" | |||
76 | #include "llvm/Support/Debug.h" | |||
77 | #include "llvm/Support/KnownBits.h" | |||
78 | #include "llvm/Support/MathExtras.h" | |||
79 | #include "llvm/Support/raw_ostream.h" | |||
80 | #include "llvm/Transforms/Vectorize.h" | |||
81 | #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" | |||
82 | #include <algorithm> | |||
83 | #include <cassert> | |||
84 | #include <cstdlib> | |||
85 | #include <tuple> | |||
86 | #include <utility> | |||
87 | ||||
88 | using namespace llvm; | |||
89 | ||||
90 | #define DEBUG_TYPE"load-store-vectorizer" "load-store-vectorizer" | |||
91 | ||||
92 | STATISTIC(NumVectorInstructions, "Number of vector accesses generated")static llvm::Statistic NumVectorInstructions = {"load-store-vectorizer" , "NumVectorInstructions", "Number of vector accesses generated" , {0}, {false}}; | |||
93 | STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized")static llvm::Statistic NumScalarsVectorized = {"load-store-vectorizer" , "NumScalarsVectorized", "Number of scalar accesses vectorized" , {0}, {false}}; | |||
94 | ||||
95 | // FIXME: Assuming stack alignment of 4 is always good enough | |||
96 | static const unsigned StackAdjustedAlignment = 4; | |||
97 | ||||
98 | namespace { | |||
99 | ||||
100 | /// ChainID is an arbitrary token that is allowed to be different only for the | |||
101 | /// accesses that are guaranteed to be considered non-consecutive by | |||
102 | /// Vectorizer::isConsecutiveAccess. It's used for grouping instructions | |||
103 | /// together and reducing the number of instructions the main search operates on | |||
104 | /// at a time, i.e. this is to reduce compile time and nothing else as the main | |||
105 | /// search has O(n^2) time complexity. The underlying type of ChainID should not | |||
106 | /// be relied upon. | |||
107 | using ChainID = const Value *; | |||
108 | using InstrList = SmallVector<Instruction *, 8>; | |||
109 | using InstrListMap = MapVector<ChainID, InstrList>; | |||
110 | ||||
111 | class Vectorizer { | |||
112 | Function &F; | |||
113 | AliasAnalysis &AA; | |||
114 | DominatorTree &DT; | |||
115 | ScalarEvolution &SE; | |||
116 | TargetTransformInfo &TTI; | |||
117 | const DataLayout &DL; | |||
118 | IRBuilder<> Builder; | |||
119 | ||||
120 | public: | |||
121 | Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT, | |||
122 | ScalarEvolution &SE, TargetTransformInfo &TTI) | |||
123 | : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI), | |||
124 | DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {} | |||
125 | ||||
126 | bool run(); | |||
127 | ||||
128 | private: | |||
129 | unsigned getPointerAddressSpace(Value *I); | |||
130 | ||||
131 | unsigned getAlignment(LoadInst *LI) const { | |||
132 | unsigned Align = LI->getAlignment(); | |||
133 | if (Align != 0) | |||
134 | return Align; | |||
135 | ||||
136 | return DL.getABITypeAlignment(LI->getType()); | |||
137 | } | |||
138 | ||||
139 | unsigned getAlignment(StoreInst *SI) const { | |||
140 | unsigned Align = SI->getAlignment(); | |||
141 | if (Align != 0) | |||
142 | return Align; | |||
143 | ||||
144 | return DL.getABITypeAlignment(SI->getValueOperand()->getType()); | |||
145 | } | |||
146 | ||||
147 | static const unsigned MaxDepth = 3; | |||
148 | ||||
149 | bool isConsecutiveAccess(Value *A, Value *B); | |||
150 | bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta, | |||
151 | unsigned Depth = 0) const; | |||
152 | bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta, | |||
153 | unsigned Depth) const; | |||
154 | bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta, | |||
155 | unsigned Depth) const; | |||
156 | ||||
157 | /// After vectorization, reorder the instructions that I depends on | |||
158 | /// (the instructions defining its operands), to ensure they dominate I. | |||
159 | void reorder(Instruction *I); | |||
160 | ||||
161 | /// Returns the first and the last instructions in Chain. | |||
162 | std::pair<BasicBlock::iterator, BasicBlock::iterator> | |||
163 | getBoundaryInstrs(ArrayRef<Instruction *> Chain); | |||
164 | ||||
165 | /// Erases the original instructions after vectorizing. | |||
166 | void eraseInstructions(ArrayRef<Instruction *> Chain); | |||
167 | ||||
168 | /// "Legalize" the vector type that would be produced by combining \p | |||
169 | /// ElementSizeBits elements in \p Chain. Break into two pieces such that the | |||
170 | /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is | |||
171 | /// expected to have more than 4 elements. | |||
172 | std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>> | |||
173 | splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits); | |||
174 | ||||
175 | /// Finds the largest prefix of Chain that's vectorizable, checking for | |||
176 | /// intervening instructions which may affect the memory accessed by the | |||
177 | /// instructions within Chain. | |||
178 | /// | |||
179 | /// The elements of \p Chain must be all loads or all stores and must be in | |||
180 | /// address order. | |||
181 | ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain); | |||
182 | ||||
183 | /// Collects load and store instructions to vectorize. | |||
184 | std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB); | |||
185 | ||||
186 | /// Processes the collected instructions, the \p Map. The values of \p Map | |||
187 | /// should be all loads or all stores. | |||
188 | bool vectorizeChains(InstrListMap &Map); | |||
189 | ||||
190 | /// Finds the load/stores to consecutive memory addresses and vectorizes them. | |||
191 | bool vectorizeInstructions(ArrayRef<Instruction *> Instrs); | |||
192 | ||||
193 | /// Vectorizes the load instructions in Chain. | |||
194 | bool | |||
195 | vectorizeLoadChain(ArrayRef<Instruction *> Chain, | |||
196 | SmallPtrSet<Instruction *, 16> *InstructionsProcessed); | |||
197 | ||||
198 | /// Vectorizes the store instructions in Chain. | |||
199 | bool | |||
200 | vectorizeStoreChain(ArrayRef<Instruction *> Chain, | |||
201 | SmallPtrSet<Instruction *, 16> *InstructionsProcessed); | |||
202 | ||||
203 | /// Check if this load/store access is misaligned accesses. | |||
204 | bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace, | |||
205 | unsigned Alignment); | |||
206 | }; | |||
207 | ||||
208 | class LoadStoreVectorizerLegacyPass : public FunctionPass { | |||
209 | public: | |||
210 | static char ID; | |||
211 | ||||
212 | LoadStoreVectorizerLegacyPass() : FunctionPass(ID) { | |||
213 | initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry()); | |||
214 | } | |||
215 | ||||
216 | bool runOnFunction(Function &F) override; | |||
217 | ||||
218 | StringRef getPassName() const override { | |||
219 | return "GPU Load and Store Vectorizer"; | |||
220 | } | |||
221 | ||||
222 | void getAnalysisUsage(AnalysisUsage &AU) const override { | |||
223 | AU.addRequired<AAResultsWrapperPass>(); | |||
224 | AU.addRequired<ScalarEvolutionWrapperPass>(); | |||
225 | AU.addRequired<DominatorTreeWrapperPass>(); | |||
226 | AU.addRequired<TargetTransformInfoWrapperPass>(); | |||
227 | AU.setPreservesCFG(); | |||
228 | } | |||
229 | }; | |||
230 | ||||
231 | } // end anonymous namespace | |||
232 | ||||
233 | char LoadStoreVectorizerLegacyPass::ID = 0; | |||
234 | ||||
235 | INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,static void *initializeLoadStoreVectorizerLegacyPassPassOnce( PassRegistry &Registry) { | |||
236 | "Vectorize load and Store instructions", false, false)static void *initializeLoadStoreVectorizerLegacyPassPassOnce( PassRegistry &Registry) { | |||
237 | INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)initializeSCEVAAWrapperPassPass(Registry); | |||
238 | INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)initializeDominatorTreeWrapperPassPass(Registry); | |||
239 | INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)initializeAAResultsWrapperPassPass(Registry); | |||
240 | INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)initializeGlobalsAAWrapperPassPass(Registry); | |||
241 | INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)initializeTargetTransformInfoWrapperPassPass(Registry); | |||
242 | INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,PassInfo *PI = new PassInfo( "Vectorize load and store instructions" , "load-store-vectorizer", &LoadStoreVectorizerLegacyPass ::ID, PassInfo::NormalCtor_t(callDefaultCtor<LoadStoreVectorizerLegacyPass >), false, false); Registry.registerPass(*PI, true); return PI; } static llvm::once_flag InitializeLoadStoreVectorizerLegacyPassPassFlag ; void llvm::initializeLoadStoreVectorizerLegacyPassPass(PassRegistry &Registry) { llvm::call_once(InitializeLoadStoreVectorizerLegacyPassPassFlag , initializeLoadStoreVectorizerLegacyPassPassOnce, std::ref(Registry )); } | |||
243 | "Vectorize load and store instructions", false, false)PassInfo *PI = new PassInfo( "Vectorize load and store instructions" , "load-store-vectorizer", &LoadStoreVectorizerLegacyPass ::ID, PassInfo::NormalCtor_t(callDefaultCtor<LoadStoreVectorizerLegacyPass >), false, false); Registry.registerPass(*PI, true); return PI; } static llvm::once_flag InitializeLoadStoreVectorizerLegacyPassPassFlag ; void llvm::initializeLoadStoreVectorizerLegacyPassPass(PassRegistry &Registry) { llvm::call_once(InitializeLoadStoreVectorizerLegacyPassPassFlag , initializeLoadStoreVectorizerLegacyPassPassOnce, std::ref(Registry )); } | |||
244 | ||||
245 | Pass *llvm::createLoadStoreVectorizerPass() { | |||
246 | return new LoadStoreVectorizerLegacyPass(); | |||
247 | } | |||
248 | ||||
249 | bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) { | |||
250 | // Don't vectorize when the attribute NoImplicitFloat is used. | |||
251 | if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat)) | |||
252 | return false; | |||
253 | ||||
254 | AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); | |||
255 | DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); | |||
256 | ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); | |||
257 | TargetTransformInfo &TTI = | |||
258 | getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); | |||
259 | ||||
260 | Vectorizer V(F, AA, DT, SE, TTI); | |||
261 | return V.run(); | |||
262 | } | |||
263 | ||||
264 | PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { | |||
265 | // Don't vectorize when the attribute NoImplicitFloat is used. | |||
266 | if (F.hasFnAttribute(Attribute::NoImplicitFloat)) | |||
267 | return PreservedAnalyses::all(); | |||
268 | ||||
269 | AliasAnalysis &AA = AM.getResult<AAManager>(F); | |||
270 | DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); | |||
271 | ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F); | |||
272 | TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); | |||
273 | ||||
274 | Vectorizer V(F, AA, DT, SE, TTI); | |||
275 | bool Changed = V.run(); | |||
276 | PreservedAnalyses PA; | |||
277 | PA.preserveSet<CFGAnalyses>(); | |||
278 | return Changed ? PA : PreservedAnalyses::all(); | |||
279 | } | |||
280 | ||||
281 | // The real propagateMetadata expects a SmallVector<Value*>, but we deal in | |||
282 | // vectors of Instructions. | |||
283 | static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) { | |||
284 | SmallVector<Value *, 8> VL(IL.begin(), IL.end()); | |||
285 | propagateMetadata(I, VL); | |||
286 | } | |||
287 | ||||
288 | // Vectorizer Implementation | |||
289 | bool Vectorizer::run() { | |||
290 | bool Changed = false; | |||
291 | ||||
292 | // Scan the blocks in the function in post order. | |||
293 | for (BasicBlock *BB : post_order(&F)) { | |||
294 | InstrListMap LoadRefs, StoreRefs; | |||
295 | std::tie(LoadRefs, StoreRefs) = collectInstructions(BB); | |||
296 | Changed |= vectorizeChains(LoadRefs); | |||
297 | Changed |= vectorizeChains(StoreRefs); | |||
298 | } | |||
299 | ||||
300 | return Changed; | |||
301 | } | |||
302 | ||||
303 | unsigned Vectorizer::getPointerAddressSpace(Value *I) { | |||
304 | if (LoadInst *L = dyn_cast<LoadInst>(I)) | |||
305 | return L->getPointerAddressSpace(); | |||
306 | if (StoreInst *S = dyn_cast<StoreInst>(I)) | |||
307 | return S->getPointerAddressSpace(); | |||
308 | return -1; | |||
309 | } | |||
310 | ||||
311 | // FIXME: Merge with llvm::isConsecutiveAccess | |||
312 | bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { | |||
313 | Value *PtrA = getLoadStorePointerOperand(A); | |||
314 | Value *PtrB = getLoadStorePointerOperand(B); | |||
315 | unsigned ASA = getPointerAddressSpace(A); | |||
316 | unsigned ASB = getPointerAddressSpace(B); | |||
317 | ||||
318 | // Check that the address spaces match and that the pointers are valid. | |||
319 | if (!PtrA || !PtrB || (ASA != ASB)) | |||
320 | return false; | |||
321 | ||||
322 | // Make sure that A and B are different pointers of the same size type. | |||
323 | Type *PtrATy = PtrA->getType()->getPointerElementType(); | |||
324 | Type *PtrBTy = PtrB->getType()->getPointerElementType(); | |||
325 | if (PtrA == PtrB || | |||
326 | PtrATy->isVectorTy() != PtrBTy->isVectorTy() || | |||
327 | DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) || | |||
328 | DL.getTypeStoreSize(PtrATy->getScalarType()) != | |||
329 | DL.getTypeStoreSize(PtrBTy->getScalarType())) | |||
330 | return false; | |||
331 | ||||
332 | unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); | |||
333 | APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy)); | |||
334 | ||||
335 | return areConsecutivePointers(PtrA, PtrB, Size); | |||
336 | } | |||
337 | ||||
338 | bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, | |||
339 | APInt PtrDelta, unsigned Depth) const { | |||
340 | unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType()); | |||
341 | APInt OffsetA(PtrBitWidth, 0); | |||
342 | APInt OffsetB(PtrBitWidth, 0); | |||
343 | PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); | |||
344 | PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); | |||
345 | ||||
346 | unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType()); | |||
347 | ||||
348 | if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType())) | |||
349 | return false; | |||
350 | ||||
351 | // In case if we have to shrink the pointer | |||
352 | // stripAndAccumulateInBoundsConstantOffsets should properly handle a | |||
353 | // possible overflow and the value should fit into a smallest data type | |||
354 | // used in the cast/gep chain. | |||
355 | assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&((OffsetA.getMinSignedBits() <= NewPtrBitWidth && OffsetB .getMinSignedBits() <= NewPtrBitWidth) ? static_cast<void > (0) : __assert_fail ("OffsetA.getMinSignedBits() <= NewPtrBitWidth && OffsetB.getMinSignedBits() <= NewPtrBitWidth" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 356, __PRETTY_FUNCTION__)) | |||
356 | OffsetB.getMinSignedBits() <= NewPtrBitWidth)((OffsetA.getMinSignedBits() <= NewPtrBitWidth && OffsetB .getMinSignedBits() <= NewPtrBitWidth) ? static_cast<void > (0) : __assert_fail ("OffsetA.getMinSignedBits() <= NewPtrBitWidth && OffsetB.getMinSignedBits() <= NewPtrBitWidth" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 356, __PRETTY_FUNCTION__)); | |||
357 | ||||
358 | OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth); | |||
359 | OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth); | |||
360 | PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth); | |||
361 | ||||
362 | APInt OffsetDelta = OffsetB - OffsetA; | |||
363 | ||||
364 | // Check if they are based on the same pointer. That makes the offsets | |||
365 | // sufficient. | |||
366 | if (PtrA == PtrB) | |||
367 | return OffsetDelta == PtrDelta; | |||
368 | ||||
369 | // Compute the necessary base pointer delta to have the necessary final delta | |||
370 | // equal to the pointer delta requested. | |||
371 | APInt BaseDelta = PtrDelta - OffsetDelta; | |||
372 | ||||
373 | // Compute the distance with SCEV between the base pointers. | |||
374 | const SCEV *PtrSCEVA = SE.getSCEV(PtrA); | |||
375 | const SCEV *PtrSCEVB = SE.getSCEV(PtrB); | |||
376 | const SCEV *C = SE.getConstant(BaseDelta); | |||
377 | const SCEV *X = SE.getAddExpr(PtrSCEVA, C); | |||
378 | if (X == PtrSCEVB) | |||
379 | return true; | |||
380 | ||||
381 | // The above check will not catch the cases where one of the pointers is | |||
382 | // factorized but the other one is not, such as (C + (S * (A + B))) vs | |||
383 | // (AS + BS). Get the minus scev. That will allow re-combining the expresions | |||
384 | // and getting the simplified difference. | |||
385 | const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA); | |||
386 | if (C == Dist) | |||
387 | return true; | |||
388 | ||||
389 | // Sometimes even this doesn't work, because SCEV can't always see through | |||
390 | // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking | |||
391 | // things the hard way. | |||
392 | return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth); | |||
393 | } | |||
394 | ||||
395 | bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, | |||
396 | APInt PtrDelta, | |||
397 | unsigned Depth) const { | |||
398 | auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA); | |||
399 | auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB); | |||
400 | if (!GEPA || !GEPB) | |||
401 | return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth); | |||
402 | ||||
403 | // Look through GEPs after checking they're the same except for the last | |||
404 | // index. | |||
405 | if (GEPA->getNumOperands() != GEPB->getNumOperands() || | |||
406 | GEPA->getPointerOperand() != GEPB->getPointerOperand()) | |||
407 | return false; | |||
408 | gep_type_iterator GTIA = gep_type_begin(GEPA); | |||
409 | gep_type_iterator GTIB = gep_type_begin(GEPB); | |||
410 | for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) { | |||
411 | if (GTIA.getOperand() != GTIB.getOperand()) | |||
412 | return false; | |||
413 | ++GTIA; | |||
414 | ++GTIB; | |||
415 | } | |||
416 | ||||
417 | Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand()); | |||
418 | Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand()); | |||
419 | if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() || | |||
420 | OpA->getType() != OpB->getType()) | |||
421 | return false; | |||
422 | ||||
423 | if (PtrDelta.isNegative()) { | |||
424 | if (PtrDelta.isMinSignedValue()) | |||
425 | return false; | |||
426 | PtrDelta.negate(); | |||
427 | std::swap(OpA, OpB); | |||
428 | } | |||
429 | uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType()); | |||
430 | if (PtrDelta.urem(Stride) != 0) | |||
431 | return false; | |||
432 | unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits(); | |||
433 | APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth); | |||
434 | ||||
435 | // Only look through a ZExt/SExt. | |||
436 | if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA)) | |||
437 | return false; | |||
438 | ||||
439 | bool Signed = isa<SExtInst>(OpA); | |||
440 | ||||
441 | // At this point A could be a function parameter, i.e. not an instruction | |||
442 | Value *ValA = OpA->getOperand(0); | |||
443 | OpB = dyn_cast<Instruction>(OpB->getOperand(0)); | |||
444 | if (!OpB || ValA->getType() != OpB->getType()) | |||
445 | return false; | |||
446 | ||||
447 | // Now we need to prove that adding IdxDiff to ValA won't overflow. | |||
448 | bool Safe = false; | |||
449 | // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to | |||
450 | // ValA, we're okay. | |||
451 | if (OpB->getOpcode() == Instruction::Add && | |||
452 | isa<ConstantInt>(OpB->getOperand(1)) && | |||
453 | IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) { | |||
454 | if (Signed) | |||
455 | Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap(); | |||
456 | else | |||
457 | Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap(); | |||
458 | } | |||
459 | ||||
460 | unsigned BitWidth = ValA->getType()->getScalarSizeInBits(); | |||
461 | ||||
462 | // Second attempt: | |||
463 | // If all set bits of IdxDiff or any higher order bit other than the sign bit | |||
464 | // are known to be zero in ValA, we can add Diff to it while guaranteeing no | |||
465 | // overflow of any sort. | |||
466 | if (!Safe) { | |||
467 | OpA = dyn_cast<Instruction>(ValA); | |||
468 | if (!OpA) | |||
469 | return false; | |||
470 | KnownBits Known(BitWidth); | |||
471 | computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT); | |||
472 | APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth()); | |||
473 | if (Signed) | |||
474 | BitsAllowedToBeSet.clearBit(BitWidth - 1); | |||
475 | if (BitsAllowedToBeSet.ult(IdxDiff)) | |||
476 | return false; | |||
477 | } | |||
478 | ||||
479 | const SCEV *OffsetSCEVA = SE.getSCEV(ValA); | |||
480 | const SCEV *OffsetSCEVB = SE.getSCEV(OpB); | |||
481 | const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth)); | |||
482 | const SCEV *X = SE.getAddExpr(OffsetSCEVA, C); | |||
483 | return X == OffsetSCEVB; | |||
484 | } | |||
485 | ||||
486 | bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB, | |||
487 | const APInt &PtrDelta, | |||
488 | unsigned Depth) const { | |||
489 | if (Depth++ == MaxDepth) | |||
490 | return false; | |||
491 | ||||
492 | if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) { | |||
493 | if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) { | |||
494 | return SelectA->getCondition() == SelectB->getCondition() && | |||
495 | areConsecutivePointers(SelectA->getTrueValue(), | |||
496 | SelectB->getTrueValue(), PtrDelta, Depth) && | |||
497 | areConsecutivePointers(SelectA->getFalseValue(), | |||
498 | SelectB->getFalseValue(), PtrDelta, Depth); | |||
499 | } | |||
500 | } | |||
501 | return false; | |||
502 | } | |||
503 | ||||
504 | void Vectorizer::reorder(Instruction *I) { | |||
505 | OrderedBasicBlock OBB(I->getParent()); | |||
506 | SmallPtrSet<Instruction *, 16> InstructionsToMove; | |||
507 | SmallVector<Instruction *, 16> Worklist; | |||
508 | ||||
509 | Worklist.push_back(I); | |||
510 | while (!Worklist.empty()) { | |||
511 | Instruction *IW = Worklist.pop_back_val(); | |||
512 | int NumOperands = IW->getNumOperands(); | |||
513 | for (int i = 0; i < NumOperands; i++) { | |||
514 | Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i)); | |||
515 | if (!IM || IM->getOpcode() == Instruction::PHI) | |||
516 | continue; | |||
517 | ||||
518 | // If IM is in another BB, no need to move it, because this pass only | |||
519 | // vectorizes instructions within one BB. | |||
520 | if (IM->getParent() != I->getParent()) | |||
521 | continue; | |||
522 | ||||
523 | if (!OBB.dominates(IM, I)) { | |||
524 | InstructionsToMove.insert(IM); | |||
525 | Worklist.push_back(IM); | |||
526 | } | |||
527 | } | |||
528 | } | |||
529 | ||||
530 | // All instructions to move should follow I. Start from I, not from begin(). | |||
531 | for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E; | |||
532 | ++BBI) { | |||
533 | if (!InstructionsToMove.count(&*BBI)) | |||
534 | continue; | |||
535 | Instruction *IM = &*BBI; | |||
536 | --BBI; | |||
537 | IM->removeFromParent(); | |||
538 | IM->insertBefore(I); | |||
539 | } | |||
540 | } | |||
541 | ||||
542 | std::pair<BasicBlock::iterator, BasicBlock::iterator> | |||
543 | Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) { | |||
544 | Instruction *C0 = Chain[0]; | |||
545 | BasicBlock::iterator FirstInstr = C0->getIterator(); | |||
546 | BasicBlock::iterator LastInstr = C0->getIterator(); | |||
547 | ||||
548 | BasicBlock *BB = C0->getParent(); | |||
549 | unsigned NumFound = 0; | |||
550 | for (Instruction &I : *BB) { | |||
551 | if (!is_contained(Chain, &I)) | |||
552 | continue; | |||
553 | ||||
554 | ++NumFound; | |||
555 | if (NumFound == 1) { | |||
556 | FirstInstr = I.getIterator(); | |||
557 | } | |||
558 | if (NumFound == Chain.size()) { | |||
559 | LastInstr = I.getIterator(); | |||
560 | break; | |||
561 | } | |||
562 | } | |||
563 | ||||
564 | // Range is [first, last). | |||
565 | return std::make_pair(FirstInstr, ++LastInstr); | |||
566 | } | |||
567 | ||||
568 | void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) { | |||
569 | SmallVector<Instruction *, 16> Instrs; | |||
570 | for (Instruction *I : Chain) { | |||
571 | Value *PtrOperand = getLoadStorePointerOperand(I); | |||
572 | assert(PtrOperand && "Instruction must have a pointer operand.")((PtrOperand && "Instruction must have a pointer operand." ) ? static_cast<void> (0) : __assert_fail ("PtrOperand && \"Instruction must have a pointer operand.\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 572, __PRETTY_FUNCTION__)); | |||
573 | Instrs.push_back(I); | |||
574 | if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand)) | |||
575 | Instrs.push_back(GEP); | |||
576 | } | |||
577 | ||||
578 | // Erase instructions. | |||
579 | for (Instruction *I : Instrs) | |||
580 | if (I->use_empty()) | |||
581 | I->eraseFromParent(); | |||
582 | } | |||
583 | ||||
584 | std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>> | |||
585 | Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain, | |||
586 | unsigned ElementSizeBits) { | |||
587 | unsigned ElementSizeBytes = ElementSizeBits / 8; | |||
588 | unsigned SizeBytes = ElementSizeBytes * Chain.size(); | |||
589 | unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes; | |||
590 | if (NumLeft == Chain.size()) { | |||
591 | if ((NumLeft & 1) == 0) | |||
592 | NumLeft /= 2; // Split even in half | |||
593 | else | |||
594 | --NumLeft; // Split off last element | |||
595 | } else if (NumLeft == 0) | |||
596 | NumLeft = 1; | |||
597 | return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft)); | |||
598 | } | |||
599 | ||||
600 | ArrayRef<Instruction *> | |||
601 | Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { | |||
602 | // These are in BB order, unlike Chain, which is in address order. | |||
603 | SmallVector<Instruction *, 16> MemoryInstrs; | |||
604 | SmallVector<Instruction *, 16> ChainInstrs; | |||
605 | ||||
606 | bool IsLoadChain = isa<LoadInst>(Chain[0]); | |||
607 | LLVM_DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { for (Instruction *I : Chain) { if (IsLoadChain) ((isa<LoadInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<LoadInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 611, __PRETTY_FUNCTION__)); else ((isa<StoreInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<StoreInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 614, __PRETTY_FUNCTION__)); } }; } } while (false) | |||
608 | for (Instruction *I : Chain) {do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { for (Instruction *I : Chain) { if (IsLoadChain) ((isa<LoadInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<LoadInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 611, __PRETTY_FUNCTION__)); else ((isa<StoreInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<StoreInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 614, __PRETTY_FUNCTION__)); } }; } } while (false) | |||
609 | if (IsLoadChain)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { for (Instruction *I : Chain) { if (IsLoadChain) ((isa<LoadInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<LoadInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 611, __PRETTY_FUNCTION__)); else ((isa<StoreInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<StoreInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 614, __PRETTY_FUNCTION__)); } }; } } while (false) | |||
610 | assert(isa<LoadInst>(I) &&do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { for (Instruction *I : Chain) { if (IsLoadChain) ((isa<LoadInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<LoadInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 611, __PRETTY_FUNCTION__)); else ((isa<StoreInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<StoreInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 614, __PRETTY_FUNCTION__)); } }; } } while (false) | |||
611 | "All elements of Chain must be loads, or all must be stores.");do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { for (Instruction *I : Chain) { if (IsLoadChain) ((isa<LoadInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<LoadInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 611, __PRETTY_FUNCTION__)); else ((isa<StoreInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<StoreInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 614, __PRETTY_FUNCTION__)); } }; } } while (false) | |||
612 | elsedo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { for (Instruction *I : Chain) { if (IsLoadChain) ((isa<LoadInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<LoadInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 611, __PRETTY_FUNCTION__)); else ((isa<StoreInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<StoreInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 614, __PRETTY_FUNCTION__)); } }; } } while (false) | |||
613 | assert(isa<StoreInst>(I) &&do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { for (Instruction *I : Chain) { if (IsLoadChain) ((isa<LoadInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<LoadInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 611, __PRETTY_FUNCTION__)); else ((isa<StoreInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<StoreInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 614, __PRETTY_FUNCTION__)); } }; } } while (false) | |||
614 | "All elements of Chain must be loads, or all must be stores.");do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { for (Instruction *I : Chain) { if (IsLoadChain) ((isa<LoadInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<LoadInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 611, __PRETTY_FUNCTION__)); else ((isa<StoreInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<StoreInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 614, __PRETTY_FUNCTION__)); } }; } } while (false) | |||
615 | }do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { for (Instruction *I : Chain) { if (IsLoadChain) ((isa<LoadInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<LoadInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 611, __PRETTY_FUNCTION__)); else ((isa<StoreInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<StoreInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 614, __PRETTY_FUNCTION__)); } }; } } while (false) | |||
616 | })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { for (Instruction *I : Chain) { if (IsLoadChain) ((isa<LoadInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<LoadInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 611, __PRETTY_FUNCTION__)); else ((isa<StoreInst>(I) && "All elements of Chain must be loads, or all must be stores." ) ? static_cast<void> (0) : __assert_fail ("isa<StoreInst>(I) && \"All elements of Chain must be loads, or all must be stores.\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 614, __PRETTY_FUNCTION__)); } }; } } while (false); | |||
617 | ||||
618 | for (Instruction &I : make_range(getBoundaryInstrs(Chain))) { | |||
619 | if (isa<LoadInst>(I) || isa<StoreInst>(I)) { | |||
620 | if (!is_contained(Chain, &I)) | |||
621 | MemoryInstrs.push_back(&I); | |||
622 | else | |||
623 | ChainInstrs.push_back(&I); | |||
624 | } else if (isa<IntrinsicInst>(&I) && | |||
625 | cast<IntrinsicInst>(&I)->getIntrinsicID() == | |||
626 | Intrinsic::sideeffect) { | |||
627 | // Ignore llvm.sideeffect calls. | |||
628 | } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) { | |||
629 | LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << Ido { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Found may-write/throw operation: " << I << '\n'; } } while (false) | |||
630 | << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Found may-write/throw operation: " << I << '\n'; } } while (false); | |||
631 | break; | |||
632 | } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) { | |||
633 | LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << Ido { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Found may-read/write/throw operation: " << I << '\n'; } } while (false) | |||
634 | << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Found may-read/write/throw operation: " << I << '\n'; } } while (false); | |||
635 | break; | |||
636 | } | |||
637 | } | |||
638 | ||||
639 | OrderedBasicBlock OBB(Chain[0]->getParent()); | |||
640 | ||||
641 | // Loop until we find an instruction in ChainInstrs that we can't vectorize. | |||
642 | unsigned ChainInstrIdx = 0; | |||
643 | Instruction *BarrierMemoryInstr = nullptr; | |||
644 | ||||
645 | for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) { | |||
646 | Instruction *ChainInstr = ChainInstrs[ChainInstrIdx]; | |||
647 | ||||
648 | // If a barrier memory instruction was found, chain instructions that follow | |||
649 | // will not be added to the valid prefix. | |||
650 | if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr)) | |||
651 | break; | |||
652 | ||||
653 | // Check (in BB order) if any instruction prevents ChainInstr from being | |||
654 | // vectorized. Find and store the first such "conflicting" instruction. | |||
655 | for (Instruction *MemInstr : MemoryInstrs) { | |||
656 | // If a barrier memory instruction was found, do not check past it. | |||
657 | if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr)) | |||
658 | break; | |||
659 | ||||
660 | auto *MemLoad = dyn_cast<LoadInst>(MemInstr); | |||
661 | auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr); | |||
662 | if (MemLoad && ChainLoad) | |||
663 | continue; | |||
664 | ||||
665 | // We can ignore the alias if the we have a load store pair and the load | |||
666 | // is known to be invariant. The load cannot be clobbered by the store. | |||
667 | auto IsInvariantLoad = [](const LoadInst *LI) -> bool { | |||
668 | return LI->hasMetadata(LLVMContext::MD_invariant_load); | |||
669 | }; | |||
670 | ||||
671 | // We can ignore the alias as long as the load comes before the store, | |||
672 | // because that means we won't be moving the load past the store to | |||
673 | // vectorize it (the vectorized load is inserted at the location of the | |||
674 | // first load in the chain). | |||
675 | if (isa<StoreInst>(MemInstr) && ChainLoad && | |||
676 | (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr))) | |||
677 | continue; | |||
678 | ||||
679 | // Same case, but in reverse. | |||
680 | if (MemLoad && isa<StoreInst>(ChainInstr) && | |||
681 | (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr))) | |||
682 | continue; | |||
683 | ||||
684 | if (!AA.isNoAlias(MemoryLocation::get(MemInstr), | |||
685 | MemoryLocation::get(ChainInstr))) { | |||
686 | LLVM_DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Found alias:\n" " Aliasing instruction and pointer:\n" << " " << *MemInstr << '\n' << " " << *getLoadStorePointerOperand (MemInstr) << '\n' << " Aliased instruction and pointer:\n" << " " << *ChainInstr << '\n' << " " << *getLoadStorePointerOperand(ChainInstr) << '\n' ; }; } } while (false) | |||
687 | dbgs() << "LSV: Found alias:\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Found alias:\n" " Aliasing instruction and pointer:\n" << " " << *MemInstr << '\n' << " " << *getLoadStorePointerOperand (MemInstr) << '\n' << " Aliased instruction and pointer:\n" << " " << *ChainInstr << '\n' << " " << *getLoadStorePointerOperand(ChainInstr) << '\n' ; }; } } while (false) | |||
688 | " Aliasing instruction and pointer:\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Found alias:\n" " Aliasing instruction and pointer:\n" << " " << *MemInstr << '\n' << " " << *getLoadStorePointerOperand (MemInstr) << '\n' << " Aliased instruction and pointer:\n" << " " << *ChainInstr << '\n' << " " << *getLoadStorePointerOperand(ChainInstr) << '\n' ; }; } } while (false) | |||
689 | << " " << *MemInstr << '\n'do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Found alias:\n" " Aliasing instruction and pointer:\n" << " " << *MemInstr << '\n' << " " << *getLoadStorePointerOperand (MemInstr) << '\n' << " Aliased instruction and pointer:\n" << " " << *ChainInstr << '\n' << " " << *getLoadStorePointerOperand(ChainInstr) << '\n' ; }; } } while (false) | |||
690 | << " " << *getLoadStorePointerOperand(MemInstr) << '\n'do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Found alias:\n" " Aliasing instruction and pointer:\n" << " " << *MemInstr << '\n' << " " << *getLoadStorePointerOperand (MemInstr) << '\n' << " Aliased instruction and pointer:\n" << " " << *ChainInstr << '\n' << " " << *getLoadStorePointerOperand(ChainInstr) << '\n' ; }; } } while (false) | |||
691 | << " Aliased instruction and pointer:\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Found alias:\n" " Aliasing instruction and pointer:\n" << " " << *MemInstr << '\n' << " " << *getLoadStorePointerOperand (MemInstr) << '\n' << " Aliased instruction and pointer:\n" << " " << *ChainInstr << '\n' << " " << *getLoadStorePointerOperand(ChainInstr) << '\n' ; }; } } while (false) | |||
692 | << " " << *ChainInstr << '\n'do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Found alias:\n" " Aliasing instruction and pointer:\n" << " " << *MemInstr << '\n' << " " << *getLoadStorePointerOperand (MemInstr) << '\n' << " Aliased instruction and pointer:\n" << " " << *ChainInstr << '\n' << " " << *getLoadStorePointerOperand(ChainInstr) << '\n' ; }; } } while (false) | |||
693 | << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Found alias:\n" " Aliasing instruction and pointer:\n" << " " << *MemInstr << '\n' << " " << *getLoadStorePointerOperand (MemInstr) << '\n' << " Aliased instruction and pointer:\n" << " " << *ChainInstr << '\n' << " " << *getLoadStorePointerOperand(ChainInstr) << '\n' ; }; } } while (false) | |||
694 | })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Found alias:\n" " Aliasing instruction and pointer:\n" << " " << *MemInstr << '\n' << " " << *getLoadStorePointerOperand (MemInstr) << '\n' << " Aliased instruction and pointer:\n" << " " << *ChainInstr << '\n' << " " << *getLoadStorePointerOperand(ChainInstr) << '\n' ; }; } } while (false); | |||
695 | // Save this aliasing memory instruction as a barrier, but allow other | |||
696 | // instructions that precede the barrier to be vectorized with this one. | |||
697 | BarrierMemoryInstr = MemInstr; | |||
698 | break; | |||
699 | } | |||
700 | } | |||
701 | // Continue the search only for store chains, since vectorizing stores that | |||
702 | // precede an aliasing load is valid. Conversely, vectorizing loads is valid | |||
703 | // up to an aliasing store, but should not pull loads from further down in | |||
704 | // the basic block. | |||
705 | if (IsLoadChain && BarrierMemoryInstr) { | |||
706 | // The BarrierMemoryInstr is a store that precedes ChainInstr. | |||
707 | assert(OBB.dominates(BarrierMemoryInstr, ChainInstr))((OBB.dominates(BarrierMemoryInstr, ChainInstr)) ? static_cast <void> (0) : __assert_fail ("OBB.dominates(BarrierMemoryInstr, ChainInstr)" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 707, __PRETTY_FUNCTION__)); | |||
708 | break; | |||
709 | } | |||
710 | } | |||
711 | ||||
712 | // Find the largest prefix of Chain whose elements are all in | |||
713 | // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of | |||
714 | // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB | |||
715 | // order.) | |||
716 | SmallPtrSet<Instruction *, 8> VectorizableChainInstrs( | |||
717 | ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx); | |||
718 | unsigned ChainIdx = 0; | |||
719 | for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) { | |||
720 | if (!VectorizableChainInstrs.count(Chain[ChainIdx])) | |||
721 | break; | |||
722 | } | |||
723 | return Chain.slice(0, ChainIdx); | |||
724 | } | |||
725 | ||||
726 | static ChainID getChainID(const Value *Ptr, const DataLayout &DL) { | |||
727 | const Value *ObjPtr = GetUnderlyingObject(Ptr, DL); | |||
728 | if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) { | |||
729 | // The select's themselves are distinct instructions even if they share the | |||
730 | // same condition and evaluate to consecutive pointers for true and false | |||
731 | // values of the condition. Therefore using the select's themselves for | |||
732 | // grouping instructions would put consecutive accesses into different lists | |||
733 | // and they won't be even checked for being consecutive, and won't be | |||
734 | // vectorized. | |||
735 | return Sel->getCondition(); | |||
736 | } | |||
737 | return ObjPtr; | |||
738 | } | |||
739 | ||||
740 | std::pair<InstrListMap, InstrListMap> | |||
741 | Vectorizer::collectInstructions(BasicBlock *BB) { | |||
742 | InstrListMap LoadRefs; | |||
743 | InstrListMap StoreRefs; | |||
744 | ||||
745 | for (Instruction &I : *BB) { | |||
746 | if (!I.mayReadOrWriteMemory()) | |||
747 | continue; | |||
748 | ||||
749 | if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { | |||
750 | if (!LI->isSimple()) | |||
751 | continue; | |||
752 | ||||
753 | // Skip if it's not legal. | |||
754 | if (!TTI.isLegalToVectorizeLoad(LI)) | |||
755 | continue; | |||
756 | ||||
757 | Type *Ty = LI->getType(); | |||
758 | if (!VectorType::isValidElementType(Ty->getScalarType())) | |||
759 | continue; | |||
760 | ||||
761 | // Skip weird non-byte sizes. They probably aren't worth the effort of | |||
762 | // handling correctly. | |||
763 | unsigned TySize = DL.getTypeSizeInBits(Ty); | |||
764 | if ((TySize % 8) != 0) | |||
765 | continue; | |||
766 | ||||
767 | // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain | |||
768 | // functions are currently using an integer type for the vectorized | |||
769 | // load/store, and does not support casting between the integer type and a | |||
770 | // vector of pointers (e.g. i64 to <2 x i16*>) | |||
771 | if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) | |||
772 | continue; | |||
773 | ||||
774 | Value *Ptr = LI->getPointerOperand(); | |||
775 | unsigned AS = Ptr->getType()->getPointerAddressSpace(); | |||
776 | unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); | |||
777 | ||||
778 | unsigned VF = VecRegSize / TySize; | |||
779 | VectorType *VecTy = dyn_cast<VectorType>(Ty); | |||
780 | ||||
781 | // No point in looking at these if they're too big to vectorize. | |||
782 | if (TySize > VecRegSize / 2 || | |||
783 | (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) | |||
784 | continue; | |||
785 | ||||
786 | // Make sure all the users of a vector are constant-index extracts. | |||
787 | if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) { | |||
788 | const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U); | |||
789 | return EEI && isa<ConstantInt>(EEI->getOperand(1)); | |||
790 | })) | |||
791 | continue; | |||
792 | ||||
793 | // Save the load locations. | |||
794 | const ChainID ID = getChainID(Ptr, DL); | |||
795 | LoadRefs[ID].push_back(LI); | |||
796 | } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { | |||
797 | if (!SI->isSimple()) | |||
798 | continue; | |||
799 | ||||
800 | // Skip if it's not legal. | |||
801 | if (!TTI.isLegalToVectorizeStore(SI)) | |||
802 | continue; | |||
803 | ||||
804 | Type *Ty = SI->getValueOperand()->getType(); | |||
805 | if (!VectorType::isValidElementType(Ty->getScalarType())) | |||
806 | continue; | |||
807 | ||||
808 | // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain | |||
809 | // functions are currently using an integer type for the vectorized | |||
810 | // load/store, and does not support casting between the integer type and a | |||
811 | // vector of pointers (e.g. i64 to <2 x i16*>) | |||
812 | if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) | |||
813 | continue; | |||
814 | ||||
815 | // Skip weird non-byte sizes. They probably aren't worth the effort of | |||
816 | // handling correctly. | |||
817 | unsigned TySize = DL.getTypeSizeInBits(Ty); | |||
818 | if ((TySize % 8) != 0) | |||
819 | continue; | |||
820 | ||||
821 | Value *Ptr = SI->getPointerOperand(); | |||
822 | unsigned AS = Ptr->getType()->getPointerAddressSpace(); | |||
823 | unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); | |||
824 | ||||
825 | unsigned VF = VecRegSize / TySize; | |||
826 | VectorType *VecTy = dyn_cast<VectorType>(Ty); | |||
827 | ||||
828 | // No point in looking at these if they're too big to vectorize. | |||
829 | if (TySize > VecRegSize / 2 || | |||
830 | (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) | |||
831 | continue; | |||
832 | ||||
833 | if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) { | |||
834 | const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U); | |||
835 | return EEI && isa<ConstantInt>(EEI->getOperand(1)); | |||
836 | })) | |||
837 | continue; | |||
838 | ||||
839 | // Save store location. | |||
840 | const ChainID ID = getChainID(Ptr, DL); | |||
841 | StoreRefs[ID].push_back(SI); | |||
842 | } | |||
843 | } | |||
844 | ||||
845 | return {LoadRefs, StoreRefs}; | |||
846 | } | |||
847 | ||||
848 | bool Vectorizer::vectorizeChains(InstrListMap &Map) { | |||
849 | bool Changed = false; | |||
850 | ||||
851 | for (const std::pair<ChainID, InstrList> &Chain : Map) { | |||
852 | unsigned Size = Chain.second.size(); | |||
853 | if (Size < 2) | |||
| ||||
854 | continue; | |||
855 | ||||
856 | LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n"; } } while (false); | |||
857 | ||||
858 | // Process the stores in chunks of 64. | |||
859 | for (unsigned CI = 0, CE = Size; CI
| |||
860 | unsigned Len = std::min<unsigned>(CE - CI, 64); | |||
861 | ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len); | |||
862 | Changed |= vectorizeInstructions(Chunk); | |||
863 | } | |||
864 | } | |||
865 | ||||
866 | return Changed; | |||
867 | } | |||
868 | ||||
869 | bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) { | |||
870 | LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n"; } } while (false) | |||
871 | << " instructions.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n"; } } while (false); | |||
872 | SmallVector<int, 16> Heads, Tails; | |||
873 | int ConsecutiveChain[64]; | |||
874 | ||||
875 | // Do a quadratic search on all of the given loads/stores and find all of the | |||
876 | // pairs of loads/stores that follow each other. | |||
877 | for (int i = 0, e = Instrs.size(); i < e; ++i) { | |||
878 | ConsecutiveChain[i] = -1; | |||
879 | for (int j = e - 1; j >= 0; --j) { | |||
880 | if (i == j) | |||
881 | continue; | |||
882 | ||||
883 | if (isConsecutiveAccess(Instrs[i], Instrs[j])) { | |||
884 | if (ConsecutiveChain[i] != -1) { | |||
885 | int CurDistance = std::abs(ConsecutiveChain[i] - i); | |||
886 | int NewDistance = std::abs(ConsecutiveChain[i] - j); | |||
887 | if (j < i || NewDistance > CurDistance) | |||
888 | continue; // Should not insert. | |||
889 | } | |||
890 | ||||
891 | Tails.push_back(j); | |||
892 | Heads.push_back(i); | |||
893 | ConsecutiveChain[i] = j; | |||
894 | } | |||
895 | } | |||
896 | } | |||
897 | ||||
898 | bool Changed = false; | |||
899 | SmallPtrSet<Instruction *, 16> InstructionsProcessed; | |||
900 | ||||
901 | for (int Head : Heads) { | |||
902 | if (InstructionsProcessed.count(Instrs[Head])) | |||
903 | continue; | |||
904 | bool LongerChainExists = false; | |||
905 | for (unsigned TIt = 0; TIt < Tails.size(); TIt++) | |||
906 | if (Head == Tails[TIt] && | |||
907 | !InstructionsProcessed.count(Instrs[Heads[TIt]])) { | |||
908 | LongerChainExists = true; | |||
909 | break; | |||
910 | } | |||
911 | if (LongerChainExists
| |||
912 | continue; | |||
913 | ||||
914 | // We found an instr that starts a chain. Now follow the chain and try to | |||
915 | // vectorize it. | |||
916 | SmallVector<Instruction *, 16> Operands; | |||
917 | int I = Head; | |||
918 | while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) { | |||
919 | if (InstructionsProcessed.count(Instrs[I])) | |||
920 | break; | |||
921 | ||||
922 | Operands.push_back(Instrs[I]); | |||
923 | I = ConsecutiveChain[I]; | |||
924 | } | |||
925 | ||||
926 | bool Vectorized = false; | |||
927 | if (isa<LoadInst>(*Operands.begin())) | |||
928 | Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed); | |||
929 | else | |||
930 | Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed); | |||
931 | ||||
932 | Changed |= Vectorized; | |||
933 | } | |||
934 | ||||
935 | return Changed; | |||
936 | } | |||
937 | ||||
938 | bool Vectorizer::vectorizeStoreChain( | |||
939 | ArrayRef<Instruction *> Chain, | |||
940 | SmallPtrSet<Instruction *, 16> *InstructionsProcessed) { | |||
941 | StoreInst *S0 = cast<StoreInst>(Chain[0]); | |||
942 | ||||
943 | // If the vector has an int element, default to int for the whole store. | |||
944 | Type *StoreTy = nullptr; | |||
945 | for (Instruction *I : Chain) { | |||
946 | StoreTy = cast<StoreInst>(I)->getValueOperand()->getType(); | |||
947 | if (StoreTy->isIntOrIntVectorTy()) | |||
948 | break; | |||
949 | ||||
950 | if (StoreTy->isPtrOrPtrVectorTy()) { | |||
951 | StoreTy = Type::getIntNTy(F.getParent()->getContext(), | |||
952 | DL.getTypeSizeInBits(StoreTy)); | |||
953 | break; | |||
954 | } | |||
955 | } | |||
956 | assert(StoreTy && "Failed to find store type")((StoreTy && "Failed to find store type") ? static_cast <void> (0) : __assert_fail ("StoreTy && \"Failed to find store type\"" , "/build/llvm-toolchain-snapshot-10~svn371925/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp" , 956, __PRETTY_FUNCTION__)); | |||
957 | ||||
958 | unsigned Sz = DL.getTypeSizeInBits(StoreTy); | |||
959 | unsigned AS = S0->getPointerAddressSpace(); | |||
960 | unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); | |||
961 | unsigned VF = VecRegSize / Sz; | |||
962 | unsigned ChainSize = Chain.size(); | |||
963 | unsigned Alignment = getAlignment(S0); | |||
964 | ||||
965 | if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { | |||
966 | InstructionsProcessed->insert(Chain.begin(), Chain.end()); | |||
967 | return false; | |||
968 | } | |||
969 | ||||
970 | ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain); | |||
971 | if (NewChain.empty()) { | |||
972 | // No vectorization possible. | |||
973 | InstructionsProcessed->insert(Chain.begin(), Chain.end()); | |||
974 | return false; | |||
975 | } | |||
976 | if (NewChain.size() == 1) { | |||
977 | // Failed after the first instruction. Discard it and try the smaller chain. | |||
978 | InstructionsProcessed->insert(NewChain.front()); | |||
979 | return false; | |||
980 | } | |||
981 | ||||
982 | // Update Chain to the valid vectorizable subchain. | |||
983 | Chain = NewChain; | |||
984 | ChainSize = Chain.size(); | |||
985 | ||||
986 | // Check if it's legal to vectorize this chain. If not, split the chain and | |||
987 | // try again. | |||
988 | unsigned EltSzInBytes = Sz / 8; | |||
989 | unsigned SzInBytes = EltSzInBytes * ChainSize; | |||
990 | ||||
991 | VectorType *VecTy; | |||
992 | VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy); | |||
993 | if (VecStoreTy) | |||
994 | VecTy = VectorType::get(StoreTy->getScalarType(), | |||
995 | Chain.size() * VecStoreTy->getNumElements()); | |||
996 | else | |||
997 | VecTy = VectorType::get(StoreTy, Chain.size()); | |||
998 | ||||
999 | // If it's more than the max vector size or the target has a better | |||
1000 | // vector factor, break it into two pieces. | |||
1001 | unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy); | |||
1002 | if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) { | |||
1003 | LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Chain doesn't match with the vector factor." " Creating two separate arrays.\n"; } } while (false) | |||
1004 | " Creating two separate arrays.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Chain doesn't match with the vector factor." " Creating two separate arrays.\n"; } } while (false); | |||
1005 | return vectorizeStoreChain(Chain.slice(0, TargetVF), | |||
1006 | InstructionsProcessed) | | |||
1007 | vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed); | |||
1008 | } | |||
1009 | ||||
1010 | LLVM_DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Stores to vectorize:\n" ; for (Instruction *I : Chain) dbgs() << " " << * I << "\n"; }; } } while (false) | |||
1011 | dbgs() << "LSV: Stores to vectorize:\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Stores to vectorize:\n" ; for (Instruction *I : Chain) dbgs() << " " << * I << "\n"; }; } } while (false) | |||
1012 | for (Instruction *I : Chain)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Stores to vectorize:\n" ; for (Instruction *I : Chain) dbgs() << " " << * I << "\n"; }; } } while (false) | |||
1013 | dbgs() << " " << *I << "\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Stores to vectorize:\n" ; for (Instruction *I : Chain) dbgs() << " " << * I << "\n"; }; } } while (false) | |||
1014 | })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Stores to vectorize:\n" ; for (Instruction *I : Chain) dbgs() << " " << * I << "\n"; }; } } while (false); | |||
1015 | ||||
1016 | // We won't try again to vectorize the elements of the chain, regardless of | |||
1017 | // whether we succeed below. | |||
1018 | InstructionsProcessed->insert(Chain.begin(), Chain.end()); | |||
1019 | ||||
1020 | // If the store is going to be misaligned, don't vectorize it. | |||
1021 | if (accessIsMisaligned(SzInBytes, AS, Alignment)) { | |||
1022 | if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) { | |||
1023 | auto Chains = splitOddVectorElts(Chain, Sz); | |||
1024 | return vectorizeStoreChain(Chains.first, InstructionsProcessed) | | |||
1025 | vectorizeStoreChain(Chains.second, InstructionsProcessed); | |||
1026 | } | |||
1027 | ||||
1028 | unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(), | |||
1029 | StackAdjustedAlignment, | |||
1030 | DL, S0, nullptr, &DT); | |||
1031 | if (NewAlign != 0) | |||
1032 | Alignment = NewAlign; | |||
1033 | } | |||
1034 | ||||
1035 | if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) { | |||
1036 | auto Chains = splitOddVectorElts(Chain, Sz); | |||
1037 | return vectorizeStoreChain(Chains.first, InstructionsProcessed) | | |||
1038 | vectorizeStoreChain(Chains.second, InstructionsProcessed); | |||
1039 | } | |||
1040 | ||||
1041 | BasicBlock::iterator First, Last; | |||
1042 | std::tie(First, Last) = getBoundaryInstrs(Chain); | |||
1043 | Builder.SetInsertPoint(&*Last); | |||
1044 | ||||
1045 | Value *Vec = UndefValue::get(VecTy); | |||
1046 | ||||
1047 | if (VecStoreTy) { | |||
1048 | unsigned VecWidth = VecStoreTy->getNumElements(); | |||
1049 | for (unsigned I = 0, E = Chain.size(); I != E; ++I) { | |||
1050 | StoreInst *Store = cast<StoreInst>(Chain[I]); | |||
1051 | for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) { | |||
1052 | unsigned NewIdx = J + I * VecWidth; | |||
1053 | Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(), | |||
1054 | Builder.getInt32(J)); | |||
1055 | if (Extract->getType() != StoreTy->getScalarType()) | |||
1056 | Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType()); | |||
1057 | ||||
1058 | Value *Insert = | |||
1059 | Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx)); | |||
1060 | Vec = Insert; | |||
1061 | } | |||
1062 | } | |||
1063 | } else { | |||
1064 | for (unsigned I = 0, E = Chain.size(); I != E; ++I) { | |||
1065 | StoreInst *Store = cast<StoreInst>(Chain[I]); | |||
1066 | Value *Extract = Store->getValueOperand(); | |||
1067 | if (Extract->getType() != StoreTy->getScalarType()) | |||
1068 | Extract = | |||
1069 | Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType()); | |||
1070 | ||||
1071 | Value *Insert = | |||
1072 | Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I)); | |||
1073 | Vec = Insert; | |||
1074 | } | |||
1075 | } | |||
1076 | ||||
1077 | StoreInst *SI = Builder.CreateAlignedStore( | |||
1078 | Vec, | |||
1079 | Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)), | |||
1080 | Alignment); | |||
1081 | propagateMetadata(SI, Chain); | |||
1082 | ||||
1083 | eraseInstructions(Chain); | |||
1084 | ++NumVectorInstructions; | |||
1085 | NumScalarsVectorized += Chain.size(); | |||
1086 | return true; | |||
1087 | } | |||
1088 | ||||
1089 | bool Vectorizer::vectorizeLoadChain( | |||
1090 | ArrayRef<Instruction *> Chain, | |||
1091 | SmallPtrSet<Instruction *, 16> *InstructionsProcessed) { | |||
1092 | LoadInst *L0 = cast<LoadInst>(Chain[0]); | |||
1093 | ||||
1094 | // If the vector has an int element, default to int for the whole load. | |||
1095 | Type *LoadTy; | |||
1096 | for (const auto &V : Chain) { | |||
1097 | LoadTy = cast<LoadInst>(V)->getType(); | |||
1098 | if (LoadTy->isIntOrIntVectorTy()) | |||
1099 | break; | |||
1100 | ||||
1101 | if (LoadTy->isPtrOrPtrVectorTy()) { | |||
1102 | LoadTy = Type::getIntNTy(F.getParent()->getContext(), | |||
1103 | DL.getTypeSizeInBits(LoadTy)); | |||
1104 | break; | |||
1105 | } | |||
1106 | } | |||
1107 | ||||
1108 | unsigned Sz = DL.getTypeSizeInBits(LoadTy); | |||
| ||||
1109 | unsigned AS = L0->getPointerAddressSpace(); | |||
1110 | unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); | |||
1111 | unsigned VF = VecRegSize / Sz; | |||
1112 | unsigned ChainSize = Chain.size(); | |||
1113 | unsigned Alignment = getAlignment(L0); | |||
1114 | ||||
1115 | if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { | |||
1116 | InstructionsProcessed->insert(Chain.begin(), Chain.end()); | |||
1117 | return false; | |||
1118 | } | |||
1119 | ||||
1120 | ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain); | |||
1121 | if (NewChain.empty()) { | |||
1122 | // No vectorization possible. | |||
1123 | InstructionsProcessed->insert(Chain.begin(), Chain.end()); | |||
1124 | return false; | |||
1125 | } | |||
1126 | if (NewChain.size() == 1) { | |||
1127 | // Failed after the first instruction. Discard it and try the smaller chain. | |||
1128 | InstructionsProcessed->insert(NewChain.front()); | |||
1129 | return false; | |||
1130 | } | |||
1131 | ||||
1132 | // Update Chain to the valid vectorizable subchain. | |||
1133 | Chain = NewChain; | |||
1134 | ChainSize = Chain.size(); | |||
1135 | ||||
1136 | // Check if it's legal to vectorize this chain. If not, split the chain and | |||
1137 | // try again. | |||
1138 | unsigned EltSzInBytes = Sz / 8; | |||
1139 | unsigned SzInBytes = EltSzInBytes * ChainSize; | |||
1140 | VectorType *VecTy; | |||
1141 | VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy); | |||
1142 | if (VecLoadTy) | |||
1143 | VecTy = VectorType::get(LoadTy->getScalarType(), | |||
1144 | Chain.size() * VecLoadTy->getNumElements()); | |||
1145 | else | |||
1146 | VecTy = VectorType::get(LoadTy, Chain.size()); | |||
1147 | ||||
1148 | // If it's more than the max vector size or the target has a better | |||
1149 | // vector factor, break it into two pieces. | |||
1150 | unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy); | |||
1151 | if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) { | |||
1152 | LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Chain doesn't match with the vector factor." " Creating two separate arrays.\n"; } } while (false) | |||
1153 | " Creating two separate arrays.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Chain doesn't match with the vector factor." " Creating two separate arrays.\n"; } } while (false); | |||
1154 | return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) | | |||
1155 | vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed); | |||
1156 | } | |||
1157 | ||||
1158 | // We won't try again to vectorize the elements of the chain, regardless of | |||
1159 | // whether we succeed below. | |||
1160 | InstructionsProcessed->insert(Chain.begin(), Chain.end()); | |||
1161 | ||||
1162 | // If the load is going to be misaligned, don't vectorize it. | |||
1163 | if (accessIsMisaligned(SzInBytes, AS, Alignment)) { | |||
1164 | if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) { | |||
1165 | auto Chains = splitOddVectorElts(Chain, Sz); | |||
1166 | return vectorizeLoadChain(Chains.first, InstructionsProcessed) | | |||
1167 | vectorizeLoadChain(Chains.second, InstructionsProcessed); | |||
1168 | } | |||
1169 | ||||
1170 | Alignment = getOrEnforceKnownAlignment( | |||
1171 | L0->getPointerOperand(), StackAdjustedAlignment, DL, L0, nullptr, &DT); | |||
1172 | } | |||
1173 | ||||
1174 | if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) { | |||
1175 | auto Chains = splitOddVectorElts(Chain, Sz); | |||
1176 | return vectorizeLoadChain(Chains.first, InstructionsProcessed) | | |||
1177 | vectorizeLoadChain(Chains.second, InstructionsProcessed); | |||
1178 | } | |||
1179 | ||||
1180 | LLVM_DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Loads to vectorize:\n" ; for (Instruction *I : Chain) I->dump(); }; } } while (false ) | |||
1181 | dbgs() << "LSV: Loads to vectorize:\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Loads to vectorize:\n" ; for (Instruction *I : Chain) I->dump(); }; } } while (false ) | |||
1182 | for (Instruction *I : Chain)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Loads to vectorize:\n" ; for (Instruction *I : Chain) I->dump(); }; } } while (false ) | |||
1183 | I->dump();do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Loads to vectorize:\n" ; for (Instruction *I : Chain) I->dump(); }; } } while (false ) | |||
1184 | })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { { dbgs() << "LSV: Loads to vectorize:\n" ; for (Instruction *I : Chain) I->dump(); }; } } while (false ); | |||
1185 | ||||
1186 | // getVectorizablePrefix already computed getBoundaryInstrs. The value of | |||
1187 | // Last may have changed since then, but the value of First won't have. If it | |||
1188 | // matters, we could compute getBoundaryInstrs only once and reuse it here. | |||
1189 | BasicBlock::iterator First, Last; | |||
1190 | std::tie(First, Last) = getBoundaryInstrs(Chain); | |||
1191 | Builder.SetInsertPoint(&*First); | |||
1192 | ||||
1193 | Value *Bitcast = | |||
1194 | Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS)); | |||
1195 | LoadInst *LI = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment); | |||
1196 | propagateMetadata(LI, Chain); | |||
1197 | ||||
1198 | if (VecLoadTy) { | |||
1199 | SmallVector<Instruction *, 16> InstrsToErase; | |||
1200 | ||||
1201 | unsigned VecWidth = VecLoadTy->getNumElements(); | |||
1202 | for (unsigned I = 0, E = Chain.size(); I != E; ++I) { | |||
1203 | for (auto Use : Chain[I]->users()) { | |||
1204 | // All users of vector loads are ExtractElement instructions with | |||
1205 | // constant indices, otherwise we would have bailed before now. | |||
1206 | Instruction *UI = cast<Instruction>(Use); | |||
1207 | unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue(); | |||
1208 | unsigned NewIdx = Idx + I * VecWidth; | |||
1209 | Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx), | |||
1210 | UI->getName()); | |||
1211 | if (V->getType() != UI->getType()) | |||
1212 | V = Builder.CreateBitCast(V, UI->getType()); | |||
1213 | ||||
1214 | // Replace the old instruction. | |||
1215 | UI->replaceAllUsesWith(V); | |||
1216 | InstrsToErase.push_back(UI); | |||
1217 | } | |||
1218 | } | |||
1219 | ||||
1220 | // Bitcast might not be an Instruction, if the value being loaded is a | |||
1221 | // constant. In that case, no need to reorder anything. | |||
1222 | if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast)) | |||
1223 | reorder(BitcastInst); | |||
1224 | ||||
1225 | for (auto I : InstrsToErase) | |||
1226 | I->eraseFromParent(); | |||
1227 | } else { | |||
1228 | for (unsigned I = 0, E = Chain.size(); I != E; ++I) { | |||
1229 | Value *CV = Chain[I]; | |||
1230 | Value *V = | |||
1231 | Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName()); | |||
1232 | if (V->getType() != CV->getType()) { | |||
1233 | V = Builder.CreateBitOrPointerCast(V, CV->getType()); | |||
1234 | } | |||
1235 | ||||
1236 | // Replace the old instruction. | |||
1237 | CV->replaceAllUsesWith(V); | |||
1238 | } | |||
1239 | ||||
1240 | if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast)) | |||
1241 | reorder(BitcastInst); | |||
1242 | } | |||
1243 | ||||
1244 | eraseInstructions(Chain); | |||
1245 | ||||
1246 | ++NumVectorInstructions; | |||
1247 | NumScalarsVectorized += Chain.size(); | |||
1248 | return true; | |||
1249 | } | |||
1250 | ||||
1251 | bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace, | |||
1252 | unsigned Alignment) { | |||
1253 | if (Alignment % SzInBytes == 0) | |||
1254 | return false; | |||
1255 | ||||
1256 | bool Fast = false; | |||
1257 | bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(), | |||
1258 | SzInBytes * 8, AddressSpace, | |||
1259 | Alignment, &Fast); | |||
1260 | LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allowsdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Target said misaligned is allowed? " << Allows << " and fast? " << Fast << "\n";; } } while (false) | |||
1261 | << " and fast? " << Fast << "\n";)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("load-store-vectorizer")) { dbgs() << "LSV: Target said misaligned is allowed? " << Allows << " and fast? " << Fast << "\n";; } } while (false); | |||
1262 | return !Allows || !Fast; | |||
1263 | } |