File: lib/Transforms/Vectorize/LoopVectorize.cpp
Location: line 1219, column 5
Description: Value stored to 'LoopID' is never read
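
The flagged statement is the trailing assignment in LoopVectorizeHints::writeHintsToMetadata (source line 1219 below). 'LoopID' is a function-local variable and the function returns immediately after the store, so the assigned value can never be read. A minimal sketch of the fix, assuming no later use of the variable is intended:

    // End of writeHintsToMetadata():
    TheLoop->setLoopID(NewLoopID);
    // Drop the trailing 'LoopID = NewLoopID;' -- LoopID is a local that
    // goes out of scope without being read again, so the store is dead.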
1 | //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// |
2 | // |
3 | // The LLVM Compiler Infrastructure |
4 | // |
5 | // This file is distributed under the University of Illinois Open Source |
6 | // License. See LICENSE.TXT for details. |
7 | // |
8 | //===----------------------------------------------------------------------===// |
9 | // |
10 | // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops |
11 | // and generates target-independent LLVM-IR. |
12 | // The vectorizer uses the TargetTransformInfo analysis to estimate the costs |
13 | // of instructions in order to estimate the profitability of vectorization. |
14 | // |
15 | // The loop vectorizer combines consecutive loop iterations into a single |
16 | // 'wide' iteration. After this transformation the index is incremented |
17 | // by the SIMD vector width, and not by one. |
18 | // |
19 | // This pass has four parts: |
20 | // 1. The main loop pass that drives the different parts. |
21 | // 2. LoopVectorizationLegality - A unit that checks for the legality |
22 | // of the vectorization. |
23 | // 3. InnerLoopVectorizer - A unit that performs the actual |
24 | // widening of instructions. |
25 | // 4. LoopVectorizationCostModel - A unit that checks for the profitability |
26 | // of vectorization. It decides on the optimal vector width, which |
27 | // can be one, if vectorization is not profitable. |
28 | // |
29 | //===----------------------------------------------------------------------===// |
30 | // |
31 | // The reduction-variable vectorization is based on the paper: |
32 | // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. |
33 | // |
34 | // Variable uniformity checks are inspired by: |
35 | // Karrenberg, R. and Hack, S. Whole Function Vectorization. |
36 | // |
37 | // Other ideas/concepts are from: |
38 | // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. |
39 | // |
40 | // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of |
41 | // Vectorizing Compilers. |
42 | // |
43 | //===----------------------------------------------------------------------===// |
44 | |
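For reference, the widening described in the header above can be written out as a self-contained sketch. VF = 4 is an assumed width, and the inner loop stands in for a single wide vector operation that real codegen would emit:

    #include <cstddef>

    // Scalar loop: the index advances by one per iteration.
    void add_scalar(float *a, const float *b, const float *c, std::size_t n) {
      for (std::size_t i = 0; i < n; ++i)
        a[i] = b[i] + c[i];
    }

    // Widened loop: the index advances by the SIMD width; the trailing
    // scalar loop is the epilogue for trip counts not a multiple of VF.
    void add_widened(float *a, const float *b, const float *c, std::size_t n) {
      std::size_t i = 0;
      for (; i + 4 <= n; i += 4)
        for (std::size_t j = 0; j < 4; ++j) // one wide add in practice
          a[i + j] = b[i + j] + c[i + j];
      for (; i < n; ++i) // scalar epilogue
        a[i] = b[i] + c[i];
    }
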
45 | #include "llvm/Transforms/Vectorize.h" |
46 | #include "llvm/ADT/DenseMap.h" |
47 | #include "llvm/ADT/EquivalenceClasses.h" |
48 | #include "llvm/ADT/Hashing.h" |
49 | #include "llvm/ADT/MapVector.h" |
50 | #include "llvm/ADT/SetVector.h" |
51 | #include "llvm/ADT/SmallPtrSet.h" |
52 | #include "llvm/ADT/SmallSet.h" |
53 | #include "llvm/ADT/SmallVector.h" |
54 | #include "llvm/ADT/Statistic.h" |
55 | #include "llvm/ADT/StringExtras.h" |
56 | #include "llvm/Analysis/AliasAnalysis.h" |
57 | #include "llvm/Analysis/AliasSetTracker.h" |
58 | #include "llvm/Analysis/AssumptionTracker.h" |
59 | #include "llvm/Analysis/BlockFrequencyInfo.h" |
60 | #include "llvm/Analysis/CodeMetrics.h" |
61 | #include "llvm/Analysis/LoopInfo.h" |
62 | #include "llvm/Analysis/LoopIterator.h" |
63 | #include "llvm/Analysis/LoopPass.h" |
64 | #include "llvm/Analysis/ScalarEvolution.h" |
65 | #include "llvm/Analysis/ScalarEvolutionExpander.h" |
66 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
67 | #include "llvm/Analysis/TargetTransformInfo.h" |
68 | #include "llvm/Analysis/ValueTracking.h" |
69 | #include "llvm/IR/Constants.h" |
70 | #include "llvm/IR/DataLayout.h" |
71 | #include "llvm/IR/DebugInfo.h" |
72 | #include "llvm/IR/DerivedTypes.h" |
73 | #include "llvm/IR/DiagnosticInfo.h" |
74 | #include "llvm/IR/Dominators.h" |
75 | #include "llvm/IR/Function.h" |
76 | #include "llvm/IR/IRBuilder.h" |
77 | #include "llvm/IR/Instructions.h" |
78 | #include "llvm/IR/IntrinsicInst.h" |
79 | #include "llvm/IR/LLVMContext.h" |
80 | #include "llvm/IR/Module.h" |
81 | #include "llvm/IR/PatternMatch.h" |
82 | #include "llvm/IR/Type.h" |
83 | #include "llvm/IR/Value.h" |
84 | #include "llvm/IR/ValueHandle.h" |
85 | #include "llvm/IR/Verifier.h" |
86 | #include "llvm/Pass.h" |
87 | #include "llvm/Support/BranchProbability.h" |
88 | #include "llvm/Support/CommandLine.h" |
89 | #include "llvm/Support/Debug.h" |
90 | #include "llvm/Support/raw_ostream.h" |
91 | #include "llvm/Transforms/Scalar.h" |
92 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
93 | #include "llvm/Transforms/Utils/Local.h" |
94 | #include "llvm/Transforms/Utils/VectorUtils.h" |
95 | #include <algorithm> |
96 | #include <map> |
97 | #include <tuple> |
98 | |
99 | using namespace llvm; |
100 | using namespace llvm::PatternMatch; |
101 | |
102 | #define LV_NAME "loop-vectorize" |
103 | #define DEBUG_TYPE LV_NAME |
104 | |
105 | STATISTIC(LoopsVectorized, "Number of loops vectorized"); |
106 | STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); |
107 | |
108 | static cl::opt<unsigned> |
109 | VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, |
110 | cl::desc("Sets the SIMD width. Zero is autoselect.")); |
111 | |
112 | static cl::opt<unsigned> |
113 | VectorizationInterleave("force-vector-interleave", cl::init(0), cl::Hidden, |
114 | cl::desc("Sets the vectorization interleave count. " |
115 | "Zero is autoselect.")); |
116 | |
117 | static cl::opt<bool> |
118 | EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, |
119 | cl::desc("Enable if-conversion during vectorization.")); |
120 | |
121 | /// We don't vectorize loops with a known constant trip count below this number. |
122 | static cl::opt<unsigned> |
123 | TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), |
124 | cl::Hidden, |
125 | cl::desc("Don't vectorize loops with a constant " |
126 | "trip count that is smaller than this " |
127 | "value.")); |
128 | |
129 | /// This enables versioning on the strides of symbolically striding memory |
130 | /// accesses in code like the following. |
131 | /// for (i = 0; i < N; ++i) |
132 | /// A[i * Stride1] += B[i * Stride2] ... |
133 | /// |
134 | /// Will be roughly translated to |
135 | /// if (Stride1 == 1 && Stride2 == 1) { |
136 | /// for (i = 0; i < N; i+=4) |
137 | /// A[i:i+3] += ... |
138 | /// } else |
139 | /// ... |
140 | static cl::opt<bool> EnableMemAccessVersioning( |
141 | "enable-mem-access-versioning", cl::init(true), cl::Hidden, |
142 | cl::desc("Enable symblic stride memory access versioning")); |
143 | |
144 | /// We don't unroll loops with a known constant trip count below this number. |
145 | static const unsigned TinyTripCountUnrollThreshold = 128; |
146 | |
147 | /// When performing memory disambiguation checks at runtime do not make more |
148 | /// than this number of comparisons. |
149 | static const unsigned RuntimeMemoryCheckThreshold = 8; |
150 | |
151 | /// Maximum SIMD width. |
152 | static const unsigned MaxVectorWidth = 64; |
153 | |
154 | static cl::opt<unsigned> ForceTargetNumScalarRegs( |
155 | "force-target-num-scalar-regs", cl::init(0), cl::Hidden, |
156 | cl::desc("A flag that overrides the target's number of scalar registers.")); |
157 | |
158 | static cl::opt<unsigned> ForceTargetNumVectorRegs( |
159 | "force-target-num-vector-regs", cl::init(0), cl::Hidden, |
160 | cl::desc("A flag that overrides the target's number of vector registers.")); |
161 | |
162 | /// Maximum vectorization interleave count. |
163 | static const unsigned MaxInterleaveFactor = 16; |
164 | |
165 | static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( |
166 | "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, |
167 | cl::desc("A flag that overrides the target's max interleave factor for " |
168 | "scalar loops.")); |
169 | |
170 | static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( |
171 | "force-target-max-vector-interleave", cl::init(0), cl::Hidden, |
172 | cl::desc("A flag that overrides the target's max interleave factor for " |
173 | "vectorized loops.")); |
174 | |
175 | static cl::opt<unsigned> ForceTargetInstructionCost( |
176 | "force-target-instruction-cost", cl::init(0), cl::Hidden, |
177 | cl::desc("A flag that overrides the target's expected cost for " |
178 | "an instruction to a single constant value. Mostly " |
179 | "useful for getting consistent testing.")); |
180 | |
181 | static cl::opt<unsigned> SmallLoopCost( |
182 | "small-loop-cost", cl::init(20), cl::Hidden, |
183 | cl::desc("The cost of a loop that is considered 'small' by the unroller.")); |
184 | |
185 | static cl::opt<bool> LoopVectorizeWithBlockFrequency( |
186 | "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden, |
187 | cl::desc("Enable the use of the block frequency analysis to access PGO " |
188 | "heuristics minimizing code growth in cold regions and being more " |
189 | "aggressive in hot regions.")); |
190 | |
191 | // Runtime unroll loops for load/store throughput. |
192 | static cl::opt<bool> EnableLoadStoreRuntimeUnroll( |
193 | "enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden, |
194 | cl::desc("Enable runtime unrolling until load/store ports are saturated")); |
195 | |
196 | /// The number of stores in a loop that are allowed to need predication. |
197 | static cl::opt<unsigned> NumberOfStoresToPredicate( |
198 | "vectorize-num-stores-pred", cl::init(1), cl::Hidden, |
199 | cl::desc("Max number of stores to be predicated behind an if.")); |
200 | |
201 | static cl::opt<bool> EnableIndVarRegisterHeur( |
202 | "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, |
203 | cl::desc("Count the induction variable only once when unrolling")); |
204 | |
205 | static cl::opt<bool> EnableCondStoresVectorization( |
206 | "enable-cond-stores-vec", cl::init(false), cl::Hidden, |
207 | cl::desc("Enable if predication of stores during vectorization.")); |
208 | |
209 | static cl::opt<unsigned> MaxNestedScalarReductionUF( |
210 | "max-nested-scalar-reduction-unroll", cl::init(2), cl::Hidden, |
211 | cl::desc("The maximum unroll factor to use when unrolling a scalar " |
212 | "reduction in a nested loop.")); |
213 | |
214 | namespace { |
215 | |
216 | // Forward declarations. |
217 | class LoopVectorizationLegality; |
218 | class LoopVectorizationCostModel; |
219 | class LoopVectorizeHints; |
220 | |
221 | /// Optimization analysis message produced during vectorization. Messages inform |
222 | /// the user why vectorization did not occur. |
223 | class Report { |
224 | std::string Message; |
225 | raw_string_ostream Out; |
226 | Instruction *Instr; |
227 | |
228 | public: |
229 | Report(Instruction *I = nullptr) : Out(Message), Instr(I) { |
230 | Out << "loop not vectorized: "; |
231 | } |
232 | |
233 | template <typename A> Report &operator<<(const A &Value) { |
234 | Out << Value; |
235 | return *this; |
236 | } |
237 | |
238 | Instruction *getInstr() { return Instr; } |
239 | |
240 | std::string &str() { return Out.str(); } |
241 | operator Twine() { return Out.str(); } |
242 | }; |
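
A hypothetical usage sketch of Report (the instruction and message are invented for illustration); the emitAnalysis helpers further down forward the accumulated string as an optimization remark:

    Report R(UnsupportedCall);   // message starts as "loop not vectorized: "
    R << "call instruction cannot be vectorized";
    std::string Msg = R.str();   // "loop not vectorized: call instruction ..."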
243 | |
244 | /// InnerLoopVectorizer vectorizes loops which contain only one basic |
245 | /// block to a specified vectorization factor (VF). |
246 | /// This class performs the widening of scalars into vectors, or multiple |
247 | /// scalars. This class also implements the following features: |
248 | /// * It inserts an epilogue loop for handling loops that don't have iteration |
249 | /// counts that are known to be a multiple of the vectorization factor. |
250 | /// * It handles the code generation for reduction variables. |
251 | /// * Scalarization (implementation using scalars) of un-vectorizable |
252 | /// instructions. |
253 | /// InnerLoopVectorizer does not perform any vectorization-legality |
254 | /// checks, and relies on the caller to check for the different legality |
255 | /// aspects. The InnerLoopVectorizer relies on the |
256 | /// LoopVectorizationLegality class to provide information about the induction |
257 | /// and reduction variables that were found to a given vectorization factor. |
258 | class InnerLoopVectorizer { |
259 | public: |
260 | InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, |
261 | DominatorTree *DT, const DataLayout *DL, |
262 | const TargetLibraryInfo *TLI, unsigned VecWidth, |
263 | unsigned UnrollFactor) |
264 | : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI), |
265 | VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), |
266 | Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor), |
267 | Legal(nullptr) {} |
268 | |
269 | // Perform the actual loop widening (vectorization). |
270 | void vectorize(LoopVectorizationLegality *L) { |
271 | Legal = L; |
272 | // Create a new empty loop. Unlink the old loop and connect the new one. |
273 | createEmptyLoop(); |
274 | // Widen each instruction in the old loop to a new one in the new loop. |
275 | // Use the Legality module to find the induction and reduction variables. |
276 | vectorizeLoop(); |
277 | // Register the new loop and update the analysis passes. |
278 | updateAnalysis(); |
279 | } |
280 | |
281 | virtual ~InnerLoopVectorizer() {} |
282 | |
283 | protected: |
284 | /// A small list of PHINodes. |
285 | typedef SmallVector<PHINode*, 4> PhiVector; |
286 | /// When we unroll loops we have multiple vector values for each scalar. |
287 | /// This data structure holds the unrolled and vectorized values that |
288 | /// originated from one scalar instruction. |
289 | typedef SmallVector<Value*, 2> VectorParts; |
290 | |
291 | // When we if-convert we need to create edge masks. We have to cache values so |
292 | // that we don't end up with exponential recursion/IR. |
293 | typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>, |
294 | VectorParts> EdgeMaskCache; |
295 | |
296 | /// \brief Add code that checks at runtime if the accessed arrays overlap. |
297 | /// |
298 | /// Returns a pair of instructions where the first element is the first |
299 | /// instruction generated in possibly a sequence of instructions and the |
300 | /// second value is the final comparator value or NULL if no check is needed. |
301 | std::pair<Instruction *, Instruction *> addRuntimeCheck(Instruction *Loc); |
302 | |
303 | /// \brief Add checks for strides that were assumed to be 1. |
304 | /// |
305 | /// Returns the first and the last generated check instructions as the |
306 | /// pair (first, last). |
307 | std::pair<Instruction *, Instruction *> addStrideCheck(Instruction *Loc); |
308 | |
309 | /// Create an empty loop, based on the loop ranges of the old loop. |
310 | void createEmptyLoop(); |
311 | /// Copy and widen the instructions from the old loop. |
312 | virtual void vectorizeLoop(); |
313 | |
314 | /// \brief The Loop exit block may have single value PHI nodes where the |
315 | /// incoming value is 'Undef'. While vectorizing we only handled real values |
316 | /// that were defined inside the loop. Here we fix the 'undef case'. |
317 | /// See PR14725. |
318 | void fixLCSSAPHIs(); |
319 | |
320 | /// A helper function that computes the predicate of the block BB, assuming |
321 | /// that the header block of the loop is set to True. It returns the *entry* |
322 | /// mask for the block BB. |
323 | VectorParts createBlockInMask(BasicBlock *BB); |
324 | /// A helper function that computes the predicate of the edge between SRC |
325 | /// and DST. |
326 | VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); |
327 | |
328 | /// A helper function to vectorize a single BB within the innermost loop. |
329 | void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV); |
330 | |
331 | /// Vectorize a single PHINode in a block. This method handles the induction |
332 | /// variable canonicalization. It supports both VF = 1 for unrolled loops and |
333 | /// arbitrary length vectors. |
334 | void widenPHIInstruction(Instruction *PN, VectorParts &Entry, |
335 | unsigned UF, unsigned VF, PhiVector *PV); |
336 | |
337 | /// Insert the new loop into the loop hierarchy and pass manager |
338 | /// and update the analysis passes. |
339 | void updateAnalysis(); |
340 | |
341 | /// This instruction is un-vectorizable. Implement it as a sequence |
342 | /// of scalars. If \p IfPredicateStore is true we need to 'hide' each |
343 | /// scalarized instruction behind an if block predicated on the control |
344 | /// dependence of the instruction. |
345 | virtual void scalarizeInstruction(Instruction *Instr, |
346 | bool IfPredicateStore=false); |
347 | |
348 | /// Vectorize Load and Store instructions. |
349 | virtual void vectorizeMemoryInstruction(Instruction *Instr); |
350 | |
351 | /// Create a broadcast instruction. This method generates a broadcast |
352 | /// instruction (shuffle) for loop invariant values and for the induction |
353 | /// value. If this is the induction variable then we extend it to N, N+1, ... |
354 | /// this is needed because each iteration in the loop corresponds to a SIMD |
355 | /// element. |
356 | virtual Value *getBroadcastInstrs(Value *V); |
357 | |
358 | /// This function adds 0, 1, 2, ... to each vector element, starting at |
359 | /// StartIdx. If Negate is set then negative numbers are added, |
360 | /// e.g. (0, -1, -2, ...). |
361 | virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); |
362 | |
363 | /// When we go over instructions in the basic block we rely on previous |
364 | /// values within the current basic block or on loop invariant values. |
365 | /// When we widen (vectorize) values we place them in the map. If the values |
366 | /// are not within the map, they have to be loop invariant, so we simply |
367 | /// broadcast them into a vector. |
368 | VectorParts &getVectorValue(Value *V); |
369 | |
370 | /// Generate a shuffle sequence that will reverse the vector Vec. |
371 | virtual Value *reverseVector(Value *Vec); |
372 | |
373 | /// This is a helper class that holds the vectorizer state. It maps scalar |
374 | /// instructions to vector instructions. When the code is 'unrolled' then |
375 | /// a single scalar value is mapped to multiple vector parts. The parts |
376 | /// are stored in the VectorPart type. |
377 | struct ValueMap { |
378 | /// C'tor. UnrollFactor controls the number of vectors ('parts') that |
379 | /// are mapped. |
380 | ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {} |
381 | |
382 | /// \return True if 'Key' is saved in the Value Map. |
383 | bool has(Value *Key) const { return MapStorage.count(Key); } |
384 | |
385 | /// Initializes a new entry in the map. Sets all of the vector parts to the |
386 | /// same value, 'Val'. |
387 | /// \return A reference to a vector with splat values. |
388 | VectorParts &splat(Value *Key, Value *Val) { |
389 | VectorParts &Entry = MapStorage[Key]; |
390 | Entry.assign(UF, Val); |
391 | return Entry; |
392 | } |
393 | |
394 | ///\return A reference to the value that is stored at 'Key'. |
395 | VectorParts &get(Value *Key) { |
396 | VectorParts &Entry = MapStorage[Key]; |
397 | if (Entry.empty()) |
398 | Entry.resize(UF); |
399 | assert(Entry.size() == UF); |
400 | return Entry; |
401 | } |
402 | |
403 | private: |
404 | /// The unroll factor. Each entry in the map stores this number of vector |
405 | /// elements. |
406 | unsigned UF; |
407 | |
408 | /// Map storage. We use std::map and not DenseMap because insertions to a |
409 | /// dense map invalidate its iterators. |
410 | std::map<Value *, VectorParts> MapStorage; |
411 | }; |
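
A short usage sketch of ValueMap under an assumed unroll factor of 2; the value names are illustrative:

    ValueMap WM(2);
    // A loop-invariant scalar maps to the same wide value in both parts.
    VectorParts &Parts = WM.splat(ScalarV, WideV); // Parts[0] == Parts[1] == WideV
    VectorParts &Entry = WM.get(ScalarV);          // cached entry, already UF parts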
412 | |
413 | /// The original loop. |
414 | Loop *OrigLoop; |
415 | /// Scev analysis to use. |
416 | ScalarEvolution *SE; |
417 | /// Loop Info. |
418 | LoopInfo *LI; |
419 | /// Dominator Tree. |
420 | DominatorTree *DT; |
421 | /// Alias Analysis. |
422 | AliasAnalysis *AA; |
423 | /// Data Layout. |
424 | const DataLayout *DL; |
425 | /// Target Library Info. |
426 | const TargetLibraryInfo *TLI; |
427 | |
428 | /// The vectorization SIMD factor to use. Each vector will have this many |
429 | /// vector elements. |
430 | unsigned VF; |
431 | |
432 | protected: |
433 | /// The vectorization unroll factor to use. Each scalar is vectorized to this |
434 | /// many different vector instructions. |
435 | unsigned UF; |
436 | |
437 | /// The builder that we use |
438 | IRBuilder<> Builder; |
439 | |
440 | // --- Vectorization state --- |
441 | |
442 | /// The vector-loop preheader. |
443 | BasicBlock *LoopVectorPreHeader; |
444 | /// The scalar-loop preheader. |
445 | BasicBlock *LoopScalarPreHeader; |
446 | /// Middle Block between the vector and the scalar. |
447 | BasicBlock *LoopMiddleBlock; |
448 | ///The ExitBlock of the scalar loop. |
449 | BasicBlock *LoopExitBlock; |
450 | ///The vector loop body. |
451 | SmallVector<BasicBlock *, 4> LoopVectorBody; |
452 | ///The scalar loop body. |
453 | BasicBlock *LoopScalarBody; |
454 | /// A list of all bypass blocks. The first block is the entry of the loop. |
455 | SmallVector<BasicBlock *, 4> LoopBypassBlocks; |
456 | |
457 | /// The new Induction variable which was added to the new block. |
458 | PHINode *Induction; |
459 | /// The induction variable of the old basic block. |
460 | PHINode *OldInduction; |
461 | /// Holds the extended (to the widest induction type) start index. |
462 | Value *ExtendedIdx; |
463 | /// Maps scalars to widened vectors. |
464 | ValueMap WidenMap; |
465 | EdgeMaskCache MaskCache; |
466 | |
467 | LoopVectorizationLegality *Legal; |
468 | }; |
469 | |
470 | class InnerLoopUnroller : public InnerLoopVectorizer { |
471 | public: |
472 | InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, |
473 | DominatorTree *DT, const DataLayout *DL, |
474 | const TargetLibraryInfo *TLI, unsigned UnrollFactor) : |
475 | InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { } |
476 | |
477 | private: |
478 | void scalarizeInstruction(Instruction *Instr, |
479 | bool IfPredicateStore = false) override; |
480 | void vectorizeMemoryInstruction(Instruction *Instr) override; |
481 | Value *getBroadcastInstrs(Value *V) override; |
482 | Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate) override; |
483 | Value *reverseVector(Value *Vec) override; |
484 | }; |
485 | |
486 | /// \brief Look for a meaningful debug location on the instruction or its |
487 | /// operands. |
488 | static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { |
489 | if (!I) |
490 | return I; |
491 | |
492 | DebugLoc Empty; |
493 | if (I->getDebugLoc() != Empty) |
494 | return I; |
495 | |
496 | for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) { |
497 | if (Instruction *OpInst = dyn_cast<Instruction>(*OI)) |
498 | if (OpInst->getDebugLoc() != Empty) |
499 | return OpInst; |
500 | } |
501 | |
502 | return I; |
503 | } |
504 | |
505 | /// \brief Set the debug location in the builder using the debug location in the |
506 | /// instruction. |
507 | static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { |
508 | if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) |
509 | B.SetCurrentDebugLocation(Inst->getDebugLoc()); |
510 | else |
511 | B.SetCurrentDebugLocation(DebugLoc()); |
512 | } |
513 | |
514 | #ifndef NDEBUG |
515 | /// \return string containing a file name and a line # for the given loop. |
516 | static std::string getDebugLocString(const Loop *L) { |
517 | std::string Result; |
518 | if (L) { |
519 | raw_string_ostream OS(Result); |
520 | const DebugLoc LoopDbgLoc = L->getStartLoc(); |
521 | if (!LoopDbgLoc.isUnknown()) |
522 | LoopDbgLoc.print(L->getHeader()->getContext(), OS); |
523 | else |
524 | // Just print the module name. |
525 | OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); |
526 | OS.flush(); |
527 | } |
528 | return Result; |
529 | } |
530 | #endif |
531 | |
532 | /// \brief Propagate known metadata from one instruction to another. |
533 | static void propagateMetadata(Instruction *To, const Instruction *From) { |
534 | SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata; |
535 | From->getAllMetadataOtherThanDebugLoc(Metadata); |
536 | |
537 | for (auto M : Metadata) { |
538 | unsigned Kind = M.first; |
539 | |
540 | // These are safe to transfer (this is safe for TBAA, even when we |
541 | // if-convert, because should that metadata have had a control dependency |
542 | // on the condition, and thus actually aliased with some other |
543 | // non-speculated memory access when the condition was false, this would be |
544 | // caught by the runtime overlap checks). |
545 | if (Kind != LLVMContext::MD_tbaa && |
546 | Kind != LLVMContext::MD_alias_scope && |
547 | Kind != LLVMContext::MD_noalias && |
548 | Kind != LLVMContext::MD_fpmath) |
549 | continue; |
550 | |
551 | To->setMetadata(Kind, M.second); |
552 | } |
553 | } |
554 | |
555 | /// \brief Propagate known metadata from one instruction to a vector of others. |
556 | static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *From) { |
557 | for (Value *V : To) |
558 | if (Instruction *I = dyn_cast<Instruction>(V)) |
559 | propagateMetadata(I, From); |
560 | } |
561 | |
562 | /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and |
563 | /// to what vectorization factor. |
564 | /// This class does not look at the profitability of vectorization, only the |
565 | /// legality. This class has two main kinds of checks: |
566 | /// * Memory checks - The code in canVectorizeMemory checks if vectorization |
567 | /// will change the order of memory accesses in a way that will change the |
568 | /// correctness of the program. |
569 | /// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory |
570 | /// checks for a number of different conditions, such as the availability of a |
571 | /// single induction variable, that all types are supported and vectorizable, |
572 | /// etc. This code reflects the capabilities of InnerLoopVectorizer. |
573 | /// This class is also used by InnerLoopVectorizer for identifying |
574 | /// the induction variable and the different reduction variables. |
575 | class LoopVectorizationLegality { |
576 | public: |
577 | unsigned NumLoads; |
578 | unsigned NumStores; |
579 | unsigned NumPredStores; |
580 | |
581 | LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL, |
582 | DominatorTree *DT, TargetLibraryInfo *TLI, |
583 | AliasAnalysis *AA, Function *F, |
584 | const TargetTransformInfo *TTI) |
585 | : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL), |
586 | DT(DT), TLI(TLI), AA(AA), TheFunction(F), TTI(TTI), Induction(nullptr), |
587 | WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) { |
588 | } |
589 | |
590 | /// This enum represents the kinds of reductions that we support. |
591 | enum ReductionKind { |
592 | RK_NoReduction, ///< Not a reduction. |
593 | RK_IntegerAdd, ///< Sum of integers. |
594 | RK_IntegerMult, ///< Product of integers. |
595 | RK_IntegerOr, ///< Bitwise or logical OR of numbers. |
596 | RK_IntegerAnd, ///< Bitwise or logical AND of numbers. |
597 | RK_IntegerXor, ///< Bitwise or logical XOR of numbers. |
598 | RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()). |
599 | RK_FloatAdd, ///< Sum of floats. |
600 | RK_FloatMult, ///< Product of floats. |
601 | RK_FloatMinMax ///< Min/max implemented in terms of select(cmp()). |
602 | }; |
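
As a point of reference, the source-level shape of an RK_IntegerAdd reduction looks like the following sketch (array and start value are illustrative):

    int sum_array(const int *A, int N, int Start) {
      int Sum = Start;           // the start value need not be zero
      for (int i = 0; i < N; ++i)
        Sum += A[i];             // the reduction update
      return Sum;                // the final value is used outside the loop
    }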
603 | |
604 | /// This enum represents the kinds of inductions that we support. |
605 | enum InductionKind { |
606 | IK_NoInduction, ///< Not an induction variable. |
607 | IK_IntInduction, ///< Integer induction variable. Step = 1. |
608 | IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1. |
609 | IK_PtrInduction, ///< Pointer induction var. Step = sizeof(elem). |
610 | IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem). |
611 | }; |
612 | |
613 | // This enum represents the kind of minmax reduction. |
614 | enum MinMaxReductionKind { |
615 | MRK_Invalid, |
616 | MRK_UIntMin, |
617 | MRK_UIntMax, |
618 | MRK_SIntMin, |
619 | MRK_SIntMax, |
620 | MRK_FloatMin, |
621 | MRK_FloatMax |
622 | }; |
623 | |
624 | /// This struct holds information about reduction variables. |
625 | struct ReductionDescriptor { |
626 | ReductionDescriptor() : StartValue(nullptr), LoopExitInstr(nullptr), |
627 | Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {} |
628 | |
629 | ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K, |
630 | MinMaxReductionKind MK) |
631 | : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {} |
632 | |
633 | // The starting value of the reduction. |
634 | // It does not have to be zero! |
635 | TrackingVH<Value> StartValue; |
636 | // The instruction whose value is used outside the loop. |
637 | Instruction *LoopExitInstr; |
638 | // The kind of the reduction. |
639 | ReductionKind Kind; |
640 | // If this is a min/max reduction, the kind of reduction. |
641 | MinMaxReductionKind MinMaxKind; |
642 | }; |
643 | |
644 | /// This POD struct holds information about a potential reduction operation. |
645 | struct ReductionInstDesc { |
646 | ReductionInstDesc(bool IsRedux, Instruction *I) : |
647 | IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {} |
648 | |
649 | ReductionInstDesc(Instruction *I, MinMaxReductionKind K) : |
650 | IsReduction(true), PatternLastInst(I), MinMaxKind(K) {} |
651 | |
652 | // Is this instruction a reduction candidate. |
653 | bool IsReduction; |
654 | // The last instruction in a min/max pattern (select of the select(icmp()) |
655 | // pattern), or the current reduction instruction otherwise. |
656 | Instruction *PatternLastInst; |
657 | // If this is a min/max pattern the comparison predicate. |
658 | MinMaxReductionKind MinMaxKind; |
659 | }; |
660 | |
661 | /// This struct holds information about the runtime memory check that |
662 | /// verifies a group of pointers do not overlap. |
663 | struct RuntimePointerCheck { |
664 | RuntimePointerCheck() : Need(false) {} |
665 | |
666 | /// Reset the state of the pointer runtime information. |
667 | void reset() { |
668 | Need = false; |
669 | Pointers.clear(); |
670 | Starts.clear(); |
671 | Ends.clear(); |
672 | IsWritePtr.clear(); |
673 | DependencySetId.clear(); |
674 | AliasSetId.clear(); |
675 | } |
676 | |
677 | /// Insert a pointer and calculate the start and end SCEVs. |
678 | void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, |
679 | unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides); |
680 | |
681 | /// This flag indicates if we need to add the runtime check. |
682 | bool Need; |
683 | /// Holds the pointers that we need to check. |
684 | SmallVector<TrackingVH<Value>, 2> Pointers; |
685 | /// Holds the pointer value at the beginning of the loop. |
686 | SmallVector<const SCEV*, 2> Starts; |
687 | /// Holds the pointer value at the end of the loop. |
688 | SmallVector<const SCEV*, 2> Ends; |
689 | /// Holds whether this pointer is used for writing to memory. |
690 | SmallVector<bool, 2> IsWritePtr; |
691 | /// Holds the id of the set of pointers that could be dependent because of a |
692 | /// shared underlying object. |
693 | SmallVector<unsigned, 2> DependencySetId; |
694 | /// Holds the id of the disjoint alias set to which this pointer belongs. |
695 | SmallVector<unsigned, 2> AliasSetId; |
696 | }; |
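
In source form, the runtime check that this struct describes amounts to a disjointness test per pointer pair, with the start and end addresses playing the role of the Starts/Ends SCEVs above (a sketch, with addresses already computed):

    #include <cstdint>

    // Two ranges are disjoint if one ends before the other begins; the
    // vector loop is taken only when this holds for every checked pair.
    bool disjoint(std::uintptr_t AStart, std::uintptr_t AEnd,
                  std::uintptr_t BStart, std::uintptr_t BEnd) {
      return AEnd <= BStart || BEnd <= AStart;
    }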
697 | |
698 | /// A struct for saving information about induction variables. |
699 | struct InductionInfo { |
700 | InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} |
701 | InductionInfo() : StartValue(nullptr), IK(IK_NoInduction) {} |
702 | /// Start value. |
703 | TrackingVH<Value> StartValue; |
704 | /// Induction kind. |
705 | InductionKind IK; |
706 | }; |
707 | |
708 | /// ReductionList contains the reduction descriptors for all |
709 | /// of the reductions that were found in the loop. |
710 | typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList; |
711 | |
712 | /// InductionList saves induction variables and maps them to the |
713 | /// induction descriptor. |
714 | typedef MapVector<PHINode*, InductionInfo> InductionList; |
715 | |
716 | /// Returns true if it is legal to vectorize this loop. |
717 | /// This does not mean that it is profitable to vectorize this |
718 | /// loop, only that it is legal to do so. |
719 | bool canVectorize(); |
720 | |
721 | /// Returns the Induction variable. |
722 | PHINode *getInduction() { return Induction; } |
723 | |
724 | /// Returns the reduction variables found in the loop. |
725 | ReductionList *getReductionVars() { return &Reductions; } |
726 | |
727 | /// Returns the induction variables found in the loop. |
728 | InductionList *getInductionVars() { return &Inductions; } |
729 | |
730 | /// Returns the widest induction type. |
731 | Type *getWidestInductionType() { return WidestIndTy; } |
732 | |
733 | /// Returns True if V is an induction variable in this loop. |
734 | bool isInductionVariable(const Value *V); |
735 | |
736 | /// Return true if the block BB needs to be predicated in order for the loop |
737 | /// to be vectorized. |
738 | bool blockNeedsPredication(BasicBlock *BB); |
739 | |
740 | /// Check if this pointer is consecutive when vectorizing. This happens |
741 | /// when the last index of the GEP is the induction variable, or when the |
742 | /// pointer itself is an induction variable. |
743 | /// This check allows us to vectorize A[idx] into a wide load/store. |
744 | /// Returns: |
745 | /// 0 - Stride is unknown or non-consecutive. |
746 | /// 1 - Address is consecutive. |
747 | /// -1 - Address is consecutive, and decreasing. |
748 | int isConsecutivePtr(Value *Ptr); |
749 | |
750 | /// Returns true if the value V is uniform within the loop. |
751 | bool isUniform(Value *V); |
752 | |
753 | /// Returns true if this instruction will remain scalar after vectorization. |
754 | bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); } |
755 | |
756 | /// Returns the information that we collected about runtime memory check. |
757 | RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; } |
758 | |
759 | /// This function returns the identity element (or neutral element) for |
760 | /// the operation K. |
761 | static Constant *getReductionIdentity(ReductionKind K, Type *Tp); |
762 | |
763 | unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } |
764 | |
765 | bool hasStride(Value *V) { return StrideSet.count(V); } |
766 | bool mustCheckStrides() { return !StrideSet.empty(); } |
767 | SmallPtrSet<Value *, 8>::iterator strides_begin() { |
768 | return StrideSet.begin(); |
769 | } |
770 | SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); } |
771 | |
772 | /// Returns true if the target machine supports masked store operation |
773 | /// for the given \p DataType and kind of access to \p Ptr. |
774 | bool isLegalMaskedStore(Type *DataType, Value *Ptr) { |
775 | return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr)); |
776 | } |
777 | /// Returns true if the target machine supports masked load operation |
778 | /// for the given \p DataType and kind of access to \p Ptr. |
779 | bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { |
780 | return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr)); |
781 | } |
782 | /// Returns true if vector representation of the instruction \p I |
783 | /// requires mask. |
784 | bool isMaskRequired(const Instruction* I) { |
785 | return (MaskedOp.count(I) != 0); |
786 | } |
787 | private: |
788 | /// Check if a single basic block loop is vectorizable. |
789 | /// At this point we know that this is a loop with a constant trip count |
790 | /// and we only need to check individual instructions. |
791 | bool canVectorizeInstrs(); |
792 | |
793 | /// When we vectorize loops we may change the order in which |
794 | /// we read and write from memory. This method checks if it is |
795 | /// legal to vectorize the code, considering only memory constraints. |
796 | /// Returns true if the loop is vectorizable. |
797 | bool canVectorizeMemory(); |
798 | |
799 | /// Return true if we can vectorize this loop using the IF-conversion |
800 | /// transformation. |
801 | bool canVectorizeWithIfConvert(); |
802 | |
803 | /// Collect the variables that need to stay uniform after vectorization. |
804 | void collectLoopUniforms(); |
805 | |
806 | /// Return true if all of the instructions in the block can be speculatively |
807 | /// executed. \p SafePtrs is a list of addresses that are known to be legal |
808 | /// and we know that we can read from them without segfault. |
809 | bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs); |
810 | |
811 | /// Returns true if 'Phi' is the kind of reduction variable for type |
812 | /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. |
813 | bool AddReductionVar(PHINode *Phi, ReductionKind Kind); |
814 | /// Returns a struct describing if the instruction 'I' can be a reduction |
815 | /// variable of type 'Kind'. If the reduction is a min/max pattern of |
816 | /// select(icmp()) this function advances the instruction pointer 'I' from the |
817 | /// compare instruction to the select instruction and stores this pointer in |
818 | /// 'PatternLastInst' member of the returned struct. |
819 | ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind, |
820 | ReductionInstDesc &Desc); |
821 | /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction |
822 | /// pattern corresponding to a min(X, Y) or max(X, Y). |
823 | static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I, |
824 | ReductionInstDesc &Prev); |
825 | /// Returns the induction kind of Phi. This function may return NoInduction |
826 | /// if the PHI is not an induction variable. |
827 | InductionKind isInductionVariable(PHINode *Phi); |
828 | |
829 | /// \brief Collect memory accesses with loop-invariant strides. |
830 | /// |
831 | /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop |
832 | /// invariant. |
833 | void collectStridedAcccess(Value *LoadOrStoreInst); |
834 | |
835 | /// Report an analysis message to assist the user in diagnosing loops that are |
836 | /// not vectorized. |
837 | void emitAnalysis(Report &Message) { |
838 | DebugLoc DL = TheLoop->getStartLoc(); |
839 | if (Instruction *I = Message.getInstr()) |
840 | DL = I->getDebugLoc(); |
841 | emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE, |
842 | *TheFunction, DL, Message.str()); |
843 | } |
844 | |
845 | /// The loop that we evaluate. |
846 | Loop *TheLoop; |
847 | /// Scev analysis. |
848 | ScalarEvolution *SE; |
849 | /// DataLayout analysis. |
850 | const DataLayout *DL; |
851 | /// Dominators. |
852 | DominatorTree *DT; |
853 | /// Target Library Info. |
854 | TargetLibraryInfo *TLI; |
855 | /// Alias analysis. |
856 | AliasAnalysis *AA; |
857 | /// Parent function |
858 | Function *TheFunction; |
859 | /// Target Transform Info |
860 | const TargetTransformInfo *TTI; |
861 | |
862 | // --- vectorization state --- // |
863 | |
864 | /// Holds the integer induction variable. This is the counter of the |
865 | /// loop. |
866 | PHINode *Induction; |
867 | /// Holds the reduction variables. |
868 | ReductionList Reductions; |
869 | /// Holds all of the induction variables that we found in the loop. |
870 | /// Notice that inductions don't need to start at zero and that induction |
871 | /// variables can be pointers. |
872 | InductionList Inductions; |
873 | /// Holds the widest induction type encountered. |
874 | Type *WidestIndTy; |
875 | |
876 | /// Allowed outside users. This holds the reduction |
877 | /// vars which can be accessed from outside the loop. |
878 | SmallPtrSet<Value*, 4> AllowedExit; |
879 | /// This set holds the variables which are known to be uniform after |
880 | /// vectorization. |
881 | SmallPtrSet<Instruction*, 4> Uniforms; |
882 | /// We need to check that all of the pointers in this list are disjoint |
883 | /// at runtime. |
884 | RuntimePointerCheck PtrRtCheck; |
885 | /// Can we assume the absence of NaNs. |
886 | bool HasFunNoNaNAttr; |
887 | |
888 | unsigned MaxSafeDepDistBytes; |
889 | |
890 | ValueToValueMap Strides; |
891 | SmallPtrSet<Value *, 8> StrideSet; |
892 | |
893 | /// While vectorizing these instructions we have to generate a |
894 | /// call to the appropriate masked intrinsic |
895 | SmallPtrSet<const Instruction*, 8> MaskedOp; |
896 | }; |
897 | |
898 | /// LoopVectorizationCostModel - estimates the expected speedups due to |
899 | /// vectorization. |
900 | /// In many cases vectorization is not profitable. This can happen for a |
901 | /// number of reasons. In this class we mainly attempt to predict the |
902 | /// expected speedup/slowdowns due to the supported instruction set. We use the |
903 | /// TargetTransformInfo to query the different backends for the cost of |
904 | /// different operations. |
905 | class LoopVectorizationCostModel { |
906 | public: |
907 | LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, |
908 | LoopVectorizationLegality *Legal, |
909 | const TargetTransformInfo &TTI, |
910 | const DataLayout *DL, const TargetLibraryInfo *TLI, |
911 | AssumptionTracker *AT, const Function *F, |
912 | const LoopVectorizeHints *Hints) |
913 | : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI), |
914 | TheFunction(F), Hints(Hints) { |
915 | CodeMetrics::collectEphemeralValues(L, AT, EphValues); |
916 | } |
917 | |
918 | /// Information about vectorization costs |
919 | struct VectorizationFactor { |
920 | unsigned Width; // Vector width with best cost |
921 | unsigned Cost; // Cost of the loop with that width |
922 | }; |
923 | /// \return The most profitable vectorization factor and the cost of that VF. |
924 | /// This method checks every power of two up to VF. If UserVF is not ZERO |
925 | /// then this vectorization factor will be selected if vectorization is |
926 | /// possible. |
927 | VectorizationFactor selectVectorizationFactor(bool OptForSize); |
928 | |
929 | /// \return The size (in bits) of the widest type in the code that |
930 | /// needs to be vectorized. We ignore values that remain scalar such as |
931 | /// 64 bit loop indices. |
932 | unsigned getWidestType(); |
933 | |
934 | /// \return The most profitable unroll factor. |
935 | /// If UserUF is non-zero then this method finds the best unroll-factor |
936 | /// based on register pressure and other parameters. |
937 | /// VF and LoopCost are the selected vectorization factor and the cost of the |
938 | /// selected VF. |
939 | unsigned selectUnrollFactor(bool OptForSize, unsigned VF, unsigned LoopCost); |
940 | |
941 | /// \brief A struct that represents some properties of the register usage |
942 | /// of a loop. |
943 | struct RegisterUsage { |
944 | /// Holds the number of loop invariant values that are used in the loop. |
945 | unsigned LoopInvariantRegs; |
946 | /// Holds the maximum number of concurrent live intervals in the loop. |
947 | unsigned MaxLocalUsers; |
948 | /// Holds the number of instructions in the loop. |
949 | unsigned NumInstructions; |
950 | }; |
951 | |
952 | /// \return information about the register usage of the loop. |
953 | RegisterUsage calculateRegisterUsage(); |
954 | |
955 | private: |
956 | /// Returns the expected execution cost. The unit of the cost does |
957 | /// not matter because we use the 'cost' units to compare different |
958 | /// vector widths. The cost that is returned is *not* normalized by |
959 | /// the factor width. |
960 | unsigned expectedCost(unsigned VF); |
961 | |
962 | /// Returns the execution time cost of an instruction for a given vector |
963 | /// width. Vector width of one means scalar. |
964 | unsigned getInstructionCost(Instruction *I, unsigned VF); |
965 | |
966 | /// A helper function for converting Scalar types to vector types. |
967 | /// If the incoming type is void, we return void. If the VF is 1, we return |
968 | /// the scalar type. |
969 | static Type* ToVectorTy(Type *Scalar, unsigned VF); |
970 | |
971 | /// Returns whether the instruction is a load or store and will be emitted |
972 | /// as a vector operation. |
973 | bool isConsecutiveLoadOrStore(Instruction *I); |
974 | |
975 | /// Report an analysis message to assist the user in diagnosing loops that are |
976 | /// not vectorized. |
977 | void emitAnalysis(Report &Message) { |
978 | DebugLoc DL = TheLoop->getStartLoc(); |
979 | if (Instruction *I = Message.getInstr()) |
980 | DL = I->getDebugLoc(); |
981 | emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE, |
982 | *TheFunction, DL, Message.str()); |
983 | } |
984 | |
985 | /// Values used only by @llvm.assume calls. |
986 | SmallPtrSet<const Value *, 32> EphValues; |
987 | |
988 | /// The loop that we evaluate. |
989 | Loop *TheLoop; |
990 | /// Scev analysis. |
991 | ScalarEvolution *SE; |
992 | /// Loop Info analysis. |
993 | LoopInfo *LI; |
994 | /// Vectorization legality. |
995 | LoopVectorizationLegality *Legal; |
996 | /// Vector target information. |
997 | const TargetTransformInfo &TTI; |
998 | /// Target data layout information. |
999 | const DataLayout *DL; |
1000 | /// Target Library Info. |
1001 | const TargetLibraryInfo *TLI; |
1002 | const Function *TheFunction; |
1003 | // Loop Vectorize Hint. |
1004 | const LoopVectorizeHints *Hints; |
1005 | }; |
1006 | |
1007 | /// Utility class for getting and setting loop vectorizer hints in the form |
1008 | /// of loop metadata. |
1009 | /// This class keeps a number of loop annotations locally (as member variables) |
1010 | /// and can, upon request, write them back as metadata on the loop. It will |
1011 | /// initially scan the loop for existing metadata, and will update the local |
1012 | /// values based on information in the loop. |
1013 | /// We cannot write all values to metadata, as the mere presence of some info, |
1014 | /// for example 'force', means a decision has been made. So, we need to be |
1015 | /// careful NOT to add them if the user hasn't specifically asked for them. |
1016 | class LoopVectorizeHints { |
1017 | enum HintKind { |
1018 | HK_WIDTH, |
1019 | HK_UNROLL, |
1020 | HK_FORCE |
1021 | }; |
1022 | |
1023 | /// Hint - associates name and validation with the hint value. |
1024 | struct Hint { |
1025 | const char * Name; |
1026 | unsigned Value; // This may have to change for non-numeric values. |
1027 | HintKind Kind; |
1028 | |
1029 | Hint(const char * Name, unsigned Value, HintKind Kind) |
1030 | : Name(Name), Value(Value), Kind(Kind) { } |
1031 | |
1032 | bool validate(unsigned Val) { |
1033 | switch (Kind) { |
1034 | case HK_WIDTH: |
1035 | return isPowerOf2_32(Val) && Val <= MaxVectorWidth; |
1036 | case HK_UNROLL: |
1037 | return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; |
1038 | case HK_FORCE: |
1039 | return (Val <= 1); |
1040 | } |
1041 | return false; |
1042 | } |
1043 | }; |
1044 | |
1045 | /// Vectorization width. |
1046 | Hint Width; |
1047 | /// Vectorization interleave factor. |
1048 | Hint Interleave; |
1049 | /// Vectorization forced |
1050 | Hint Force; |
1051 | |
1052 | /// Return the loop metadata prefix. |
1053 | static StringRef Prefix() { return "llvm.loop."; } |
1054 | |
1055 | public: |
1056 | enum ForceKind { |
1057 | FK_Undefined = -1, ///< Not selected. |
1058 | FK_Disabled = 0, ///< Forcing disabled. |
1059 | FK_Enabled = 1, ///< Forcing enabled. |
1060 | }; |
1061 | |
1062 | LoopVectorizeHints(const Loop *L, bool DisableInterleaving) |
1063 | : Width("vectorize.width", VectorizationFactor, HK_WIDTH), |
1064 | Interleave("interleave.count", DisableInterleaving, HK_UNROLL), |
1065 | Force("vectorize.enable", FK_Undefined, HK_FORCE), |
1066 | TheLoop(L) { |
1067 | // Populate values with existing loop metadata. |
1068 | getHintsFromMetadata(); |
1069 | |
1070 | // force-vector-interleave overrides DisableInterleaving. |
1071 | if (VectorizationInterleave.getNumOccurrences() > 0) |
1072 | Interleave.Value = VectorizationInterleave; |
1073 | |
1074 | DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() |
1075 | << "LV: Interleaving disabled by the pass manager\n"); |
1076 | } |
1077 | |
1078 | /// Mark the loop L as already vectorized by setting the width to 1. |
1079 | void setAlreadyVectorized() { |
1080 | Width.Value = Interleave.Value = 1; |
1081 | Hint Hints[] = {Width, Interleave}; |
1082 | writeHintsToMetadata(Hints); |
1083 | } |
1084 | |
1085 | /// Dumps all the hint information. |
1086 | std::string emitRemark() const { |
1087 | Report R; |
1088 | if (Force.Value == LoopVectorizeHints::FK_Disabled) |
1089 | R << "vectorization is explicitly disabled"; |
1090 | else { |
1091 | R << "use -Rpass-analysis=loop-vectorize for more info"; |
1092 | if (Force.Value == LoopVectorizeHints::FK_Enabled) { |
1093 | R << " (Force=true"; |
1094 | if (Width.Value != 0) |
1095 | R << ", Vector Width=" << Width.Value; |
1096 | if (Interleave.Value != 0) |
1097 | R << ", Interleave Count=" << Interleave.Value; |
1098 | R << ")"; |
1099 | } |
1100 | } |
1101 | |
1102 | return R.str(); |
1103 | } |
1104 | |
1105 | unsigned getWidth() const { return Width.Value; } |
1106 | unsigned getInterleave() const { return Interleave.Value; } |
1107 | enum ForceKind getForce() const { return (ForceKind)Force.Value; } |
1108 | |
1109 | private: |
1110 | /// Find hints specified in the loop metadata and update local values. |
1111 | void getHintsFromMetadata() { |
1112 | MDNode *LoopID = TheLoop->getLoopID(); |
1113 | if (!LoopID) |
1114 | return; |
1115 | |
1116 | // First operand should refer to the loop id itself. |
1117 | assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); |
1118 | assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); |
1119 | |
1120 | for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { |
1121 | const MDString *S = nullptr; |
1122 | SmallVector<Metadata *, 4> Args; |
1123 | |
1124 | // The expected hint is either an MDString or an MDNode with the first |
1125 | // operand an MDString. |
1126 | if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) { |
1127 | if (!MD || MD->getNumOperands() == 0) |
1128 | continue; |
1129 | S = dyn_cast<MDString>(MD->getOperand(0)); |
1130 | for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) |
1131 | Args.push_back(MD->getOperand(i)); |
1132 | } else { |
1133 | S = dyn_cast<MDString>(LoopID->getOperand(i)); |
1134 | assert(Args.size() == 0 && "too many arguments for MDString"); |
1135 | } |
1136 | |
1137 | if (!S) |
1138 | continue; |
1139 | |
1140 | // Check if the hint starts with the loop metadata prefix. |
1141 | StringRef Name = S->getString(); |
1142 | if (Args.size() == 1) |
1143 | setHint(Name, Args[0]); |
1144 | } |
1145 | } |
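
These hints typically originate from source-level pragmas. Assuming Clang as the frontend, the following attaches "llvm.loop.vectorize.width" and "llvm.loop.interleave.count" operands to the loop's metadata node, which the walk above then matches against the hint names:

    void scale(float *a, int n) {
    #pragma clang loop vectorize_width(4) interleave_count(2)
      for (int i = 0; i < n; ++i)
        a[i] *= 2.0f;
    }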
1146 | |
1147 | /// Checks string hint with one operand and set value if valid. |
1148 | void setHint(StringRef Name, Metadata *Arg) { |
1149 | if (!Name.startswith(Prefix())) |
1150 | return; |
1151 | Name = Name.substr(Prefix().size(), StringRef::npos); |
1152 | |
1153 | const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg); |
1154 | if (!C) return; |
1155 | unsigned Val = C->getZExtValue(); |
1156 | |
1157 | Hint *Hints[] = {&Width, &Interleave, &Force}; |
1158 | for (auto H : Hints) { |
1159 | if (Name == H->Name) { |
1160 | if (H->validate(Val)) |
1161 | H->Value = Val; |
1162 | else |
1163 | DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"; } } while (0); |
1164 | break; |
1165 | } |
1166 | } |
1167 | } |
1168 | |
1169 | /// Create a new hint from name / value pair. |
1170 | MDNode *createHintMetadata(StringRef Name, unsigned V) const { |
1171 | LLVMContext &Context = TheLoop->getHeader()->getContext(); |
1172 | Metadata *MDs[] = {MDString::get(Context, Name), |
1173 | ConstantAsMetadata::get( |
1174 | ConstantInt::get(Type::getInt32Ty(Context), V))}; |
1175 | return MDNode::get(Context, MDs); |
1176 | } |
1177 | |
1178 | /// Matches metadata with hint name. |
1179 | bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) { |
1180 | MDString* Name = dyn_cast<MDString>(Node->getOperand(0)); |
1181 | if (!Name) |
1182 | return false; |
1183 | |
1184 | for (auto H : HintTypes) |
1185 | if (Name->getString().endswith(H.Name)) |
1186 | return true; |
1187 | return false; |
1188 | } |
1189 | |
1190 | /// Sets current hints into loop metadata, keeping other values intact. |
1191 | void writeHintsToMetadata(ArrayRef<Hint> HintTypes) { |
1192 | if (HintTypes.size() == 0) |
1193 | return; |
1194 | |
1195 | // Reserve the first element to LoopID (see below). |
1196 | SmallVector<Metadata *, 4> MDs(1); |
1197 | // If the loop already has metadata, then ignore the existing operands. |
1198 | MDNode *LoopID = TheLoop->getLoopID(); |
1199 | if (LoopID) { |
1200 | for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { |
1201 | MDNode *Node = cast<MDNode>(LoopID->getOperand(i)); |
1202 | // If node in update list, ignore old value. |
1203 | if (!matchesHintMetadataName(Node, HintTypes)) |
1204 | MDs.push_back(Node); |
1205 | } |
1206 | } |
1207 | |
1208 | // Now, add the missing hints. |
1209 | for (auto H : HintTypes) |
1210 | MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); |
1211 | |
1212 | // Replace current metadata node with new one. |
1213 | LLVMContext &Context = TheLoop->getHeader()->getContext(); |
1214 | MDNode *NewLoopID = MDNode::get(Context, MDs); |
1215 | // Set operand 0 to refer to the loop id itself. |
1216 | NewLoopID->replaceOperandWith(0, NewLoopID); |
1217 | |
1218 | TheLoop->setLoopID(NewLoopID); |
1219 | LoopID = NewLoopID; |
Value stored to 'LoopID' is never read | |
1220 | } |
1221 | |
1222 | /// The loop these hints belong to. |
1223 | const Loop *TheLoop; |
1224 | }; |
1225 | |
1226 | static void emitMissedWarning(Function *F, Loop *L, |
1227 | const LoopVectorizeHints &LH) { |
1228 | emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F, |
1229 | L->getStartLoc(), LH.emitRemark()); |
1230 | |
1231 | if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { |
1232 | if (LH.getWidth() != 1) |
1233 | emitLoopVectorizeWarning( |
1234 | F->getContext(), *F, L->getStartLoc(), |
1235 | "failed explicitly specified loop vectorization"); |
1236 | else if (LH.getInterleave() != 1) |
1237 | emitLoopInterleaveWarning( |
1238 | F->getContext(), *F, L->getStartLoc(), |
1239 | "failed explicitly specified loop interleaving"); |
1240 | } |
1241 | } |
1242 | |
1243 | static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) { |
1244 | if (L.empty()) |
1245 | return V.push_back(&L); |
1246 | |
1247 | for (Loop *InnerL : L) |
1248 | addInnerLoop(*InnerL, V); |
1249 | } |
1250 | |
1251 | /// The LoopVectorize Pass. |
1252 | struct LoopVectorize : public FunctionPass { |
1253 | /// Pass identification, replacement for typeid |
1254 | static char ID; |
1255 | |
1256 | explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true) |
1257 | : FunctionPass(ID), |
1258 | DisableUnrolling(NoUnrolling), |
1259 | AlwaysVectorize(AlwaysVectorize) { |
1260 | initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); |
1261 | } |
1262 | |
1263 | ScalarEvolution *SE; |
1264 | const DataLayout *DL; |
1265 | LoopInfo *LI; |
1266 | TargetTransformInfo *TTI; |
1267 | DominatorTree *DT; |
1268 | BlockFrequencyInfo *BFI; |
1269 | TargetLibraryInfo *TLI; |
1270 | AliasAnalysis *AA; |
1271 | AssumptionTracker *AT; |
1272 | bool DisableUnrolling; |
1273 | bool AlwaysVectorize; |
1274 | |
1275 | BlockFrequency ColdEntryFreq; |
1276 | |
1277 | bool runOnFunction(Function &F) override { |
1278 | SE = &getAnalysis<ScalarEvolution>(); |
1279 | DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); |
1280 | DL = DLP ? &DLP->getDataLayout() : nullptr; |
1281 | LI = &getAnalysis<LoopInfo>(); |
1282 | TTI = &getAnalysis<TargetTransformInfo>(); |
1283 | DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
1284 | BFI = &getAnalysis<BlockFrequencyInfo>(); |
1285 | TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); |
1286 | AA = &getAnalysis<AliasAnalysis>(); |
1287 | AT = &getAnalysis<AssumptionTracker>(); |
1288 | |
1289 | // Compute some weights outside of the loop over the loops. Compute this |
1290 | // using a BranchProbability to re-use its scaling math. |
1291 | const BranchProbability ColdProb(1, 5); // 20% |
1292 | ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb; |
1293 | |
1294 | // If the target claims to have no vector registers don't attempt |
1295 | // vectorization. |
1296 | if (!TTI->getNumberOfRegisters(true)) |
1297 | return false; |
1298 | |
1299 | if (!DL) { |
1300 | DEBUG(dbgs() << "\nLV: Not vectorizing " << F.getName()
1301 | << ": Missing data layout\n");
1302 | return false; |
1303 | } |
1304 | |
1305 | // Build up a worklist of inner-loops to vectorize. This is necessary as |
1306 | // the act of vectorizing or partially unrolling a loop creates new loops |
1307 | // and can invalidate iterators across the loops. |
1308 | SmallVector<Loop *, 8> Worklist; |
1309 | |
1310 | for (Loop *L : *LI) |
1311 | addInnerLoop(*L, Worklist); |
1312 | |
1313 | LoopsAnalyzed += Worklist.size(); |
1314 | |
1315 | // Now walk the identified inner loops. |
1316 | bool Changed = false; |
1317 | while (!Worklist.empty()) |
1318 | Changed |= processLoop(Worklist.pop_back_val()); |
1319 | |
1320 | // Process each loop nest in the function. |
1321 | return Changed; |
1322 | } |
1323 | |
1324 | bool processLoop(Loop *L) { |
1325 | assert(L->empty() && "Only process inner loops.");
1326 | |
1327 | #ifndef NDEBUG |
1328 | const std::string DebugLocStr = getDebugLocString(L); |
1329 | #endif /* NDEBUG */ |
1330 | |
1331 | DEBUG(dbgs() << "\nLV: Checking a loop in \""
1332 | << L->getHeader()->getParent()->getName() << "\" from "
1333 | << DebugLocStr << "\n");
1334 | |
1335 | LoopVectorizeHints Hints(L, DisableUnrolling); |
1336 | |
1337 | DEBUG(dbgs() << "LV: Loop hints:"
1338 | << " force="
1339 | << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
1340 | ? "disabled"
1341 | : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
1342 | ? "enabled"
1343 | : "?")) << " width=" << Hints.getWidth()
1344 | << " unroll=" << Hints.getInterleave() << "\n");
1345 | |
1346 | // Function containing loop |
1347 | Function *F = L->getHeader()->getParent(); |
1348 | |
1349 | // Looking at the diagnostic output is the only way to determine if a loop |
1350 | // was vectorized (other than looking at the IR or machine code), so it |
1351 | // is important to generate an optimization remark for each loop. Most of |
1352 | // these messages are generated by emitOptimizationRemarkAnalysis. Remarks |
1353 | // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are |
1354 | // less verbose reporting vectorized loops and unvectorized loops that may |
1355 | // benefit from vectorization, respectively. |
1356 | |
1357 | if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) { |
1358 | DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
1359 | emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
1360 | L->getStartLoc(), Hints.emitRemark()); |
1361 | return false; |
1362 | } |
1363 | |
1364 | if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) { |
1365 | DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
1366 | emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
1367 | L->getStartLoc(), Hints.emitRemark()); |
1368 | return false; |
1369 | } |
1370 | |
1371 | if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) { |
1372 | DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
1373 | emitOptimizationRemarkAnalysis( |
1374 | F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
1375 | "loop not vectorized: vector width and interleave count are " |
1376 | "explicitly set to 1"); |
1377 | return false; |
1378 | } |
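// For context, these hints usually originate from source-level pragmas, e.g.
// (illustrative):
//   #pragma clang loop vectorize_width(1) interleave_count(1)
// Width and interleave both equal to 1 is also the encoding that
// setAlreadyVectorized() writes back after a successful transformation.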
1379 | |
1380 | // Check the loop for a trip count threshold: |
1381 | // do not vectorize loops with a tiny trip count. |
1382 | const unsigned TC = SE->getSmallConstantTripCount(L); |
1383 | if (TC > 0u && TC < TinyTripCountVectorThreshold) { |
1384 | DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
1385 | << "This loop is not worth vectorizing.");
1386 | if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) |
1387 | DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
1388 | else { |
1389 | DEBUG(dbgs() << "\n");
1390 | emitOptimizationRemarkAnalysis( |
1391 | F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
1392 | "vectorization is not beneficial and is not explicitly forced"); |
1393 | return false; |
1394 | } |
1395 | } |
1396 | |
1397 | // Check if it is legal to vectorize the loop. |
1398 | LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F, TTI); |
1399 | if (!LVL.canVectorize()) { |
1400 | DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
1401 | emitMissedWarning(F, L, Hints); |
1402 | return false; |
1403 | } |
1404 | |
1405 | // Use the cost model. |
1406 | LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, AT, F, |
1407 | &Hints); |
1408 | |
1409 | // Check the function attributes to find out if this function should be |
1410 | // optimized for size. |
1411 | bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled && |
1412 | F->hasFnAttribute(Attribute::OptimizeForSize); |
1413 | |
1414 | // Compute the weighted frequency of this loop being executed and see if it |
1415 | // is less than 20% of the function entry baseline frequency. Note that we |
1416 | // always have a canonical loop here because we think we *can* vectorize.
1417 | // FIXME: This is hidden behind a flag due to pervasive problems with |
1418 | // exactly what block frequency models. |
1419 | if (LoopVectorizeWithBlockFrequency) { |
1420 | BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader()); |
1421 | if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && |
1422 | LoopEntryFreq < ColdEntryFreq) |
1423 | OptForSize = true; |
1424 | } |
1425 | |
1426 | // Check the function attributes to see if implicit floats are allowed.
1427 | // FIXME: This check doesn't seem possibly correct -- what if the loop is |
1428 | // an integer loop and the vector instructions selected are purely integer |
1429 | // vector instructions? |
1430 | if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { |
1431 | DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
1432 | " attribute is used.\n");
1433 | emitOptimizationRemarkAnalysis( |
1434 | F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
1435 | "loop not vectorized due to NoImplicitFloat attribute"); |
1436 | emitMissedWarning(F, L, Hints); |
1437 | return false; |
1438 | } |
1439 | |
1440 | // Select the optimal vectorization factor. |
1441 | const LoopVectorizationCostModel::VectorizationFactor VF = |
1442 | CM.selectVectorizationFactor(OptForSize); |
1443 | |
1444 | // Select the unroll factor. |
1445 | const unsigned UF = |
1446 | CM.selectUnrollFactor(OptForSize, VF.Width, VF.Cost); |
1447 | |
1448 | DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
1449 | << DebugLocStr << '\n');
1450 | DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');
1451 | |
1452 | if (VF.Width == 1) { |
1453 | DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n");
1454 | |
1455 | if (UF == 1) { |
1456 | emitOptimizationRemarkAnalysis( |
1457 | F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
1458 | "not beneficial to vectorize and user disabled interleaving"); |
1459 | return false; |
1460 | } |
1461 | DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n");
1462 | |
1463 | // Report the unrolling decision. |
1464 | emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
1465 | Twine("unrolled with interleaving factor " + |
1466 | Twine(UF) + |
1467 | " (vectorization not beneficial)")); |
1468 | |
1469 | // We decided not to vectorize, but we may want to unroll. |
1470 | |
1471 | InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF); |
1472 | Unroller.vectorize(&LVL); |
1473 | } else { |
1474 | // If we decided that it is *legal* to vectorize the loop then do it. |
1475 | InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF); |
1476 | LB.vectorize(&LVL); |
1477 | ++LoopsVectorized; |
1478 | |
1479 | // Report the vectorization decision. |
1480 | emitOptimizationRemark( |
1481 | F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
1482 | Twine("vectorized loop (vectorization factor: ") + Twine(VF.Width) + |
1483 | ", unrolling interleave factor: " + Twine(UF) + ")"); |
1484 | } |
1485 | |
1486 | // Mark the loop as already vectorized to avoid vectorizing again. |
1487 | Hints.setAlreadyVectorized(); |
1488 | |
1489 | DEBUG(verifyFunction(*L->getHeader()->getParent()));
1490 | return true; |
1491 | } |
1492 | |
1493 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
1494 | AU.addRequired<AssumptionTracker>(); |
1495 | AU.addRequiredID(LoopSimplifyID); |
1496 | AU.addRequiredID(LCSSAID); |
1497 | AU.addRequired<BlockFrequencyInfo>(); |
1498 | AU.addRequired<DominatorTreeWrapperPass>(); |
1499 | AU.addRequired<LoopInfo>(); |
1500 | AU.addRequired<ScalarEvolution>(); |
1501 | AU.addRequired<TargetTransformInfo>(); |
1502 | AU.addRequired<AliasAnalysis>(); |
1503 | AU.addPreserved<LoopInfo>(); |
1504 | AU.addPreserved<DominatorTreeWrapperPass>(); |
1505 | AU.addPreserved<AliasAnalysis>(); |
1506 | } |
1507 | |
1508 | }; |
1509 | |
1510 | } // end anonymous namespace |
1511 | |
1512 | //===----------------------------------------------------------------------===// |
1513 | // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and |
1514 | // LoopVectorizationCostModel. |
1515 | //===----------------------------------------------------------------------===// |
1516 | |
1517 | static Value *stripIntegerCast(Value *V) { |
1518 | if (CastInst *CI = dyn_cast<CastInst>(V)) |
1519 | if (CI->getOperand(0)->getType()->isIntegerTy()) |
1520 | return CI->getOperand(0); |
1521 | return V; |
1522 | } |
1523 | |
1524 | ///\brief Replaces the symbolic stride in a pointer SCEV expression by one. |
1525 | /// |
1526 | /// If \p OrigPtr is not null, use it to look up the stride value instead of |
1527 | /// \p Ptr. |
1528 | static const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE, |
1529 | ValueToValueMap &PtrToStride, |
1530 | Value *Ptr, Value *OrigPtr = nullptr) { |
1531 | |
1532 | const SCEV *OrigSCEV = SE->getSCEV(Ptr); |
1533 | |
1534 | // If there is an entry in the map return the SCEV of the pointer with the |
1535 | // symbolic stride replaced by one. |
1536 | ValueToValueMap::iterator SI = PtrToStride.find(OrigPtr ? OrigPtr : Ptr); |
1537 | if (SI != PtrToStride.end()) { |
1538 | Value *StrideVal = SI->second; |
1539 | |
1540 | // Strip casts. |
1541 | StrideVal = stripIntegerCast(StrideVal); |
1542 | |
1543 | // Replace symbolic stride by one. |
1544 | Value *One = ConstantInt::get(StrideVal->getType(), 1); |
1545 | ValueToValueMap RewriteMap; |
1546 | RewriteMap[StrideVal] = One; |
1547 | |
1548 | const SCEV *ByOne = |
1549 | SCEVParameterRewriter::rewrite(OrigSCEV, *SE, RewriteMap, true); |
1550 | DEBUG(dbgs() << "LV: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOne
1551 | << "\n");
1552 | return ByOne; |
1553 | } |
1554 | |
1555 | // Otherwise, just return the SCEV of the original pointer. |
1556 | return SE->getSCEV(Ptr); |
1557 | } |
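// Illustrative example (shapes assumed): for a pointer whose SCEV is
//   {%A,+,(4 * %Stride)}<%loop>
// and a stride map entry for %Stride, the rewrite above yields
//   {%A,+,4}<%loop>
// i.e. a unit-stride recurrence that the consecutive-access checks below can
// reason about once a runtime guard proves %Stride == 1.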
1558 | |
1559 | void LoopVectorizationLegality::RuntimePointerCheck::insert( |
1560 | ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, |
1561 | unsigned ASId, ValueToValueMap &Strides) { |
1562 | // Get the stride replaced scev. |
1563 | const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr); |
1564 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); |
1565 | assert(AR && "Invalid addrec expression");
1566 | const SCEV *Ex = SE->getBackedgeTakenCount(Lp); |
1567 | const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); |
1568 | Pointers.push_back(Ptr); |
1569 | Starts.push_back(AR->getStart()); |
1570 | Ends.push_back(ScEnd); |
1571 | IsWritePtr.push_back(WritePtr); |
1572 | DependencySetId.push_back(DepSetId); |
1573 | AliasSetId.push_back(ASId); |
1574 | } |
1575 | |
1576 | Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { |
1577 | // We need to place the broadcast of invariant variables outside the loop. |
1578 | Instruction *Instr = dyn_cast<Instruction>(V); |
1579 | bool NewInstr = |
1580 | (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(), |
1581 | Instr->getParent()) != LoopVectorBody.end()); |
1582 | bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr; |
1583 | |
1584 | // Place the code for broadcasting invariant variables in the new preheader. |
1585 | IRBuilder<>::InsertPointGuard Guard(Builder); |
1586 | if (Invariant) |
1587 | Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); |
1588 | |
1589 | // Broadcast the scalar into all locations in the vector. |
1590 | Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); |
1591 | |
1592 | return Shuf; |
1593 | } |
1594 | |
1595 | Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, |
1596 | bool Negate) { |
1597 | assert(Val->getType()->isVectorTy() && "Must be a vector");
1598 | assert(Val->getType()->getScalarType()->isIntegerTy() &&
1599 | "Elem must be an integer");
1600 | // Create the types. |
1601 | Type *ITy = Val->getType()->getScalarType(); |
1602 | VectorType *Ty = cast<VectorType>(Val->getType()); |
1603 | int VLen = Ty->getNumElements(); |
1604 | SmallVector<Constant*, 8> Indices; |
1605 | |
1606 | // Create a vector of consecutive numbers from zero to VF. |
1607 | for (int i = 0; i < VLen; ++i) { |
1608 | int64_t Idx = Negate ? (-i) : i; |
1609 | Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate)); |
1610 | } |
1611 | |
1612 | // Add the consecutive indices to the vector value. |
1613 | Constant *Cv = ConstantVector::get(Indices); |
1614 | assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1615 | return Builder.CreateAdd(Val, Cv, "induction"); |
1616 | } |
1617 | |
1618 | /// \brief Find the operand of the GEP that should be checked for consecutive |
1619 | /// stores. This ignores trailing indices that have no effect on the final |
1620 | /// pointer. |
1621 | static unsigned getGEPInductionOperand(const DataLayout *DL, |
1622 | const GetElementPtrInst *Gep) { |
1623 | unsigned LastOperand = Gep->getNumOperands() - 1; |
1624 | unsigned GEPAllocSize = DL->getTypeAllocSize( |
1625 | cast<PointerType>(Gep->getType()->getScalarType())->getElementType()); |
1626 | |
1627 | // Walk backwards and try to peel off zeros. |
1628 | while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) { |
1629 | // Find the type we're currently indexing into. |
1630 | gep_type_iterator GEPTI = gep_type_begin(Gep); |
1631 | std::advance(GEPTI, LastOperand - 1); |
1632 | |
1633 | // If it's a type with the same allocation size as the result of the GEP we |
1634 | // can peel off the zero index. |
1635 | if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize) |
1636 | break; |
1637 | --LastOperand; |
1638 | } |
1639 | |
1640 | return LastOperand; |
1641 | } |
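// Hedged example: for a GEP such as
//   %p = getelementptr inbounds [1 x i32]* %A, i64 %iv, i64 0
// the trailing zero index does not move the final pointer (the indexed type
// has the same allocation size as the GEP's result element type), so the
// walk above peels it off and returns the operand position holding %iv.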
1642 | |
1643 | int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { |
1644 | assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
1645 | // Make sure that the pointer does not point to structs. |
1646 | if (Ptr->getType()->getPointerElementType()->isAggregateType()) |
1647 | return 0; |
1648 | |
1649 | // If this value is a pointer induction variable we know it is consecutive. |
1650 | PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); |
1651 | if (Phi && Inductions.count(Phi)) { |
1652 | InductionInfo II = Inductions[Phi]; |
1653 | if (IK_PtrInduction == II.IK) |
1654 | return 1; |
1655 | else if (IK_ReversePtrInduction == II.IK) |
1656 | return -1; |
1657 | } |
1658 | |
1659 | GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); |
1660 | if (!Gep) |
1661 | return 0; |
1662 | |
1663 | unsigned NumOperands = Gep->getNumOperands(); |
1664 | Value *GpPtr = Gep->getPointerOperand(); |
1665 | // If this GEP value is a consecutive pointer induction variable and all of |
1666 | // the indices are constant then we know it is consecutive.
1667 | Phi = dyn_cast<PHINode>(GpPtr); |
1668 | if (Phi && Inductions.count(Phi)) { |
1669 | |
1670 | // Make sure that the pointer does not point to structs. |
1671 | PointerType *GepPtrType = cast<PointerType>(GpPtr->getType()); |
1672 | if (GepPtrType->getElementType()->isAggregateType()) |
1673 | return 0; |
1674 | |
1675 | // Make sure that all of the index operands are loop invariant. |
1676 | for (unsigned i = 1; i < NumOperands; ++i) |
1677 | if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) |
1678 | return 0; |
1679 | |
1680 | InductionInfo II = Inductions[Phi]; |
1681 | if (IK_PtrInduction == II.IK) |
1682 | return 1; |
1683 | else if (IK_ReversePtrInduction == II.IK) |
1684 | return -1; |
1685 | } |
1686 | |
1687 | unsigned InductionOperand = getGEPInductionOperand(DL, Gep); |
1688 | |
1689 | // Check that all of the gep indices are uniform except for our induction |
1690 | // operand. |
1691 | for (unsigned i = 0; i != NumOperands; ++i) |
1692 | if (i != InductionOperand && |
1693 | !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) |
1694 | return 0; |
1695 | |
1696 | // We can emit wide load/stores only if the last non-zero index is the |
1697 | // induction variable. |
1698 | const SCEV *Last = nullptr; |
1699 | if (!Strides.count(Gep)) |
1700 | Last = SE->getSCEV(Gep->getOperand(InductionOperand)); |
1701 | else { |
1702 | // Because of the multiplication by a stride we can have a s/zext cast. |
1703 | // We are going to replace this stride by 1 so the cast is safe to ignore. |
1704 | // |
1705 | // %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] |
1706 | // %0 = trunc i64 %indvars.iv to i32 |
1707 | // %mul = mul i32 %0, %Stride1 |
1708 | // %idxprom = zext i32 %mul to i64 << Safe cast. |
1709 | // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom |
1710 | // |
1711 | Last = replaceSymbolicStrideSCEV(SE, Strides, |
1712 | Gep->getOperand(InductionOperand), Gep); |
1713 | if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last)) |
1714 | Last = |
1715 | (C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend) |
1716 | ? C->getOperand() |
1717 | : Last; |
1718 | } |
1719 | if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { |
1720 | const SCEV *Step = AR->getStepRecurrence(*SE); |
1721 | |
1722 | // The memory is consecutive because the last index is consecutive |
1723 | // and all other indices are loop invariant. |
1724 | if (Step->isOne()) |
1725 | return 1; |
1726 | if (Step->isAllOnesValue()) |
1727 | return -1; |
1728 | } |
1729 | |
1730 | return 0; |
1731 | } |
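// Hedged summary by example, with i the loop induction variable:
//   A[i]   -> 1  (consecutive)
//   A[N-i] -> -1 (reverse consecutive)
//   A[2*i] -> 0  (strided; only handled when the symbolic-stride machinery
//                 can later prove the stride to be one at runtime)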
1732 | |
1733 | bool LoopVectorizationLegality::isUniform(Value *V) { |
1734 | return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); |
1735 | } |
1736 | |
1737 | InnerLoopVectorizer::VectorParts& |
1738 | InnerLoopVectorizer::getVectorValue(Value *V) { |
1739 | assert(V != Induction && "The new induction variable should not be used.");
1740 | assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1741 | |
1742 | // If we have a stride that is replaced by one, do it here. |
1743 | if (Legal->hasStride(V)) |
1744 | V = ConstantInt::get(V->getType(), 1); |
1745 | |
1746 | // If we have this scalar in the map, return it. |
1747 | if (WidenMap.has(V)) |
1748 | return WidenMap.get(V); |
1749 | |
1750 | // If this scalar is unknown, assume that it is a constant or that it is |
1751 | // loop invariant. Broadcast V and save the value for future uses. |
1752 | Value *B = getBroadcastInstrs(V); |
1753 | return WidenMap.splat(V, B); |
1754 | } |
1755 | |
1756 | Value *InnerLoopVectorizer::reverseVector(Value *Vec) { |
1757 | assert(Vec->getType()->isVectorTy() && "Invalid type");
1758 | SmallVector<Constant*, 8> ShuffleMask; |
1759 | for (unsigned i = 0; i < VF; ++i) |
1760 | ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); |
1761 | |
1762 | return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), |
1763 | ConstantVector::get(ShuffleMask), |
1764 | "reverse"); |
1765 | } |
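// Example of the mask built above, assuming VF == 4:
//   shufflevector <4 x i32> %vec, <4 x i32> undef,
//                 <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// which yields the input vector with its lanes in reverse order.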
1766 | |
1767 | void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { |
1768 | // Attempt to issue a wide load. |
1769 | LoadInst *LI = dyn_cast<LoadInst>(Instr); |
1770 | StoreInst *SI = dyn_cast<StoreInst>(Instr); |
1771 | |
1772 | assert((LI || SI) && "Invalid Load/Store instruction");
1773 | |
1774 | Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); |
1775 | Type *DataTy = VectorType::get(ScalarDataTy, VF); |
1776 | Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); |
1777 | unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); |
1778 | // An alignment of 0 means target abi alignment. We need to use the scalar's |
1779 | // target abi alignment in such a case. |
1780 | if (!Alignment) |
1781 | Alignment = DL->getABITypeAlignment(ScalarDataTy); |
1782 | unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); |
1783 | unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy); |
1784 | unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF; |
1785 | |
1786 | if (SI && Legal->blockNeedsPredication(SI->getParent()) && |
1787 | !Legal->isMaskRequired(SI)) |
1788 | return scalarizeInstruction(Instr, true); |
1789 | |
1790 | if (ScalarAllocatedSize != VectorElementSize) |
1791 | return scalarizeInstruction(Instr); |
1792 | |
1793 | // If the pointer is loop invariant or if it is non-consecutive, |
1794 | // scalarize the load. |
1795 | int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); |
1796 | bool Reverse = ConsecutiveStride < 0; |
1797 | bool UniformLoad = LI && Legal->isUniform(Ptr); |
1798 | if (!ConsecutiveStride || UniformLoad) |
1799 | return scalarizeInstruction(Instr); |
1800 | |
1801 | Constant *Zero = Builder.getInt32(0); |
1802 | VectorParts &Entry = WidenMap.get(Instr); |
1803 | |
1804 | // Handle consecutive loads/stores. |
1805 | GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); |
1806 | if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { |
1807 | setDebugLocFromInst(Builder, Gep); |
1808 | Value *PtrOperand = Gep->getPointerOperand(); |
1809 | Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; |
1810 | FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); |
1811 | |
1812 | // Create the new GEP with the new induction variable. |
1813 | GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); |
1814 | Gep2->setOperand(0, FirstBasePtr); |
1815 | Gep2->setName("gep.indvar.base"); |
1816 | Ptr = Builder.Insert(Gep2); |
1817 | } else if (Gep) { |
1818 | setDebugLocFromInst(Builder, Gep); |
1819 | assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
1820 | OrigLoop) && "Base ptr must be invariant");
1821 | |
1822 | // The last index does not have to be the induction. It can be |
1823 | // consecutive and be a function of the index. For example A[I+1]; |
1824 | unsigned NumOperands = Gep->getNumOperands(); |
1825 | unsigned InductionOperand = getGEPInductionOperand(DL, Gep); |
1826 | // Create the new GEP with the new induction variable. |
1827 | GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); |
1828 | |
1829 | for (unsigned i = 0; i < NumOperands; ++i) { |
1830 | Value *GepOperand = Gep->getOperand(i); |
1831 | Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand); |
1832 | |
1833 | // Update last index or loop invariant instruction anchored in loop. |
1834 | if (i == InductionOperand || |
1835 | (GepOperandInst && OrigLoop->contains(GepOperandInst))) { |
1836 | assert((i == InductionOperand ||
1837 | SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&
1838 | "Must be last index or loop invariant");
1839 | |
1840 | VectorParts &GEPParts = getVectorValue(GepOperand); |
1841 | Value *Index = GEPParts[0]; |
1842 | Index = Builder.CreateExtractElement(Index, Zero); |
1843 | Gep2->setOperand(i, Index); |
1844 | Gep2->setName("gep.indvar.idx"); |
1845 | } |
1846 | } |
1847 | Ptr = Builder.Insert(Gep2); |
1848 | } else { |
1849 | // Use the induction element ptr. |
1850 | assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
1851 | setDebugLocFromInst(Builder, Ptr); |
1852 | VectorParts &PtrVal = getVectorValue(Ptr); |
1853 | Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); |
1854 | } |
1855 | |
1856 | // Handle Stores: |
1857 | if (SI) { |
1858 | assert(!Legal->isUniform(SI->getPointerOperand()) &&
1859 | "We do not allow storing to uniform addresses");
1860 | setDebugLocFromInst(Builder, SI); |
1861 | // We don't want to update the value in the map as it might be used in |
1862 | // another expression. So don't use a reference type for "StoredVal". |
1863 | VectorParts StoredVal = getVectorValue(SI->getValueOperand()); |
1864 | |
1865 | for (unsigned Part = 0; Part < UF; ++Part) { |
1866 | // Calculate the pointer for the specific unroll-part. |
1867 | Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); |
1868 | |
1869 | if (Reverse) { |
1870 | // If we store to reverse consecutive memory locations then we need |
1871 | // to reverse the order of elements in the stored value. |
1872 | StoredVal[Part] = reverseVector(StoredVal[Part]); |
1873 | // If the address is consecutive but reversed, then the |
1874 | // wide store needs to start at the last vector element. |
1875 | PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); |
1876 | PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); |
1877 | } |
1878 | |
1879 | Value *VecPtr = Builder.CreateBitCast(PartPtr, |
1880 | DataTy->getPointerTo(AddressSpace)); |
1881 | |
1882 | Instruction *NewSI; |
1883 | if (Legal->isMaskRequired(SI)) { |
1884 | Type *I8PtrTy = |
1885 | Builder.getInt8PtrTy(PartPtr->getType()->getPointerAddressSpace()); |
1886 | |
1887 | Value *I8Ptr = Builder.CreateBitCast(PartPtr, I8PtrTy); |
1888 | |
1889 | VectorParts Cond = createBlockInMask(SI->getParent()); |
1890 | SmallVector <Value *, 8> Ops; |
1891 | Ops.push_back(I8Ptr); |
1892 | Ops.push_back(StoredVal[Part]); |
1893 | Ops.push_back(Builder.getInt32(Alignment)); |
1894 | Ops.push_back(Cond[Part]); |
1895 | NewSI = Builder.CreateMaskedStore(Ops); |
1896 | } |
1897 | else |
1898 | NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment); |
1899 | propagateMetadata(NewSI, SI); |
1900 | } |
1901 | return; |
1902 | } |
1903 | |
1904 | // Handle loads. |
1905 | assert(LI && "Must have a load instruction");
1906 | setDebugLocFromInst(Builder, LI); |
1907 | for (unsigned Part = 0; Part < UF; ++Part) { |
1908 | // Calculate the pointer for the specific unroll-part. |
1909 | Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); |
1910 | |
1911 | if (Reverse) { |
1912 | // If the address is consecutive but reversed, then the |
1913 | // wide load needs to start at the last vector element. |
1914 | PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); |
1915 | PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); |
1916 | } |
1917 | |
1918 | Instruction* NewLI; |
1919 | if (Legal->isMaskRequired(LI)) { |
1920 | Type *I8PtrTy = |
1921 | Builder.getInt8PtrTy(PartPtr->getType()->getPointerAddressSpace()); |
1922 | |
1923 | Value *I8Ptr = Builder.CreateBitCast(PartPtr, I8PtrTy); |
1924 | |
1925 | VectorParts SrcMask = createBlockInMask(LI->getParent()); |
1926 | SmallVector <Value *, 8> Ops; |
1927 | Ops.push_back(I8Ptr); |
1928 | Ops.push_back(UndefValue::get(DataTy)); |
1929 | Ops.push_back(Builder.getInt32(Alignment)); |
1930 | Ops.push_back(SrcMask[Part]); |
1931 | NewLI = Builder.CreateMaskedLoad(Ops); |
1932 | } |
1933 | else { |
1934 | Value *VecPtr = Builder.CreateBitCast(PartPtr, |
1935 | DataTy->getPointerTo(AddressSpace)); |
1936 | NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load"); |
1937 | } |
1938 | propagateMetadata(NewLI, LI); |
1939 | Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI; |
1940 | } |
1941 | } |
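// Worked example of the reverse-access arithmetic above, assuming VF == 4:
// for Part == 0 the two GEPs apply offsets 0 and 1 - VF == -3, so the wide
// access covers Ptr[-3..0]; for Part == 1 the offsets are -4 and -3, covering
// Ptr[-7..-4]. reverseVector() then restores the original lane order.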
1942 | |
1943 | void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) { |
1944 | assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
1945 | // Holds vector parameters or scalars, in case of uniform vals. |
1946 | SmallVector<VectorParts, 4> Params; |
1947 | |
1948 | setDebugLocFromInst(Builder, Instr); |
1949 | |
1950 | // Find all of the vectorized parameters. |
1951 | for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { |
1952 | Value *SrcOp = Instr->getOperand(op); |
1953 | |
1954 | // If we are accessing the old induction variable, use the new one. |
1955 | if (SrcOp == OldInduction) { |
1956 | Params.push_back(getVectorValue(SrcOp)); |
1957 | continue; |
1958 | } |
1959 | |
1960 | // Try using previously calculated values. |
1961 | Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); |
1962 | |
1963 | // If the src is an instruction that appeared earlier in the basic block |
1964 | // then it should already be vectorized. |
1965 | if (SrcInst && OrigLoop->contains(SrcInst)) { |
1966 | assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
1967 | // The parameter is a vector value from earlier. |
1968 | Params.push_back(WidenMap.get(SrcInst)); |
1969 | } else { |
1970 | // The parameter is a scalar from outside the loop. Maybe even a constant. |
1971 | VectorParts Scalars; |
1972 | Scalars.append(UF, SrcOp); |
1973 | Params.push_back(Scalars); |
1974 | } |
1975 | } |
1976 | |
1977 | assert(Params.size() == Instr->getNumOperands() &&
1978 | "Invalid number of operands");
1979 | |
1980 | // Does this instruction return a value ? |
1981 | bool IsVoidRetTy = Instr->getType()->isVoidTy(); |
1982 | |
1983 | Value *UndefVec = IsVoidRetTy ? nullptr : |
1984 | UndefValue::get(VectorType::get(Instr->getType(), VF)); |
1985 | // Create a new entry in the WidenMap and initialize it to Undef or Null. |
1986 | VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); |
1987 | |
1988 | Instruction *InsertPt = Builder.GetInsertPoint(); |
1989 | BasicBlock *IfBlock = Builder.GetInsertBlock(); |
1990 | BasicBlock *CondBlock = nullptr; |
1991 | |
1992 | VectorParts Cond; |
1993 | Loop *VectorLp = nullptr; |
1994 | if (IfPredicateStore) { |
1995 | assert(Instr->getParent()->getSinglePredecessor() &&
1996 | "Only support single predecessor blocks");
1997 | Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), |
1998 | Instr->getParent()); |
1999 | VectorLp = LI->getLoopFor(IfBlock); |
2000 | assert(VectorLp && "Must have a loop for this block");
2001 | } |
2002 | |
2003 | // For each vector unroll 'part': |
2004 | for (unsigned Part = 0; Part < UF; ++Part) { |
2005 | // For each scalar that we create: |
2006 | for (unsigned Width = 0; Width < VF; ++Width) { |
2007 | |
2008 | // Start if-block. |
2009 | Value *Cmp = nullptr; |
2010 | if (IfPredicateStore) { |
2011 | Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width)); |
2012 | Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); |
2013 | CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); |
2014 | LoopVectorBody.push_back(CondBlock); |
2015 | VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase()); |
2016 | // Update Builder with newly created basic block. |
2017 | Builder.SetInsertPoint(InsertPt); |
2018 | } |
2019 | |
2020 | Instruction *Cloned = Instr->clone(); |
2021 | if (!IsVoidRetTy) |
2022 | Cloned->setName(Instr->getName() + ".cloned"); |
2023 | // Replace the operands of the cloned instructions with extracted scalars. |
2024 | for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { |
2025 | Value *Op = Params[op][Part]; |
2026 | // Param is a vector. Need to extract the right lane. |
2027 | if (Op->getType()->isVectorTy()) |
2028 | Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width)); |
2029 | Cloned->setOperand(op, Op); |
2030 | } |
2031 | |
2032 | // Place the cloned scalar in the new loop. |
2033 | Builder.Insert(Cloned); |
2034 | |
2035 | // If the original scalar returns a value we need to place it in a vector |
2036 | // so that future users will be able to use it. |
2037 | if (!IsVoidRetTy) |
2038 | VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, |
2039 | Builder.getInt32(Width)); |
2040 | // End if-block. |
2041 | if (IfPredicateStore) { |
2042 | BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); |
2043 | LoopVectorBody.push_back(NewIfBlock); |
2044 | VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase()); |
2045 | Builder.SetInsertPoint(InsertPt); |
2046 | Instruction *OldBr = IfBlock->getTerminator(); |
2047 | BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); |
2048 | OldBr->eraseFromParent(); |
2049 | IfBlock = NewIfBlock; |
2050 | } |
2051 | } |
2052 | } |
2053 | } |
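// Sketch of the per-lane expansion above for a non-void Instr, VF == 2 and
// UF == 1 (all names illustrative):
//   %op0   = extractelement <2 x i32> %param.vec, i32 0
//   %c0    = udiv i32 %op0, %op1.lane0          ; Instr->clone() for lane 0
//   %res.0 = insertelement <2 x i32> undef, i32 %c0, i32 0
// and likewise for lane 1, with a cond.store/else diamond wrapped around
// each clone when IfPredicateStore is set.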
2054 | |
2055 | static Instruction *getFirstInst(Instruction *FirstInst, Value *V, |
2056 | Instruction *Loc) { |
2057 | if (FirstInst) |
2058 | return FirstInst; |
2059 | if (Instruction *I = dyn_cast<Instruction>(V)) |
2060 | return I->getParent() == Loc->getParent() ? I : nullptr; |
2061 | return nullptr; |
2062 | } |
2063 | |
2064 | std::pair<Instruction *, Instruction *> |
2065 | InnerLoopVectorizer::addStrideCheck(Instruction *Loc) { |
2066 | Instruction *tnullptr = nullptr; |
2067 | if (!Legal->mustCheckStrides()) |
2068 | return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr); |
2069 | |
2070 | IRBuilder<> ChkBuilder(Loc); |
2071 | |
2072 | // Emit checks. |
2073 | Value *Check = nullptr; |
2074 | Instruction *FirstInst = nullptr; |
2075 | for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(), |
2076 | SE = Legal->strides_end(); |
2077 | SI != SE; ++SI) { |
2078 | Value *Ptr = stripIntegerCast(*SI); |
2079 | Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1), |
2080 | "stride.chk"); |
2081 | // Store the first instruction we create. |
2082 | FirstInst = getFirstInst(FirstInst, C, Loc); |
2083 | if (Check) |
2084 | Check = ChkBuilder.CreateOr(Check, C); |
2085 | else |
2086 | Check = C; |
2087 | } |
2088 | |
2089 | // We have to do this trickery because the IRBuilder might fold the check to a |
2090 | // constant expression, in which case there is no Instruction anchored in
2091 | // the block. |
2092 | LLVMContext &Ctx = Loc->getContext(); |
2093 | Instruction *TheCheck = |
2094 | BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx)); |
2095 | ChkBuilder.Insert(TheCheck, "stride.not.one"); |
2096 | FirstInst = getFirstInst(FirstInst, TheCheck, Loc); |
2097 | |
2098 | return std::make_pair(FirstInst, TheCheck); |
2099 | } |
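// Illustrative IR for one symbolic stride (value name assumed):
//   %stride.chk     = icmp ne i64 %Stride, 1
//   %stride.not.one = and i1 %stride.chk, true
// Several strides are OR-ed together first; the trailing AND with true exists
// only to guarantee an Instruction is anchored in the block.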
2100 | |
2101 | std::pair<Instruction *, Instruction *> |
2102 | InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) { |
2103 | LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = |
2104 | Legal->getRuntimePointerCheck(); |
2105 | |
2106 | Instruction *tnullptr = nullptr; |
2107 | if (!PtrRtCheck->Need) |
2108 | return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr); |
2109 | |
2110 | unsigned NumPointers = PtrRtCheck->Pointers.size(); |
2111 | SmallVector<TrackingVH<Value> , 2> Starts; |
2112 | SmallVector<TrackingVH<Value> , 2> Ends; |
2113 | |
2114 | LLVMContext &Ctx = Loc->getContext(); |
2115 | SCEVExpander Exp(*SE, "induction"); |
2116 | Instruction *FirstInst = nullptr; |
2117 | |
2118 | for (unsigned i = 0; i < NumPointers; ++i) { |
2119 | Value *Ptr = PtrRtCheck->Pointers[i]; |
2120 | const SCEV *Sc = SE->getSCEV(Ptr); |
2121 | |
2122 | if (SE->isLoopInvariant(Sc, OrigLoop)) { |
2123 | DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
2124 | *Ptr << "\n");
2125 | Starts.push_back(Ptr); |
2126 | Ends.push_back(Ptr); |
2127 | } else { |
2128 | DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n');
2129 | unsigned AS = Ptr->getType()->getPointerAddressSpace(); |
2130 | |
2131 | // Use this type for pointer arithmetic. |
2132 | Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS); |
2133 | |
2134 | Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc); |
2135 | Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); |
2136 | Starts.push_back(Start); |
2137 | Ends.push_back(End); |
2138 | } |
2139 | } |
2140 | |
2141 | IRBuilder<> ChkBuilder(Loc); |
2142 | // Our instructions might fold to a constant. |
2143 | Value *MemoryRuntimeCheck = nullptr; |
2144 | for (unsigned i = 0; i < NumPointers; ++i) { |
2145 | for (unsigned j = i+1; j < NumPointers; ++j) { |
2146 | // No need to check if two readonly pointers intersect. |
2147 | if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j]) |
2148 | continue; |
2149 | |
2150 | // Only need to check pointers between two different dependency sets. |
2151 | if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j]) |
2152 | continue; |
2153 | // Only need to check pointers in the same alias set. |
2154 | if (PtrRtCheck->AliasSetId[i] != PtrRtCheck->AliasSetId[j]) |
2155 | continue; |
2156 | |
2157 | unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace(); |
2158 | unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace(); |
2159 | |
2160 | assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) &&
2161 | (AS1 == Ends[i]->getType()->getPointerAddressSpace()) &&
2162 | "Trying to bounds check pointers with different address spaces");
2163 | |
2164 | Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0); |
2165 | Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1); |
2166 | |
2167 | Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc"); |
2168 | Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc"); |
2169 | Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc"); |
2170 | Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc"); |
2171 | |
2172 | Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); |
2173 | FirstInst = getFirstInst(FirstInst, Cmp0, Loc); |
2174 | Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); |
2175 | FirstInst = getFirstInst(FirstInst, Cmp1, Loc); |
2176 | Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); |
2177 | FirstInst = getFirstInst(FirstInst, IsConflict, Loc); |
2178 | if (MemoryRuntimeCheck) { |
2179 | IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, |
2180 | "conflict.rdx"); |
2181 | FirstInst = getFirstInst(FirstInst, IsConflict, Loc); |
2182 | } |
2183 | MemoryRuntimeCheck = IsConflict; |
2184 | } |
2185 | } |
2186 | |
2187 | // We have to do this trickery because the IRBuilder might fold the check to a
2188 | // constant expression in which case there is no Instruction anchored in
2189 | // the block.
2190 | Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck, |
2191 | ConstantInt::getTrue(Ctx)); |
2192 | ChkBuilder.Insert(Check, "memcheck.conflict"); |
2193 | FirstInst = getFirstInst(FirstInst, Check, Loc); |
2194 | return std::make_pair(FirstInst, Check); |
2195 | } |
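
// For illustration, a minimal scalar model of the pairwise overlap test
// emitted above, assuming two byte ranges bounded by the expanded Start/End
// SCEVs (the helper name is hypothetical, not part of this file):
//
//   static bool rangesMayConflict(const char *Start0, const char *End0,
//                                 const char *Start1, const char *End1) {
//     // Mirrors the two ULE compares: the ranges overlap iff each start
//     // is not above the other range's end.
//     return Start0 <= End1 && Start1 <= End0;
//   }
//
// The per-pair results are OR'ed into MemoryRuntimeCheck, so any single
// conflicting pair sends execution to the scalar loop.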
2196 | |
2197 | void InnerLoopVectorizer::createEmptyLoop() { |
2198 | /* |
2199 | In this function we generate a new loop. The new loop will contain |
2200 | the vectorized instructions while the old loop will continue to run the |
2201 | scalar remainder. |
2202 | |
2203 | [ ] <-- Back-edge taken count overflow check. |
2204 | / | |
2205 | / v |
2206 | | [ ] <-- vector loop bypass (may consist of multiple blocks). |
2207 | | / | |
2208 | | / v |
2209 | || [ ] <-- vector pre header. |
2210 | || | |
2211 | || v |
2212 | || [ ] \ |
2213 | || [ ]_| <-- vector loop. |
2214 | || | |
2215 | | \ v |
2216 | | >[ ] <--- middle-block. |
2217 | | / | |
2218 | | / v |
2219 | -|- >[ ] <--- new preheader. |
2220 | | | |
2221 | | v |
2222 | | [ ] \ |
2223 | | [ ]_| <-- old scalar loop to handle remainder. |
2224 | \ | |
2225 | \ v |
2226 | >[ ] <-- exit block. |
2227 | ... |
2228 | */ |
2229 | |
2230 | BasicBlock *OldBasicBlock = OrigLoop->getHeader(); |
2231 | BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); |
2232 | BasicBlock *ExitBlock = OrigLoop->getExitBlock(); |
2233 | assert(BypassBlock && "Invalid loop structure");
2234 | assert(ExitBlock && "Must have an exit block");
2235 | |
2236 | // Some loops have a single integer induction variable, while other loops
2237 | // don't. One example is C++ iterator loops, which often have multiple
2238 | // pointer induction variables. The code below also supports the case
2239 | // where we don't have a single induction variable.
2240 | OldInduction = Legal->getInduction(); |
2241 | Type *IdxTy = Legal->getWidestInductionType(); |
2242 | |
2243 | // Find the loop boundaries. |
2244 | const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop); |
2245 | assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
2246 | |
2247 | // The exit count might have type i64 while the phi is i32. This can
2248 | // happen if we have an induction variable that is sign-extended before the
2249 | // compare. The only way we can get a backedge-taken count in that case is
2250 | // if the induction variable was signed, and a signed induction variable
2251 | // will not overflow. Truncation is therefore legal.
2252 | if (ExitCount->getType()->getPrimitiveSizeInBits() > |
2253 | IdxTy->getPrimitiveSizeInBits()) |
2254 | ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy); |
2255 | |
2256 | const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); |
2257 | // Get the total trip count from the count by adding 1. |
2258 | ExitCount = SE->getAddExpr(BackedgeTakeCount, |
2259 | SE->getConstant(BackedgeTakeCount->getType(), 1)); |
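
// A worked example, assuming a simple counted loop (illustrative only):
//
//   for (int i = 0; i < n; ++i)   // backedge-taken count == n - 1
//     ...
//
// ExitCount then becomes (n - 1) + 1 == n, the total trip count.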
2260 | |
2261 | // Expand the trip count and place the new instructions in the preheader. |
2262 | // Notice that the pre-header does not change, only the loop body. |
2263 | SCEVExpander Exp(*SE, "induction"); |
2264 | |
2265 | // We need to test whether the backedge-taken count is uint##_max. Adding one |
2266 | // to it will cause overflow and an incorrect loop trip count in the vector |
2267 | // body. In case of overflow we want to directly jump to the scalar remainder |
2268 | // loop. |
2269 | Value *BackedgeCount = |
2270 | Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(), |
2271 | BypassBlock->getTerminator()); |
2272 | if (BackedgeCount->getType()->isPointerTy()) |
2273 | BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy, |
2274 | "backedge.ptrcnt.to.int", |
2275 | BypassBlock->getTerminator()); |
2276 | Instruction *CheckBCOverflow = |
2277 | CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount, |
2278 | Constant::getAllOnesValue(BackedgeCount->getType()), |
2279 | "backedge.overflow", BypassBlock->getTerminator()); |
2280 | |
2281 | // The loop index does not have to start at zero. Find the original start
2282 | // value from the induction PHI node. If we don't have an induction variable |
2283 | // then we know that it starts at zero. |
2284 | Builder.SetInsertPoint(BypassBlock->getTerminator()); |
2285 | Value *StartIdx = ExtendedIdx = OldInduction ? |
2286 | Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock), |
2287 | IdxTy): |
2288 | ConstantInt::get(IdxTy, 0); |
2289 | |
2290 | // We need an instruction to anchor the overflow check on. StartIdx needs to
2291 | // be defined before the overflow check branch: the scalar preheader is going
2292 | // to merge the start index, so the overflow branch block needs to contain a
2293 | // definition of the start index.
2294 | Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd( |
2295 | StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor", |
2296 | BypassBlock->getTerminator()); |
2297 | |
2298 | // Count holds the overall loop count (N). |
2299 | Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), |
2300 | BypassBlock->getTerminator()); |
2301 | |
2302 | LoopBypassBlocks.push_back(BypassBlock); |
2303 | |
2304 | // Split the single block loop into the two loop structure described above. |
2305 | BasicBlock *VectorPH = |
2306 | BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); |
2307 | BasicBlock *VecBody = |
2308 | VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); |
2309 | BasicBlock *MiddleBlock = |
2310 | VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); |
2311 | BasicBlock *ScalarPH = |
2312 | MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); |
2313 | |
2314 | // Create and register the new vector loop. |
2315 | Loop* Lp = new Loop(); |
2316 | Loop *ParentLoop = OrigLoop->getParentLoop(); |
2317 | |
2318 | // Insert the new loop into the loop nest and register the new basic blocks |
2319 | // before calling any utilities such as SCEV that require valid LoopInfo. |
2320 | if (ParentLoop) { |
2321 | ParentLoop->addChildLoop(Lp); |
2322 | ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); |
2323 | ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); |
2324 | ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); |
2325 | } else { |
2326 | LI->addTopLevelLoop(Lp); |
2327 | } |
2328 | Lp->addBasicBlockToLoop(VecBody, LI->getBase()); |
2329 | |
2330 | // Use this IR builder to create the loop instructions (Phi, Br, Cmp) |
2331 | // inside the loop. |
2332 | Builder.SetInsertPoint(VecBody->getFirstNonPHI()); |
2333 | |
2334 | // Generate the induction variable. |
2335 | setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); |
2336 | Induction = Builder.CreatePHI(IdxTy, 2, "index"); |
2337 | // The loop step is equal to the vectorization factor (num of SIMD elements) |
2338 | // times the unroll factor (num of SIMD instructions). |
2339 | Constant *Step = ConstantInt::get(IdxTy, VF * UF); |
2340 | |
2341 | // This is the IR builder that we use to add all of the logic for bypassing |
2342 | // the new vector loop. |
2343 | IRBuilder<> BypassBuilder(BypassBlock->getTerminator()); |
2344 | setDebugLocFromInst(BypassBuilder, |
2345 | getDebugLocFromInstOrOperands(OldInduction)); |
2346 | |
2347 | // We may need to extend the index in case there is a type mismatch. |
2348 | // We know that the count starts at zero and does not overflow. |
2349 | if (Count->getType() != IdxTy) { |
2350 | // The exit count can be of pointer type. Convert it to the correct |
2351 | // integer type. |
2352 | if (ExitCount->getType()->isPointerTy()) |
2353 | Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int"); |
2354 | else |
2355 | Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast"); |
2356 | } |
2357 | |
2358 | // Add the start index to the loop count to get the new end index. |
2359 | Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx"); |
2360 | |
2361 | // Now we need to generate the expression for N - (N % VF), which is |
2362 | // the part that the vectorized body will execute. |
2363 | Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf"); |
2364 | Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec"); |
2365 | Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx, |
2366 | "end.idx.rnd.down"); |
2367 | |
2368 | // Now, compare the new count to zero. If it is zero skip the vector loop and |
2369 | // jump to the scalar loop. |
2370 | Value *Cmp = |
2371 | BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero"); |
2372 | |
2373 | BasicBlock *LastBypassBlock = BypassBlock; |
2374 | |
2375 | // Generate code to check that the loop's trip count (computed by adding one
2376 | // to the backedge-taken count) will not overflow.
2377 | { |
2378 | auto PastOverflowCheck = |
2379 | std::next(BasicBlock::iterator(OverflowCheckAnchor)); |
2380 | BasicBlock *CheckBlock = |
2381 | LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked"); |
2382 | if (ParentLoop) |
2383 | ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); |
2384 | LoopBypassBlocks.push_back(CheckBlock); |
2385 | Instruction *OldTerm = LastBypassBlock->getTerminator(); |
2386 | BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm); |
2387 | OldTerm->eraseFromParent(); |
2388 | LastBypassBlock = CheckBlock; |
2389 | } |
2390 | |
2391 | // Generate the code to check that the strides we assumed to be one are really |
2392 | // one. We want the new basic block to start at the first instruction in a |
2393 | // sequence of instructions that form a check. |
2394 | Instruction *StrideCheck; |
2395 | Instruction *FirstCheckInst; |
2396 | std::tie(FirstCheckInst, StrideCheck) = |
2397 | addStrideCheck(LastBypassBlock->getTerminator()); |
2398 | if (StrideCheck) { |
2399 | // Create a new block containing the stride check. |
2400 | BasicBlock *CheckBlock = |
2401 | LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck"); |
2402 | if (ParentLoop) |
2403 | ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); |
2404 | LoopBypassBlocks.push_back(CheckBlock); |
2405 | |
2406 | // Replace the branch into the stride check block with a conditional branch
2407 | // for the "few elements case".
2408 | Instruction *OldTerm = LastBypassBlock->getTerminator(); |
2409 | BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm); |
2410 | OldTerm->eraseFromParent(); |
2411 | |
2412 | Cmp = StrideCheck; |
2413 | LastBypassBlock = CheckBlock; |
2414 | } |
2415 | |
2416 | // Generate the code that checks in runtime if arrays overlap. We put the |
2417 | // checks into a separate block to make the more common case of few elements |
2418 | // faster. |
2419 | Instruction *MemRuntimeCheck; |
2420 | std::tie(FirstCheckInst, MemRuntimeCheck) = |
2421 | addRuntimeCheck(LastBypassBlock->getTerminator()); |
2422 | if (MemRuntimeCheck) { |
2423 | // Create a new block containing the memory check. |
2424 | BasicBlock *CheckBlock = |
2425 | LastBypassBlock->splitBasicBlock(MemRuntimeCheck, "vector.memcheck"); |
2426 | if (ParentLoop) |
2427 | ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); |
2428 | LoopBypassBlocks.push_back(CheckBlock); |
2429 | |
2430 | // Replace the branch into the memory check block with a conditional branch |
2431 | // for the "few elements case". |
2432 | Instruction *OldTerm = LastBypassBlock->getTerminator(); |
2433 | BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm); |
2434 | OldTerm->eraseFromParent(); |
2435 | |
2436 | Cmp = MemRuntimeCheck; |
2437 | LastBypassBlock = CheckBlock; |
2438 | } |
2439 | |
2440 | LastBypassBlock->getTerminator()->eraseFromParent(); |
2441 | BranchInst::Create(MiddleBlock, VectorPH, Cmp, |
2442 | LastBypassBlock); |
2443 | |
2444 | // We are going to resume the execution of the scalar loop. |
2445 | // Go over all of the induction variables that we found and fix the |
2446 | // PHIs that are left in the scalar version of the loop. |
2447 | // The starting values of PHI nodes depend on the counter of the last |
2448 | // iteration in the vectorized loop. |
2449 | // If we come from a bypass edge then we need to start from the original |
2450 | // start value. |
2451 | |
2452 | // This variable saves the new starting index for the scalar loop. |
2453 | PHINode *ResumeIndex = nullptr; |
2454 | LoopVectorizationLegality::InductionList::iterator I, E; |
2455 | LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); |
2456 | // Set builder to point to last bypass block. |
2457 | BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator()); |
2458 | for (I = List->begin(), E = List->end(); I != E; ++I) { |
2459 | PHINode *OrigPhi = I->first; |
2460 | LoopVectorizationLegality::InductionInfo II = I->second; |
2461 | |
2462 | Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType(); |
2463 | PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val", |
2464 | MiddleBlock->getTerminator()); |
2465 | // We might have extended the type of the induction variable but we need a |
2466 | // truncated version for the scalar loop. |
2467 | PHINode *TruncResumeVal = (OrigPhi == OldInduction) ? |
2468 | PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val", |
2469 | MiddleBlock->getTerminator()) : nullptr; |
2470 | |
2471 | // Create phi nodes to merge from the backedge-taken check block. |
2472 | PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val", |
2473 | ScalarPH->getTerminator()); |
2474 | BCResumeVal->addIncoming(ResumeVal, MiddleBlock); |
2475 | |
2476 | PHINode *BCTruncResumeVal = nullptr; |
2477 | if (OrigPhi == OldInduction) { |
2478 | BCTruncResumeVal = |
2479 | PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val", |
2480 | ScalarPH->getTerminator()); |
2481 | BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock); |
2482 | } |
2483 | |
2484 | Value *EndValue = nullptr; |
2485 | switch (II.IK) { |
2486 | case LoopVectorizationLegality::IK_NoInduction: |
2487 | llvm_unreachable("Unknown induction");
2488 | case LoopVectorizationLegality::IK_IntInduction: { |
2489 | // Handle the integer induction counter. |
2490 | assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
2491 | |
2492 | // We have the canonical induction variable. |
2493 | if (OrigPhi == OldInduction) { |
2494 | // Create a truncated version of the resume value for the scalar loop, |
2495 | // we might have promoted the type to a larger width. |
2496 | EndValue = |
2497 | BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType()); |
2498 | // The new PHI merges the original incoming value, in case of a bypass, |
2499 | // or the value at the end of the vectorized loop. |
2500 | for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) |
2501 | TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); |
2502 | TruncResumeVal->addIncoming(EndValue, VecBody); |
2503 | |
2504 | BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); |
2505 | |
2506 | // We know what the end value is. |
2507 | EndValue = IdxEndRoundDown; |
2508 | // We also know which PHI node holds it. |
2509 | ResumeIndex = ResumeVal; |
2510 | break; |
2511 | } |
2512 | |
2513 | // Not the canonical induction variable - add the vector loop count to the |
2514 | // start value. |
2515 | Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, |
2516 | II.StartValue->getType(), |
2517 | "cast.crd"); |
2518 | EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end"); |
2519 | break; |
2520 | } |
2521 | case LoopVectorizationLegality::IK_ReverseIntInduction: { |
2522 | // Convert the CountRoundDown variable to the PHI size. |
2523 | Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, |
2524 | II.StartValue->getType(), |
2525 | "cast.crd"); |
2526 | // Handle reverse integer induction counter. |
2527 | EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end"); |
2528 | break; |
2529 | } |
2530 | case LoopVectorizationLegality::IK_PtrInduction: { |
2531 | // For pointer induction variables, calculate the offset using |
2532 | // the end index. |
2533 | EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown, |
2534 | "ptr.ind.end"); |
2535 | break; |
2536 | } |
2537 | case LoopVectorizationLegality::IK_ReversePtrInduction: { |
2538 | // The value at the end of the loop for the reverse pointer is calculated |
2539 | // by creating a GEP with a negative index starting from the start value. |
2540 | Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0); |
2541 | Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown, |
2542 | "rev.ind.end"); |
2543 | EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx, |
2544 | "rev.ptr.ind.end"); |
2545 | break; |
2546 | } |
2547 | }// end of case |
2548 | |
2549 | // The new PHI merges the original incoming value, in case of a bypass, |
2550 | // or the value at the end of the vectorized loop. |
2551 | for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) { |
2552 | if (OrigPhi == OldInduction) |
2553 | ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]); |
2554 | else |
2555 | ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); |
2556 | } |
2557 | ResumeVal->addIncoming(EndValue, VecBody); |
2558 | |
2559 | // Fix the scalar body counter (PHI node). |
2560 | unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); |
2561 | |
2562 | // The old induction's phi node in the scalar body needs the truncated |
2563 | // value. |
2564 | if (OrigPhi == OldInduction) { |
2565 | BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]); |
2566 | OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal); |
2567 | } else { |
2568 | BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); |
2569 | OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); |
2570 | } |
2571 | } |
2572 | |
2573 | // If we are generating a new induction variable then we also need to |
2574 | // generate the code that calculates the exit value. This value is not |
2575 | // simply the end of the counter because we may skip the vectorized body |
2576 | // in case of a runtime check. |
2577 | if (!OldInduction){ |
2578 | assert(!ResumeIndex && "Unexpected resume value found");
2579 | ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", |
2580 | MiddleBlock->getTerminator()); |
2581 | for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) |
2582 | ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]); |
2583 | ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); |
2584 | } |
2585 | |
2586 | // Make sure that we found the index where the scalar loop needs to continue.
2587 | assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
2588 | "Invalid resume Index");
2589 | |
2590 | // Add a check in the middle block to see if we have completed |
2591 | // all of the iterations in the first vector loop. |
2592 | // If (N - N%VF) == N, then we *don't* need to run the remainder. |
2593 | Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, |
2594 | ResumeIndex, "cmp.n", |
2595 | MiddleBlock->getTerminator()); |
2596 | |
2597 | BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); |
2598 | // Remove the old terminator. |
2599 | MiddleBlock->getTerminator()->eraseFromParent(); |
2600 | |
2601 | // Create i+1 and fill the PHINode. |
2602 | Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); |
2603 | Induction->addIncoming(StartIdx, VectorPH); |
2604 | Induction->addIncoming(NextIdx, VecBody); |
2605 | // Create the compare. |
2606 | Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); |
2607 | Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); |
2608 | |
2609 | // Now we have two terminators. Remove the old one from the block. |
2610 | VecBody->getTerminator()->eraseFromParent(); |
2611 | |
2612 | // Get ready to start creating new instructions into the vectorized body. |
2613 | Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); |
2614 | |
2615 | // Save the state. |
2616 | LoopVectorPreHeader = VectorPH; |
2617 | LoopScalarPreHeader = ScalarPH; |
2618 | LoopMiddleBlock = MiddleBlock; |
2619 | LoopExitBlock = ExitBlock; |
2620 | LoopVectorBody.push_back(VecBody); |
2621 | LoopScalarBody = OldBasicBlock; |
2622 | |
2623 | LoopVectorizeHints Hints(Lp, true); |
2624 | Hints.setAlreadyVectorized(); |
2625 | } |
2626 | |
2627 | /// This function returns the identity element (or neutral element) for |
2628 | /// the operation K. |
2629 | Constant* |
2630 | LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) { |
2631 | switch (K) { |
2632 | case RK_IntegerXor: |
2633 | case RK_IntegerAdd: |
2634 | case RK_IntegerOr: |
2635 | // Adding, xoring, or oring zero to a number does not change it.
2636 | return ConstantInt::get(Tp, 0); |
2637 | case RK_IntegerMult: |
2638 | // Multiplying a number by 1 does not change it. |
2639 | return ConstantInt::get(Tp, 1); |
2640 | case RK_IntegerAnd: |
2641 | // AND-ing a number with an all-1 value does not change it. |
2642 | return ConstantInt::get(Tp, -1, true); |
2643 | case RK_FloatMult: |
2644 | // Multiplying a number by 1 does not change it. |
2645 | return ConstantFP::get(Tp, 1.0L); |
2646 | case RK_FloatAdd: |
2647 | // Adding zero to a number does not change it. |
2648 | return ConstantFP::get(Tp, 0.0L); |
2649 | default: |
2650 | llvm_unreachable("Unknown reduction kind");
2651 | } |
2652 | } |
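
// For illustration, how the identity seeds a reduction, assuming a VF-wide
// integer add (a sketch, not code from this file):
//
//   Constant *Iden = getReductionIdentity(RK_IntegerAdd, Int32Ty); // i32 0
//   Constant *Ident = ConstantVector::getSplat(VF, Iden);          // <0,0,0,0>
//   // Lane 0 is later replaced by the incoming scalar start value, so
//   // summing all lanes reproduces the scalar running sum.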
2653 | |
2654 | /// This function translates the reduction kind to an LLVM binary operator. |
2655 | static unsigned |
2656 | getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { |
2657 | switch (Kind) { |
2658 | case LoopVectorizationLegality::RK_IntegerAdd: |
2659 | return Instruction::Add; |
2660 | case LoopVectorizationLegality::RK_IntegerMult: |
2661 | return Instruction::Mul; |
2662 | case LoopVectorizationLegality::RK_IntegerOr: |
2663 | return Instruction::Or; |
2664 | case LoopVectorizationLegality::RK_IntegerAnd: |
2665 | return Instruction::And; |
2666 | case LoopVectorizationLegality::RK_IntegerXor: |
2667 | return Instruction::Xor; |
2668 | case LoopVectorizationLegality::RK_FloatMult: |
2669 | return Instruction::FMul; |
2670 | case LoopVectorizationLegality::RK_FloatAdd: |
2671 | return Instruction::FAdd; |
2672 | case LoopVectorizationLegality::RK_IntegerMinMax: |
2673 | return Instruction::ICmp; |
2674 | case LoopVectorizationLegality::RK_FloatMinMax: |
2675 | return Instruction::FCmp; |
2676 | default: |
2677 | llvm_unreachable("Unknown reduction operation");
2678 | } |
2679 | } |
2680 | |
2681 | Value *createMinMaxOp(IRBuilder<> &Builder, |
2682 | LoopVectorizationLegality::MinMaxReductionKind RK, |
2683 | Value *Left, |
2684 | Value *Right) { |
2685 | CmpInst::Predicate P = CmpInst::ICMP_NE; |
2686 | switch (RK) { |
2687 | default: |
2688 | llvm_unreachable("Unknown min/max reduction kind");
2689 | case LoopVectorizationLegality::MRK_UIntMin: |
2690 | P = CmpInst::ICMP_ULT; |
2691 | break; |
2692 | case LoopVectorizationLegality::MRK_UIntMax: |
2693 | P = CmpInst::ICMP_UGT; |
2694 | break; |
2695 | case LoopVectorizationLegality::MRK_SIntMin: |
2696 | P = CmpInst::ICMP_SLT; |
2697 | break; |
2698 | case LoopVectorizationLegality::MRK_SIntMax: |
2699 | P = CmpInst::ICMP_SGT; |
2700 | break; |
2701 | case LoopVectorizationLegality::MRK_FloatMin: |
2702 | P = CmpInst::FCMP_OLT; |
2703 | break; |
2704 | case LoopVectorizationLegality::MRK_FloatMax: |
2705 | P = CmpInst::FCMP_OGT; |
2706 | break; |
2707 | } |
2708 | |
2709 | Value *Cmp; |
2710 | if (RK == LoopVectorizationLegality::MRK_FloatMin || |
2711 | RK == LoopVectorizationLegality::MRK_FloatMax) |
2712 | Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); |
2713 | else |
2714 | Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp"); |
2715 | |
2716 | Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select"); |
2717 | return Select; |
2718 | } |
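
// For example, a signed-max reduction step lowers to a compare plus a select;
// a scalar sketch of the same pattern:
//
//   int smax(int Left, int Right) {
//     bool Cmp = Left > Right;      // ICMP_SGT -> "rdx.minmax.cmp"
//     return Cmp ? Left : Right;    // "rdx.minmax.select"
//   }
//
// The emitted version applies this lane-wise to vector operands.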
2719 | |
2720 | namespace { |
2721 | struct CSEDenseMapInfo { |
2722 | static bool canHandle(Instruction *I) { |
2723 | return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || |
2724 | isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); |
2725 | } |
2726 | static inline Instruction *getEmptyKey() { |
2727 | return DenseMapInfo<Instruction *>::getEmptyKey(); |
2728 | } |
2729 | static inline Instruction *getTombstoneKey() { |
2730 | return DenseMapInfo<Instruction *>::getTombstoneKey(); |
2731 | } |
2732 | static unsigned getHashValue(Instruction *I) { |
2733 | assert(canHandle(I) && "Unknown instruction!");
2734 | return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), |
2735 | I->value_op_end())); |
2736 | } |
2737 | static bool isEqual(Instruction *LHS, Instruction *RHS) { |
2738 | if (LHS == getEmptyKey() || RHS == getEmptyKey() || |
2739 | LHS == getTombstoneKey() || RHS == getTombstoneKey()) |
2740 | return LHS == RHS; |
2741 | return LHS->isIdenticalTo(RHS); |
2742 | } |
2743 | }; |
2744 | } |
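
// For illustration, two instructions that CSEDenseMapInfo treats as equal
// (a sketch; the %-names are hypothetical IR values):
//
//   %g1 = getelementptr i32* %base, i64 %i
//   %g2 = getelementptr i32* %base, i64 %i
//
// getHashValue combines the opcode with the operand list, and isEqual defers
// to Instruction::isIdenticalTo, so cse() below can fold %g2 into %g1.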
2745 | |
2746 | /// \brief Check whether this block is a predicated block. |
2747 | /// Due to if predication of stores we might create a sequence of "if(pred) a[i] |
2748 | /// = ...; " blocks. We start with one vectorized basic block. For every |
2749 | /// conditional block we split this vectorized block. Therefore, every second |
2750 | /// block will be a predicated one. |
2751 | static bool isPredicatedBlock(unsigned BlockNum) { |
2752 | return BlockNum % 2; |
2753 | } |
2754 | |
2755 | /// \brief Perform CSE of induction variable instructions.
2756 | static void cse(SmallVector<BasicBlock *, 4> &BBs) { |
2757 | // Perform simple CSE.
2758 | SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; |
2759 | for (unsigned i = 0, e = BBs.size(); i != e; ++i) { |
2760 | BasicBlock *BB = BBs[i]; |
2761 | for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { |
2762 | Instruction *In = I++; |
2763 | |
2764 | if (!CSEDenseMapInfo::canHandle(In)) |
2765 | continue; |
2766 | |
2767 | // Check if we can replace this instruction with any of the |
2768 | // visited instructions. |
2769 | if (Instruction *V = CSEMap.lookup(In)) { |
2770 | In->replaceAllUsesWith(V); |
2771 | In->eraseFromParent(); |
2772 | continue; |
2773 | } |
2774 | // Ignore instructions in conditional blocks. We create "if (pred) a[i] = |
2775 | // ...;" blocks for predicated stores. Every second block is a predicated |
2776 | // block. |
2777 | if (isPredicatedBlock(i)) |
2778 | continue; |
2779 | |
2780 | CSEMap[In] = In; |
2781 | } |
2782 | } |
2783 | } |
2784 | |
2785 | /// \brief Adds a 'fast' flag to floating point operations. |
2786 | static Value *addFastMathFlag(Value *V) { |
2787 | if (isa<FPMathOperator>(V)){ |
2788 | FastMathFlags Flags; |
2789 | Flags.setUnsafeAlgebra(); |
2790 | cast<Instruction>(V)->setFastMathFlags(Flags); |
2791 | } |
2792 | return V; |
2793 | } |
2794 | |
2795 | void InnerLoopVectorizer::vectorizeLoop() { |
2796 | //===------------------------------------------------===// |
2797 | // |
2798 | // Notice: any optimization or new instruction that goes
2799 | // into the code below should also be implemented in
2800 | // the cost-model.
2801 | // |
2802 | //===------------------------------------------------===// |
2803 | Constant *Zero = Builder.getInt32(0); |
2804 | |
2805 | // In order to support reduction variables we need to be able to vectorize |
2806 | // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two |
2807 | // stages. First, we create a new vector PHI node with no incoming edges. |
2808 | // We use this value when we vectorize all of the instructions that use the |
2809 | // PHI. Next, after all of the instructions in the block are complete we |
2810 | // add the new incoming edges to the PHI. At this point all of the |
2811 | // instructions in the basic block are vectorized, so we can use them to |
2812 | // construct the PHI. |
2813 | PhiVector RdxPHIsToFix; |
2814 | |
2815 | // Scan the loop in a topological order to ensure that defs are vectorized |
2816 | // before users. |
2817 | LoopBlocksDFS DFS(OrigLoop); |
2818 | DFS.perform(LI); |
2819 | |
2820 | // Vectorize all of the blocks in the original loop. |
2821 | for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), |
2822 | be = DFS.endRPO(); bb != be; ++bb) |
2823 | vectorizeBlockInLoop(*bb, &RdxPHIsToFix); |
2824 | |
2825 | // At this point every instruction in the original loop is widened to |
2826 | // a vector form. We are almost done. Now, we need to fix the PHI nodes |
2827 | // that we vectorized. The PHI nodes are currently empty because we did |
2828 | // not want to introduce cycles. Notice that the remaining PHI nodes |
2829 | // that we need to fix are reduction variables. |
2830 | |
2831 | // Create the 'reduced' values for each of the reduction vars.
2832 | // The reduced values are the vector values that we scalarize and combine |
2833 | // after the loop is finished. |
2834 | for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end(); |
2835 | it != e; ++it) { |
2836 | PHINode *RdxPhi = *it; |
2837 | assert(RdxPhi && "Unable to recover vectorized PHI");
2838 | |
2839 | // Find the reduction variable descriptor. |
2840 | assert(Legal->getReductionVars()->count(RdxPhi) &&
2841 | "Unable to find the reduction variable");
2842 | LoopVectorizationLegality::ReductionDescriptor RdxDesc = |
2843 | (*Legal->getReductionVars())[RdxPhi]; |
2844 | |
2845 | setDebugLocFromInst(Builder, RdxDesc.StartValue); |
2846 | |
2847 | // We need to generate a reduction vector from the incoming scalar. |
2848 | // To do so, we need to generate the 'identity' vector and override |
2849 | // one of the elements with the incoming scalar reduction. We need |
2850 | // to do it in the vector-loop preheader. |
2851 | Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); |
2852 | |
2853 | // This is the vector-clone of the value that leaves the loop. |
2854 | VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr); |
2855 | Type *VecTy = VectorExit[0]->getType(); |
2856 | |
2857 | // Find the reduction identity variable: zero for addition, or, and xor;
2858 | // one for multiplication; -1 for and.
2859 | Value *Identity; |
2860 | Value *VectorStart; |
2861 | if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax || |
2862 | RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) { |
2863 | // MinMax reductions have the start value as their identity.
2864 | if (VF == 1) { |
2865 | VectorStart = Identity = RdxDesc.StartValue; |
2866 | } else { |
2867 | VectorStart = Identity = Builder.CreateVectorSplat(VF, |
2868 | RdxDesc.StartValue, |
2869 | "minmax.ident"); |
2870 | } |
2871 | } else { |
2872 | // Handle other reduction kinds: |
2873 | Constant *Iden = |
2874 | LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind, |
2875 | VecTy->getScalarType()); |
2876 | if (VF == 1) { |
2877 | Identity = Iden; |
2878 | // This vector is the Identity vector where the first element is the |
2879 | // incoming scalar reduction. |
2880 | VectorStart = RdxDesc.StartValue; |
2881 | } else { |
2882 | Identity = ConstantVector::getSplat(VF, Iden); |
2883 | |
2884 | // This vector is the Identity vector where the first element is the |
2885 | // incoming scalar reduction. |
2886 | VectorStart = Builder.CreateInsertElement(Identity, |
2887 | RdxDesc.StartValue, Zero); |
2888 | } |
2889 | } |
2890 | |
2891 | // Fix the vector-loop phi. |
2892 | |
2893 | // Reductions do not have to start at zero. They can start with |
2894 | // any loop invariant values. |
2895 | VectorParts &VecRdxPhi = WidenMap.get(RdxPhi); |
2896 | BasicBlock *Latch = OrigLoop->getLoopLatch(); |
2897 | Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch); |
2898 | VectorParts &Val = getVectorValue(LoopVal); |
2899 | for (unsigned part = 0; part < UF; ++part) { |
2900 | // Make sure to add the reduction start value only to the
2901 | // first unroll part.
2902 | Value *StartVal = (part == 0) ? VectorStart : Identity; |
2903 | cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, |
2904 | LoopVectorPreHeader); |
2905 | cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], |
2906 | LoopVectorBody.back()); |
2907 | } |
2908 | |
2909 | // Before each round, move the insertion point right between |
2910 | // the PHIs and the values we are going to write. |
2911 | // This allows us to write both PHINodes and the extractelement |
2912 | // instructions. |
2913 | Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); |
2914 | |
2915 | VectorParts RdxParts; |
2916 | setDebugLocFromInst(Builder, RdxDesc.LoopExitInstr); |
2917 | for (unsigned part = 0; part < UF; ++part) { |
2918 | // This PHINode contains the vectorized reduction variable, or |
2919 | // the initial value vector, if we bypass the vector loop. |
2920 | VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr); |
2921 | PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); |
2922 | Value *StartVal = (part == 0) ? VectorStart : Identity; |
2923 | for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) |
2924 | NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]); |
2925 | NewPhi->addIncoming(RdxExitVal[part], |
2926 | LoopVectorBody.back()); |
2927 | RdxParts.push_back(NewPhi); |
2928 | } |
2929 | |
2930 | // Reduce all of the unrolled parts into a single vector. |
2931 | Value *ReducedPartRdx = RdxParts[0]; |
2932 | unsigned Op = getReductionBinOp(RdxDesc.Kind); |
2933 | setDebugLocFromInst(Builder, ReducedPartRdx); |
2934 | for (unsigned part = 1; part < UF; ++part) { |
2935 | if (Op != Instruction::ICmp && Op != Instruction::FCmp) |
2936 | // Floating point operations had to be 'fast' to enable the reduction. |
2937 | ReducedPartRdx = addFastMathFlag( |
2938 | Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part], |
2939 | ReducedPartRdx, "bin.rdx")); |
2940 | else |
2941 | ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind, |
2942 | ReducedPartRdx, RdxParts[part]); |
2943 | } |
2944 | |
2945 | if (VF > 1) { |
2946 | // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles |
2947 | // and vector ops, reducing the set of values being computed by half each |
2948 | // round. |
2949 | assert(isPowerOf2_32(VF) &&
2950 | "Reduction emission only supported for pow2 vectors!");
2951 | Value *TmpVec = ReducedPartRdx; |
2952 | SmallVector<Constant*, 32> ShuffleMask(VF, nullptr); |
2953 | for (unsigned i = VF; i != 1; i >>= 1) { |
2954 | // Move the upper half of the vector to the lower half. |
2955 | for (unsigned j = 0; j != i/2; ++j) |
2956 | ShuffleMask[j] = Builder.getInt32(i/2 + j); |
2957 | |
2958 | // Fill the rest of the mask with undef. |
2959 | std::fill(&ShuffleMask[i/2], ShuffleMask.end(), |
2960 | UndefValue::get(Builder.getInt32Ty())); |
2961 | |
2962 | Value *Shuf = |
2963 | Builder.CreateShuffleVector(TmpVec, |
2964 | UndefValue::get(TmpVec->getType()), |
2965 | ConstantVector::get(ShuffleMask), |
2966 | "rdx.shuf"); |
2967 | |
2968 | if (Op != Instruction::ICmp && Op != Instruction::FCmp) |
2969 | // Floating point operations had to be 'fast' to enable the reduction. |
2970 | TmpVec = addFastMathFlag(Builder.CreateBinOp( |
2971 | (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); |
2972 | else |
2973 | TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf); |
2974 | } |
2975 | |
2976 | // The result is in the first element of the vector. |
2977 | ReducedPartRdx = Builder.CreateExtractElement(TmpVec, |
2978 | Builder.getInt32(0)); |
2979 | } |
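
// A worked example of the log2(VF) shuffle reduction, assuming VF == 4 and
// an integer add (lane values are illustrative):
//
//   TmpVec = <a, b, c, d>
//   shuffle mask <2, 3, u, u>  ->  <c, d, u, u>
//   add                        ->  <a+c, b+d, u, u>
//   shuffle mask <1, u, u, u>  ->  <b+d, u, u, u>
//   add                        ->  <a+b+c+d, u, u, u>
//
// The extractelement above then reads the final sum out of lane 0.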
2980 | |
2981 | // Create a phi node that merges control-flow from the backedge-taken check |
2982 | // block and the middle block. |
2983 | PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx", |
2984 | LoopScalarPreHeader->getTerminator()); |
2985 | BCBlockPhi->addIncoming(RdxDesc.StartValue, LoopBypassBlocks[0]); |
2986 | BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); |
2987 | |
2988 | // Now, we need to fix the users of the reduction variable |
2989 | // inside and outside of the scalar remainder loop. |
2990 | // We know that the loop is in LCSSA form. We need to update the |
2991 | // PHI nodes in the exit blocks. |
2992 | for (BasicBlock::iterator LEI = LoopExitBlock->begin(), |
2993 | LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { |
2994 | PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); |
2995 | if (!LCSSAPhi) break; |
2996 | |
2997 | // All PHINodes need to have a single entry edge, or two if |
2998 | // we already fixed them. |
2999 | assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3000 | |
3001 | // We found our reduction value exit-PHI. Update it with the |
3002 | // incoming bypass edge. |
3003 | if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { |
3004 | // Add an edge coming from the bypass. |
3005 | LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); |
3006 | break; |
3007 | } |
3008 | }// end of the LCSSA phi scan. |
3009 | |
3010 | // Fix the scalar loop reduction variable with the incoming reduction sum |
3011 | // from the vector body and from the backedge value. |
3012 | int IncomingEdgeBlockIdx = |
3013 | (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch()); |
3014 | assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3015 | // Pick the other block. |
3016 | int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); |
3017 | (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); |
3018 | (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); |
3019 | }// end of for each redux variable. |
3020 | |
3021 | fixLCSSAPHIs(); |
3022 | |
3023 | // Remove redundant induction instructions. |
3024 | cse(LoopVectorBody); |
3025 | } |
3026 | |
3027 | void InnerLoopVectorizer::fixLCSSAPHIs() { |
3028 | for (BasicBlock::iterator LEI = LoopExitBlock->begin(), |
3029 | LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { |
3030 | PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); |
3031 | if (!LCSSAPhi) break; |
3032 | if (LCSSAPhi->getNumIncomingValues() == 1) |
3033 | LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), |
3034 | LoopMiddleBlock); |
3035 | } |
3036 | } |
3037 | |
3038 | InnerLoopVectorizer::VectorParts |
3039 | InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { |
3040 | assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
3041 | "Invalid edge");
3042 | |
3043 | // Look for cached value. |
3044 | std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst); |
3045 | EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge); |
3046 | if (ECEntryIt != MaskCache.end()) |
3047 | return ECEntryIt->second; |
3048 | |
3049 | VectorParts SrcMask = createBlockInMask(Src); |
3050 | |
3051 | // The terminator has to be a branch inst! |
3052 | BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); |
3053 | assert(BI && "Unexpected terminator found");
3054 | |
3055 | if (BI->isConditional()) { |
3056 | VectorParts EdgeMask = getVectorValue(BI->getCondition()); |
3057 | |
3058 | if (BI->getSuccessor(0) != Dst) |
3059 | for (unsigned part = 0; part < UF; ++part) |
3060 | EdgeMask[part] = Builder.CreateNot(EdgeMask[part]); |
3061 | |
3062 | for (unsigned part = 0; part < UF; ++part) |
3063 | EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]); |
3064 | |
3065 | MaskCache[Edge] = EdgeMask; |
3066 | return EdgeMask; |
3067 | } |
3068 | |
3069 | MaskCache[Edge] = SrcMask; |
3070 | return SrcMask; |
3071 | } |
3072 | |
3073 | InnerLoopVectorizer::VectorParts |
3074 | InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { |
3075 | assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
3076 | |
3077 | // Loop incoming mask is all-one. |
3078 | if (OrigLoop->getHeader() == BB) { |
3079 | Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1); |
3080 | return getVectorValue(C); |
3081 | } |
3082 | |
3083 | // This is the block mask. We OR all incoming edge masks, starting with zero.
3084 | Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0); |
3085 | VectorParts BlockMask = getVectorValue(Zero); |
3086 | |
3087 | // For each pred: |
3088 | for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) { |
3089 | VectorParts EM = createEdgeMask(*it, BB); |
3090 | for (unsigned part = 0; part < UF; ++part) |
3091 | BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]); |
3092 | } |
3093 | |
3094 | return BlockMask; |
3095 | } |
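
// For illustration, the masks these two functions produce for a simple
// diamond inside the loop (block names are hypothetical):
//
//   header -> { then, else } -> merge, branching on 'cond'
//
//   mask(then)  = mask(header) & cond
//   mask(else)  = mask(header) & ~cond
//   mask(merge) = mask(then) | mask(else)
//
// The loop header itself gets the all-ones mask returned above.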
3096 | |
3097 | void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, |
3098 | InnerLoopVectorizer::VectorParts &Entry, |
3099 | unsigned UF, unsigned VF, PhiVector *PV) { |
3100 | PHINode* P = cast<PHINode>(PN); |
3101 | // Handle reduction variables: |
3102 | if (Legal->getReductionVars()->count(P)) { |
3103 | for (unsigned part = 0; part < UF; ++part) { |
3104 | // This is phase one of vectorizing PHIs. |
3105 | Type *VecTy = (VF == 1) ? PN->getType() : |
3106 | VectorType::get(PN->getType(), VF); |
3107 | Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", |
3108 | LoopVectorBody.back()-> getFirstInsertionPt()); |
3109 | } |
3110 | PV->push_back(P); |
3111 | return; |
3112 | } |
3113 | |
3114 | setDebugLocFromInst(Builder, P); |
3115 | // Check for PHI nodes that are lowered to vector selects. |
3116 | if (P->getParent() != OrigLoop->getHeader()) { |
3117 | // We know that all PHIs in non-header blocks are converted into |
3118 | // selects, so we don't have to worry about the insertion order and we |
3119 | // can just use the builder. |
3120 | // At this point we generate the predication tree. There may be |
3121 | // duplications since this is a simple recursive scan, but future |
3122 | // optimizations will clean it up. |
3123 | |
3124 | unsigned NumIncoming = P->getNumIncomingValues(); |
3125 | |
3126 | // Generate a sequence of selects of the form: |
3127 | // SELECT(Mask3, In3, |
3128 | // SELECT(Mask2, In2, |
3129 | // ( ...))) |
3130 | for (unsigned In = 0; In < NumIncoming; In++) { |
3131 | VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), |
3132 | P->getParent()); |
3133 | VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); |
3134 | |
3135 | for (unsigned part = 0; part < UF; ++part) { |
3136 | // We might have single edge PHIs (blocks) - use an identity |
3137 | // 'select' for the first PHI operand. |
3138 | if (In == 0) |
3139 | Entry[part] = Builder.CreateSelect(Cond[part], In0[part], |
3140 | In0[part]); |
3141 | else |
3142 | // Select between the current value and the previous incoming edge |
3143 | // based on the incoming mask. |
3144 | Entry[part] = Builder.CreateSelect(Cond[part], In0[part], |
3145 | Entry[part], "predphi"); |
3146 | } |
3147 | } |
3148 | return; |
3149 | } |
3150 | |
3151 | // This PHINode must be an induction variable. |
3152 | // Make sure that we know about it. |
3153 | assert(Legal->getInductionVars()->count(P) &&
3154 | "Not an induction variable");
3155 | |
3156 | LoopVectorizationLegality::InductionInfo II = |
3157 | Legal->getInductionVars()->lookup(P); |
3158 | |
3159 | switch (II.IK) { |
3160 | case LoopVectorizationLegality::IK_NoInduction: |
3161 | llvm_unreachable("Unknown induction");
3162 | case LoopVectorizationLegality::IK_IntInduction: { |
3163 | assert(P->getType() == II.StartValue->getType() && "Types must match");
3164 | Type *PhiTy = P->getType(); |
3165 | Value *Broadcasted; |
3166 | if (P == OldInduction) { |
3167 | // Handle the canonical induction variable. We might have had to |
3168 | // extend the type. |
3169 | Broadcasted = Builder.CreateTrunc(Induction, PhiTy); |
3170 | } else { |
3171 | // Handle other induction variables that are now based on the |
3172 | // canonical one. |
3173 | Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, |
3174 | "normalized.idx"); |
3175 | NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); |
3176 | Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx, |
3177 | "offset.idx"); |
3178 | } |
3179 | Broadcasted = getBroadcastInstrs(Broadcasted); |
3180 | // After broadcasting the induction variable we need to make the vector |
3181 | // consecutive by adding 0, 1, 2, etc. |
3182 | for (unsigned part = 0; part < UF; ++part) |
3183 | Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); |
3184 | return; |
3185 | } |
3186 | case LoopVectorizationLegality::IK_ReverseIntInduction: |
3187 | case LoopVectorizationLegality::IK_PtrInduction: |
3188 | case LoopVectorizationLegality::IK_ReversePtrInduction: |
3189 | // Handle reverse integer and pointer inductions. |
3190 | Value *StartIdx = ExtendedIdx; |
3191 | // This is the normalized GEP that starts counting at zero. |
3192 | Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, |
3193 | "normalized.idx"); |
3194 | |
3195 | // Handle the reverse integer induction variable case. |
3196 | if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) { |
3197 | IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType()); |
3198 | Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, |
3199 | "resize.norm.idx"); |
3200 | Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, |
3201 | "reverse.idx"); |
3202 | |
3203 | // This is a new value so do not hoist it out. |
3204 | Value *Broadcasted = getBroadcastInstrs(ReverseInd); |
3205 | // After broadcasting the induction variable we need to make the |
3206 | // vector consecutive by adding ... -3, -2, -1, 0. |
3207 | for (unsigned part = 0; part < UF; ++part) |
3208 | Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part, |
3209 | true); |
3210 | return; |
3211 | } |
3212 | |
3213 | // Handle the pointer induction variable case. |
3214 | assert(P->getType()->isPointerTy() && "Unexpected type.");
3215 | |
3216 | // Is this a reverse induction ptr or a consecutive induction ptr?
3217 | bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction == |
3218 | II.IK); |
3219 | |
3220 | // This is the vector of results. Notice that we don't generate |
3221 | // vector geps because scalar geps result in better code. |
3222 | for (unsigned part = 0; part < UF; ++part) { |
3223 | if (VF == 1) { |
3224 | int EltIndex = (part) * (Reverse ? -1 : 1); |
3225 | Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); |
3226 | Value *GlobalIdx; |
3227 | if (Reverse) |
3228 | GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); |
3229 | else |
3230 | GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); |
3231 | |
3232 | Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, |
3233 | "next.gep"); |
3234 | Entry[part] = SclrGep; |
3235 | continue; |
3236 | } |
3237 | |
3238 | Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); |
3239 | for (unsigned int i = 0; i < VF; ++i) { |
3240 | int EltIndex = (i + part * VF) * (Reverse ? -1 : 1); |
3241 | Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); |
3242 | Value *GlobalIdx; |
3243 | if (!Reverse) |
3244 | GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); |
3245 | else |
3246 | GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); |
3247 | |
3248 | Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, |
3249 | "next.gep"); |
3250 | VecVal = Builder.CreateInsertElement(VecVal, SclrGep, |
3251 | Builder.getInt32(i), |
3252 | "insert.gep"); |
3253 | } |
3254 | Entry[part] = VecVal; |
3255 | } |
3256 | return; |
3257 | } |
3258 | } |
3259 | |
3260 | void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { |
3261 | // For each instruction in the old loop. |
3262 | for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { |
3263 | VectorParts &Entry = WidenMap.get(it); |
3264 | switch (it->getOpcode()) { |
3265 | case Instruction::Br: |
3266 | // Nothing to do for PHIs and BR, since we already took care of the |
3267 | // loop control flow instructions. |
3268 | continue; |
3269 | case Instruction::PHI:{ |
3270 | // Vectorize PHINodes. |
3271 | widenPHIInstruction(it, Entry, UF, VF, PV); |
3272 | continue; |
3273 | }// End of PHI. |
3274 | |
3275 | case Instruction::Add: |
3276 | case Instruction::FAdd: |
3277 | case Instruction::Sub: |
3278 | case Instruction::FSub: |
3279 | case Instruction::Mul: |
3280 | case Instruction::FMul: |
3281 | case Instruction::UDiv: |
3282 | case Instruction::SDiv: |
3283 | case Instruction::FDiv: |
3284 | case Instruction::URem: |
3285 | case Instruction::SRem: |
3286 | case Instruction::FRem: |
3287 | case Instruction::Shl: |
3288 | case Instruction::LShr: |
3289 | case Instruction::AShr: |
3290 | case Instruction::And: |
3291 | case Instruction::Or: |
3292 | case Instruction::Xor: { |
3293 | // Just widen binops. |
3294 | BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it); |
3295 | setDebugLocFromInst(Builder, BinOp); |
3296 | VectorParts &A = getVectorValue(it->getOperand(0)); |
3297 | VectorParts &B = getVectorValue(it->getOperand(1)); |
3298 | |
3299 | // Use this vector value for all users of the original instruction. |
3300 | for (unsigned Part = 0; Part < UF; ++Part) { |
3301 | Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); |
3302 | |
3303 | if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V)) |
3304 | VecOp->copyIRFlags(BinOp); |
3305 | |
3306 | Entry[Part] = V; |
3307 | } |
3308 | |
3309 | propagateMetadata(Entry, it); |
3310 | break; |
3311 | } |
3312 | case Instruction::Select: { |
3313 | // Widen selects. |
3314 | // If the selector is loop invariant we can create a select |
3315 | // instruction with a scalar condition. Otherwise, use vector-select. |
3316 | bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)), |
3317 | OrigLoop); |
3318 | setDebugLocFromInst(Builder, it); |
3319 | |
3320 | // The condition can be loop invariant but still defined inside the |
3321 | // loop. This means that we can't just use the original 'cond' value. |
3322 | // We have to take the 'vectorized' value and pick the first lane. |
3323 | // Instcombine will make this a no-op. |
3324 | VectorParts &Cond = getVectorValue(it->getOperand(0)); |
3325 | VectorParts &Op0 = getVectorValue(it->getOperand(1)); |
3326 | VectorParts &Op1 = getVectorValue(it->getOperand(2)); |
3327 | |
3328 | Value *ScalarCond = (VF == 1) ? Cond[0] : |
3329 | Builder.CreateExtractElement(Cond[0], Builder.getInt32(0)); |
3330 | |
3331 | for (unsigned Part = 0; Part < UF; ++Part) { |
3332 | Entry[Part] = Builder.CreateSelect( |
3333 | InvariantCond ? ScalarCond : Cond[Part], |
3334 | Op0[Part], |
3335 | Op1[Part]); |
3336 | } |
3337 | |
3338 | propagateMetadata(Entry, it); |
3339 | break; |
3340 | } |
3341 | |
3342 | case Instruction::ICmp: |
3343 | case Instruction::FCmp: { |
3344 | // Widen compares. Generate vector compares. |
3345 | bool FCmp = (it->getOpcode() == Instruction::FCmp); |
3346 | CmpInst *Cmp = dyn_cast<CmpInst>(it); |
3347 | setDebugLocFromInst(Builder, it); |
3348 | VectorParts &A = getVectorValue(it->getOperand(0)); |
3349 | VectorParts &B = getVectorValue(it->getOperand(1)); |
3350 | for (unsigned Part = 0; Part < UF; ++Part) { |
3351 | Value *C = nullptr; |
3352 | if (FCmp) |
3353 | C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); |
3354 | else |
3355 | C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); |
3356 | Entry[Part] = C; |
3357 | } |
3358 | |
3359 | propagateMetadata(Entry, it); |
3360 | break; |
3361 | } |
3362 | |
3363 | case Instruction::Store: |
3364 | case Instruction::Load: |
3365 | vectorizeMemoryInstruction(it); |
3366 | break; |
3367 | case Instruction::ZExt: |
3368 | case Instruction::SExt: |
3369 | case Instruction::FPToUI: |
3370 | case Instruction::FPToSI: |
3371 | case Instruction::FPExt: |
3372 | case Instruction::PtrToInt: |
3373 | case Instruction::IntToPtr: |
3374 | case Instruction::SIToFP: |
3375 | case Instruction::UIToFP: |
3376 | case Instruction::Trunc: |
3377 | case Instruction::FPTrunc: |
3378 | case Instruction::BitCast: { |
3379 | CastInst *CI = dyn_cast<CastInst>(it); |
3380 | setDebugLocFromInst(Builder, it); |
3381 | /// Optimize the special case where the source is the induction |
3382 | /// variable. Notice that we can only optimize the 'trunc' case |
3383 | /// because: a. FP conversions lose precision, b. sext/zext may wrap, |
3384 | /// c. other casts depend on pointer size. |
3385 | if (CI->getOperand(0) == OldInduction && |
3386 | it->getOpcode() == Instruction::Trunc) { |
3387 | Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, |
3388 | CI->getType()); |
3389 | Value *Broadcasted = getBroadcastInstrs(ScalarCast); |
3390 | for (unsigned Part = 0; Part < UF; ++Part) |
3391 | Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false); |
3392 | propagateMetadata(Entry, it); |
3393 | break; |
3394 | } |
3395 | /// Vectorize casts. |
3396 | Type *DestTy = (VF == 1) ? CI->getType() : |
3397 | VectorType::get(CI->getType(), VF); |
3398 | |
3399 | VectorParts &A = getVectorValue(it->getOperand(0)); |
3400 | for (unsigned Part = 0; Part < UF; ++Part) |
3401 | Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); |
3402 | propagateMetadata(Entry, it); |
3403 | break; |
3404 | } |
3405 | |
3406 | case Instruction::Call: { |
3407 | // Ignore dbg intrinsics. |
3408 | if (isa<DbgInfoIntrinsic>(it)) |
3409 | break; |
3410 | setDebugLocFromInst(Builder, it); |
3411 | |
3412 | Module *M = BB->getParent()->getParent(); |
3413 | CallInst *CI = cast<CallInst>(it); |
3414 | Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); |
3415 | assert(ID && "Not an intrinsic call!");
3416 | switch (ID) { |
3417 | case Intrinsic::assume: |
3418 | case Intrinsic::lifetime_end: |
3419 | case Intrinsic::lifetime_start: |
3420 | scalarizeInstruction(it); |
3421 | break; |
3422 | default: |
3423 | bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1); |
3424 | for (unsigned Part = 0; Part < UF; ++Part) { |
3425 | SmallVector<Value *, 4> Args; |
3426 | for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { |
3427 | if (HasScalarOpd && i == 1) { |
3428 | Args.push_back(CI->getArgOperand(i)); |
3429 | continue; |
3430 | } |
3431 | VectorParts &Arg = getVectorValue(CI->getArgOperand(i)); |
3432 | Args.push_back(Arg[Part]); |
3433 | } |
3434 | Type *Tys[] = {CI->getType()}; |
3435 | if (VF > 1) |
3436 | Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF); |
3437 | |
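 | // E.g. an llvm.sqrt call on float at VF == 4 resolves to the
 | // <4 x float> overload of the intrinsic, llvm.sqrt.v4f32.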
3438 | Function *F = Intrinsic::getDeclaration(M, ID, Tys); |
3439 | Entry[Part] = Builder.CreateCall(F, Args); |
3440 | } |
3441 | |
3442 | propagateMetadata(Entry, it); |
3443 | break; |
3444 | } |
3445 | break; |
3446 | } |
3447 | |
3448 | default: |
3449 | // All other instructions are unsupported. Scalarize them. |
3450 | scalarizeInstruction(it); |
3451 | break; |
3452 | }// end of switch. |
3453 | }// end of for_each instr. |
3454 | } |
3455 | |
3456 | void InnerLoopVectorizer::updateAnalysis() { |
3457 | // Forget the original basic block. |
3458 | SE->forgetLoop(OrigLoop); |
3459 | |
3460 | // Update the dominator tree information. |
3461 | assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
3462 | "Entry does not dominate exit.");
3463 | |
3464 | for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) |
3465 | DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]); |
3466 | DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back()); |
3467 | |
3468 | // Due to if predication of stores we might create a sequence of "if(pred) |
3469 | // a[i] = ...; " blocks. |
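 | // In that layout LoopVectorBody is [body, predicated block, merge block,
 | // ...]: a predicated block is dominated by its predecessor, while the
 | // block after it is dominated by the block two entries back.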
3470 | for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) { |
3471 | if (i == 0) |
3472 | DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader); |
3473 | else if (isPredicatedBlock(i)) { |
3474 | DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]); |
3475 | } else { |
3476 | DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]); |
3477 | } |
3478 | } |
3479 | |
3480 | DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]); |
3481 | DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); |
3482 | DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); |
3483 | DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); |
3484 | |
3485 | DEBUG(DT->verifyDomTree());
3486 | } |
3487 | |
3488 | /// \brief Check whether it is safe to if-convert this phi node. |
3489 | /// |
3490 | /// Phi nodes with constant expressions that can trap are not safe to if |
3491 | /// convert. |
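 | /// For example, a phi incoming value that is a 'sdiv' ConstantExpr whose
 | /// divisor is not known to be non-zero could trap if if-conversion turned
 | /// the phi into an unconditionally evaluated select.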
3492 | static bool canIfConvertPHINodes(BasicBlock *BB) { |
3493 | for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { |
3494 | PHINode *Phi = dyn_cast<PHINode>(I); |
3495 | if (!Phi) |
3496 | return true; |
3497 | for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p) |
3498 | if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p))) |
3499 | if (C->canTrap()) |
3500 | return false; |
3501 | } |
3502 | return true; |
3503 | } |
3504 | |
3505 | bool LoopVectorizationLegality::canVectorizeWithIfConvert() { |
3506 | if (!EnableIfConversion) { |
3507 | emitAnalysis(Report() << "if-conversion is disabled"); |
3508 | return false; |
3509 | } |
3510 | |
3511 | assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
3512 | |
3513 | // A list of pointers that we can safely read and write to. |
3514 | SmallPtrSet<Value *, 8> SafePointes; |
3515 | |
3516 | // Collect safe addresses. |
3517 | for (Loop::block_iterator BI = TheLoop->block_begin(), |
3518 | BE = TheLoop->block_end(); BI != BE; ++BI) { |
3519 | BasicBlock *BB = *BI; |
3520 | |
3521 | if (blockNeedsPredication(BB)) |
3522 | continue; |
3523 | |
3524 | for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { |
3525 | if (LoadInst *LI = dyn_cast<LoadInst>(I)) |
3526 | SafePointes.insert(LI->getPointerOperand()); |
3527 | else if (StoreInst *SI = dyn_cast<StoreInst>(I)) |
3528 | SafePointes.insert(SI->getPointerOperand()); |
3529 | } |
3530 | } |
3531 | |
3532 | // Collect the blocks that need predication. |
3533 | BasicBlock *Header = TheLoop->getHeader(); |
3534 | for (Loop::block_iterator BI = TheLoop->block_begin(), |
3535 | BE = TheLoop->block_end(); BI != BE; ++BI) { |
3536 | BasicBlock *BB = *BI; |
3537 | |
3538 | // We don't support switch statements inside loops. |
3539 | if (!isa<BranchInst>(BB->getTerminator())) { |
3540 | emitAnalysis(Report(BB->getTerminator()) |
3541 | << "loop contains a switch statement"); |
3542 | return false; |
3543 | } |
3544 | |
3545 | // We must be able to predicate all blocks that need to be predicated. |
3546 | if (blockNeedsPredication(BB)) { |
3547 | if (!blockCanBePredicated(BB, SafePointes)) { |
3548 | emitAnalysis(Report(BB->getTerminator()) |
3549 | << "control flow cannot be substituted for a select"); |
3550 | return false; |
3551 | } |
3552 | } else if (BB != Header && !canIfConvertPHINodes(BB)) { |
3553 | emitAnalysis(Report(BB->getTerminator()) |
3554 | << "control flow cannot be substituted for a select"); |
3555 | return false; |
3556 | } |
3557 | } |
3558 | |
3559 | // We can if-convert this loop. |
3560 | return true; |
3561 | } |
3562 | |
3563 | bool LoopVectorizationLegality::canVectorize() { |
3564 | // We must have a loop in canonical form. Loops with indirectbr in them cannot |
3565 | // be canonicalized. |
3566 | if (!TheLoop->getLoopPreheader()) { |
3567 | emitAnalysis( |
3568 | Report() << "loop control flow is not understood by vectorizer"); |
3569 | return false; |
3570 | } |
3571 | |
3572 | // We can only vectorize innermost loops. |
3573 | if (TheLoop->getSubLoopsVector().size()) { |
3574 | emitAnalysis(Report() << "loop is not the innermost loop"); |
3575 | return false; |
3576 | } |
3577 | |
3578 | // We must have a single backedge. |
3579 | if (TheLoop->getNumBackEdges() != 1) { |
3580 | emitAnalysis( |
3581 | Report() << "loop control flow is not understood by vectorizer"); |
3582 | return false; |
3583 | } |
3584 | |
3585 | // We must have a single exiting block. |
3586 | if (!TheLoop->getExitingBlock()) { |
3587 | emitAnalysis( |
3588 | Report() << "loop control flow is not understood by vectorizer"); |
3589 | return false; |
3590 | } |
3591 | |
3592 | // We only handle bottom-tested loops, i.e. loops in which the condition is
3593 | // checked at the end of each iteration. With that we can assume that all |
3594 | // instructions in the loop are executed the same number of times. |
3595 | if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { |
3596 | emitAnalysis( |
3597 | Report() << "loop control flow is not understood by vectorizer"); |
3598 | return false; |
3599 | } |
3600 | |
3601 | // We need to have a loop header. |
3602 | DEBUG(dbgs() << "LV: Found a loop: " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName() << '\n'; } } while (0) |
3603 | TheLoop->getHeader()->getName() << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName() << '\n'; } } while (0); |
3604 | |
3605 | // Check if we can if-convert non-single-bb loops. |
3606 | unsigned NumBlocks = TheLoop->getNumBlocks(); |
3607 | if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { |
3608 | DEBUG(dbgs() << "LV: Can't if-convert the loop.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Can't if-convert the loop.\n" ; } } while (0); |
3609 | return false; |
3610 | } |
3611 | |
3612 | // ScalarEvolution needs to be able to find the exit count. |
3613 | const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); |
3614 | if (ExitCount == SE->getCouldNotCompute()) { |
3615 | emitAnalysis(Report() << "could not determine number of loop iterations"); |
3616 | DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: SCEV could not compute the loop exit count.\n" ; } } while (0); |
3617 | return false; |
3618 | } |
3619 | |
3620 | // Check if we can vectorize the instructions and CFG in this loop. |
3621 | if (!canVectorizeInstrs()) { |
3622 | DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Can't vectorize the instructions or CFG\n" ; } } while (0); |
3623 | return false; |
3624 | } |
3625 | |
3626 | // Go over each instruction and look at memory deps. |
3627 | if (!canVectorizeMemory()) { |
3628 | DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Can't vectorize due to memory conflicts\n" ; } } while (0); |
3629 | return false; |
3630 | } |
3631 | |
3632 | // Collect all of the variables that remain uniform after vectorization. |
3633 | collectLoopUniforms(); |
3634 | |
3635 | DEBUG(dbgs() << "LV: We can vectorize this loop" <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can vectorize this loop" << (PtrRtCheck.Need ? " (with a runtime bound check)" : "") <<"!\n"; } } while (0) |
3636 | (PtrRtCheck.Need ? " (with a runtime bound check)" : "")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can vectorize this loop" << (PtrRtCheck.Need ? " (with a runtime bound check)" : "") <<"!\n"; } } while (0) |
3637 | <<"!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can vectorize this loop" << (PtrRtCheck.Need ? " (with a runtime bound check)" : "") <<"!\n"; } } while (0); |
3638 | |
3639 | // Okay! We can vectorize. At this point we don't have any other mem analysis |
3640 | // which may limit our maximum vectorization factor, so just return true with |
3641 | // no restrictions. |
3642 | return true; |
3643 | } |
3644 | |
3645 | static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { |
3646 | if (Ty->isPointerTy()) |
3647 | return DL.getIntPtrType(Ty); |
3648 | |
3649 | // It is possible that chars or shorts overflow when we ask for the loop's
3650 | // trip count; work around this by changing the type size.
3651 | if (Ty->getScalarSizeInBits() < 32) |
3652 | return Type::getInt32Ty(Ty->getContext()); |
3653 | |
3654 | return Ty; |
3655 | } |
3656 | |
3657 | static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) { |
3658 | Ty0 = convertPointerToIntegerType(DL, Ty0); |
3659 | Ty1 = convertPointerToIntegerType(DL, Ty1); |
3660 | if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) |
3661 | return Ty0; |
3662 | return Ty1; |
3663 | } |
3664 | |
3665 | /// \brief Check that the instruction has outside loop users and is not an |
3666 | /// identified reduction variable. |
3667 | static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, |
3668 | SmallPtrSetImpl<Value *> &Reductions) { |
3669 | // Reduction instructions are allowed to have exit users. All other |
3670 | // instructions must not have external users. |
3671 | if (!Reductions.count(Inst)) |
3672 | // Check that all of the users of the instruction are inside the loop.
3673 | for (User *U : Inst->users()) { |
3674 | Instruction *UI = cast<Instruction>(U); |
3675 | // This user may be a reduction exit value. |
3676 | if (!TheLoop->contains(UI)) { |
3677 | DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an outside user for : " << *UI << '\n'; } } while (0); |
3678 | return true; |
3679 | } |
3680 | } |
3681 | return false; |
3682 | } |
3683 | |
3684 | bool LoopVectorizationLegality::canVectorizeInstrs() { |
3685 | BasicBlock *PreHeader = TheLoop->getLoopPreheader(); |
3686 | BasicBlock *Header = TheLoop->getHeader(); |
3687 | |
3688 | // Look for the attribute signaling the absence of NaNs. |
3689 | Function &F = *Header->getParent(); |
3690 | if (F.hasFnAttribute("no-nans-fp-math")) |
3691 | HasFunNoNaNAttr = F.getAttributes().getAttribute( |
3692 | AttributeSet::FunctionIndex, |
3693 | "no-nans-fp-math").getValueAsString() == "true"; |
3694 | |
3695 | // For each block in the loop. |
3696 | for (Loop::block_iterator bb = TheLoop->block_begin(), |
3697 | be = TheLoop->block_end(); bb != be; ++bb) { |
3698 | |
3699 | // Scan the instructions in the block and look for hazards. |
3700 | for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; |
3701 | ++it) { |
3702 | |
3703 | if (PHINode *Phi = dyn_cast<PHINode>(it)) { |
3704 | Type *PhiTy = Phi->getType(); |
3705 | // Check that this PHI type is allowed. |
3706 | if (!PhiTy->isIntegerTy() && |
3707 | !PhiTy->isFloatingPointTy() && |
3708 | !PhiTy->isPointerTy()) { |
3709 | emitAnalysis(Report(it) |
3710 | << "loop control flow is not understood by vectorizer"); |
3711 | DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an non-int non-pointer PHI.\n" ; } } while (0); |
3712 | return false; |
3713 | } |
3714 | |
3715 | // If this PHINode is not in the header block, then we know that we |
3716 | // can convert it to select during if-conversion. No need to check if |
3717 | // the PHIs in this block are induction or reduction variables. |
3718 | if (*bb != Header) { |
3719 | // Check that this instruction has no outside users or is an |
3720 | // identified reduction value with an outside user. |
3721 | if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) |
3722 | continue; |
3723 | emitAnalysis(Report(it) << "value could not be identified as " |
3724 | "an induction or reduction variable"); |
3725 | return false; |
3726 | } |
3727 | |
3728 | // We only allow if-converted PHIs with more than two incoming values. |
3729 | if (Phi->getNumIncomingValues() != 2) { |
3730 | emitAnalysis(Report(it) |
3731 | << "control flow not understood by vectorizer"); |
3732 | DEBUG(dbgs() << "LV: Found an invalid PHI.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an invalid PHI.\n" ; } } while (0); |
3733 | return false; |
3734 | } |
3735 | |
3736 | // This is the value coming from the preheader. |
3737 | Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); |
3738 | // Check if this is an induction variable. |
3739 | InductionKind IK = isInductionVariable(Phi); |
3740 | |
3741 | if (IK_NoInduction != IK) { |
3742 | // Get the widest type. |
3743 | if (!WidestIndTy) |
3744 | WidestIndTy = convertPointerToIntegerType(*DL, PhiTy); |
3745 | else |
3746 | WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy); |
3747 | |
3748 | // Int inductions are special because we only allow one IV. |
3749 | if (IK == IK_IntInduction) { |
3750 | // Use the phi node with the widest type as induction. Use the last |
3751 | // one if there are multiple (no good reason for doing this other |
3752 | // than it is expedient). |
3753 | if (!Induction || PhiTy == WidestIndTy) |
3754 | Induction = Phi; |
3755 | } |
3756 | |
3757 | DEBUG(dbgs() << "LV: Found an induction variable.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an induction variable.\n" ; } } while (0); |
3758 | Inductions[Phi] = InductionInfo(StartValue, IK); |
3759 | |
3760 | // Until we explicitly handle the case of an induction variable with |
3761 | // an outside loop user we have to give up vectorizing this loop. |
3762 | if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { |
3763 | emitAnalysis(Report(it) << "use of induction value outside of the " |
3764 | "loop is not handled by vectorizer"); |
3765 | return false; |
3766 | } |
3767 | |
3768 | continue; |
3769 | } |
3770 | |
3771 | if (AddReductionVar(Phi, RK_IntegerAdd)) {
3772 | DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
3773 | continue;
3774 | }
3775 | if (AddReductionVar(Phi, RK_IntegerMult)) {
3776 | DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
3777 | continue;
3778 | }
3779 | if (AddReductionVar(Phi, RK_IntegerOr)) {
3780 | DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
3781 | continue;
3782 | }
3783 | if (AddReductionVar(Phi, RK_IntegerAnd)) {
3784 | DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
3785 | continue;
3786 | }
3787 | if (AddReductionVar(Phi, RK_IntegerXor)) {
3788 | DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
3789 | continue;
3790 | }
3791 | if (AddReductionVar(Phi, RK_IntegerMinMax)) {
3792 | DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n");
3793 | continue;
3794 | }
3795 | if (AddReductionVar(Phi, RK_FloatMult)) {
3796 | DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n");
3797 | continue;
3798 | }
3799 | if (AddReductionVar(Phi, RK_FloatAdd)) {
3800 | DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n");
3801 | continue;
3802 | }
3803 | if (AddReductionVar(Phi, RK_FloatMinMax)) {
3804 | DEBUG(dbgs() << "LV: Found a float MINMAX reduction PHI."<< *Phi <<
3805 | "\n");
3806 | continue;
3807 | }
3808 | |
3809 | emitAnalysis(Report(it) << "value that could not be identified as " |
3810 | "reduction is used outside the loop"); |
3811 | DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an unidentified PHI." << *Phi <<"\n"; } } while (0); |
3812 | return false; |
3813 | }// end of PHI handling |
3814 | |
3815 | // We still don't handle functions. However, we can ignore dbg intrinsic |
3816 | // calls and we do handle certain intrinsic and libm functions. |
3817 | CallInst *CI = dyn_cast<CallInst>(it); |
3818 | if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) { |
3819 | emitAnalysis(Report(it) << "call instruction cannot be vectorized"); |
3820 | DEBUG(dbgs() << "LV: Found a call site.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a call site.\n" ; } } while (0); |
3821 | return false; |
3822 | } |
3823 | |
3824 | // Intrinsics such as powi, cttz and ctlz are legal to vectorize if the
3825 | // second argument is the same (i.e. loop invariant).
3826 | if (CI && |
3827 | hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) { |
3828 | if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) { |
3829 | emitAnalysis(Report(it) |
3830 | << "intrinsic instruction cannot be vectorized"); |
3831 | DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"; } } while (0); |
3832 | return false; |
3833 | } |
3834 | } |
3835 | |
3836 | // Check that the instruction return type is vectorizable. |
3837 | // Also, we can't vectorize extractelement instructions. |
3838 | if ((!VectorType::isValidElementType(it->getType()) && |
3839 | !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) { |
3840 | emitAnalysis(Report(it) |
3841 | << "instruction return type cannot be vectorized"); |
3842 | DEBUG(dbgs() << "LV: Found unvectorizable type.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found unvectorizable type.\n" ; } } while (0); |
3843 | return false; |
3844 | } |
3845 | |
3846 | // Check that the stored type is vectorizable. |
3847 | if (StoreInst *ST = dyn_cast<StoreInst>(it)) { |
3848 | Type *T = ST->getValueOperand()->getType(); |
3849 | if (!VectorType::isValidElementType(T)) { |
3850 | emitAnalysis(Report(ST) << "store instruction cannot be vectorized"); |
3851 | return false; |
3852 | } |
3853 | if (EnableMemAccessVersioning) |
3854 | collectStridedAcccess(ST); |
3855 | } |
3856 | |
3857 | if (EnableMemAccessVersioning) |
3858 | if (LoadInst *LI = dyn_cast<LoadInst>(it)) |
3859 | collectStridedAcccess(LI); |
3860 | |
3861 | // Reduction instructions are allowed to have exit users. |
3862 | // All other instructions must not have external users. |
3863 | if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { |
3864 | emitAnalysis(Report(it) << "value cannot be used outside the loop"); |
3865 | return false; |
3866 | } |
3867 | |
3868 | } // next instr. |
3869 | |
3870 | } |
3871 | |
3872 | if (!Induction) { |
3873 | DEBUG(dbgs() << "LV: Did not find one integer induction var.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Did not find one integer induction var.\n" ; } } while (0); |
3874 | if (Inductions.empty()) { |
3875 | emitAnalysis(Report() |
3876 | << "loop induction variable could not be identified"); |
3877 | return false; |
3878 | } |
3879 | } |
3880 | |
3881 | return true; |
3882 | } |
3883 | |
3884 | ///\brief Remove GEPs whose indices, except the last one, are loop invariant,
3885 | /// and return the induction operand of the gep pointer.
3886 | static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, |
3887 | const DataLayout *DL, Loop *Lp) { |
3888 | GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); |
3889 | if (!GEP) |
3890 | return Ptr; |
3891 | |
3892 | unsigned InductionOperand = getGEPInductionOperand(DL, GEP); |
3893 | |
3894 | // Check that all of the gep indices are uniform except for our induction |
3895 | // operand. |
3896 | for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i) |
3897 | if (i != InductionOperand && |
3898 | !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp)) |
3899 | return Ptr; |
3900 | return GEP->getOperand(InductionOperand); |
3901 | } |
3902 | |
3903 | ///\brief Look for a cast use of the passed value. |
3904 | static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) { |
3905 | Value *UniqueCast = nullptr; |
3906 | for (User *U : Ptr->users()) { |
3907 | CastInst *CI = dyn_cast<CastInst>(U); |
3908 | if (CI && CI->getType() == Ty) { |
3909 | if (!UniqueCast) |
3910 | UniqueCast = CI; |
3911 | else |
3912 | return nullptr; |
3913 | } |
3914 | } |
3915 | return UniqueCast; |
3916 | } |
3917 | |
3918 | ///\brief Get the stride of a pointer access in a loop. |
3919 | /// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a |
3920 | /// pointer to the Value, or null otherwise. |
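 | /// For example, for the C loop 'for (i = 0; i < n; ++i) a[i * stride] = x;'
 | /// the returned value is the loop-invariant 'stride' operand feeding the
 | /// address computation.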
3921 | static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, |
3922 | const DataLayout *DL, Loop *Lp) { |
3923 | const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); |
3924 | if (!PtrTy || PtrTy->isAggregateType()) |
3925 | return nullptr; |
3926 | |
3927 | // Try to remove a gep instruction to make the pointer (actually, the index
3928 | // at this point) easier to analyze. If OrigPtr is equal to Ptr we are
3929 | // analyzing the pointer, otherwise we are analyzing the index.
3930 | Value *OrigPtr = Ptr; |
3931 | |
3932 | // The size of the pointer access. |
3933 | int64_t PtrAccessSize = 1; |
3934 | |
3935 | Ptr = stripGetElementPtr(Ptr, SE, DL, Lp); |
3936 | const SCEV *V = SE->getSCEV(Ptr); |
3937 | |
3938 | if (Ptr != OrigPtr) |
3939 | // Strip off casts. |
3940 | while (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) |
3941 | V = C->getOperand(); |
3942 | |
3943 | const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V); |
3944 | if (!S) |
3945 | return nullptr; |
3946 | |
3947 | V = S->getStepRecurrence(*SE); |
3948 | if (!V) |
3949 | return nullptr; |
3950 | |
3951 | // Strip off the size of access multiplication if we are still analyzing the |
3952 | // pointer. |
3953 | if (OrigPtr == Ptr) { |
3954 | PtrAccessSize = DL->getTypeAllocSize(PtrTy->getElementType());
3955 | if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) { |
3956 | if (M->getOperand(0)->getSCEVType() != scConstant) |
3957 | return nullptr; |
3958 | |
3959 | const APInt &APStepVal = |
3960 | cast<SCEVConstant>(M->getOperand(0))->getValue()->getValue(); |
3961 | |
3962 | // Huge step value - give up. |
3963 | if (APStepVal.getBitWidth() > 64) |
3964 | return nullptr; |
3965 | |
3966 | int64_t StepVal = APStepVal.getSExtValue(); |
3967 | if (PtrAccessSize != StepVal) |
3968 | return nullptr; |
3969 | V = M->getOperand(1); |
3970 | } |
3971 | } |
3972 | |
3973 | // Strip off casts. |
3974 | Type *StripedOffRecurrenceCast = nullptr; |
3975 | if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) { |
3976 | StripedOffRecurrenceCast = C->getType(); |
3977 | V = C->getOperand(); |
3978 | } |
3979 | |
3980 | // Look for the loop invariant symbolic value. |
3981 | const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V); |
3982 | if (!U) |
3983 | return nullptr; |
3984 | |
3985 | Value *Stride = U->getValue(); |
3986 | if (!Lp->isLoopInvariant(Stride)) |
3987 | return nullptr; |
3988 | |
3989 | // If we have stripped off the recurrence cast we have to make sure that we |
3990 | // return the value that is used in this loop so that we can replace it later. |
3991 | if (StripedOffRecurrenceCast) |
3992 | Stride = getUniqueCastUse(Stride, Lp, StripedOffRecurrenceCast); |
3993 | |
3994 | return Stride; |
3995 | } |
3996 | |
3997 | void LoopVectorizationLegality::collectStridedAcccess(Value *MemAccess) { |
3998 | Value *Ptr = nullptr; |
3999 | if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess)) |
4000 | Ptr = LI->getPointerOperand(); |
4001 | else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess)) |
4002 | Ptr = SI->getPointerOperand(); |
4003 | else |
4004 | return; |
4005 | |
4006 | Value *Stride = getStrideFromPointer(Ptr, SE, DL, TheLoop); |
4007 | if (!Stride) |
4008 | return; |
4009 | |
4010 | DEBUG(dbgs() << "LV: Found a strided access that we can version")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a strided access that we can version" ; } } while (0); |
4011 | DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n"; } } while (0); |
4012 | Strides[Ptr] = Stride; |
4013 | StrideSet.insert(Stride); |
4014 | } |
4015 | |
4016 | void LoopVectorizationLegality::collectLoopUniforms() { |
4017 | // We now know that the loop is vectorizable! |
4018 | // Collect variables that will remain uniform after vectorization. |
4019 | std::vector<Value*> Worklist; |
4020 | BasicBlock *Latch = TheLoop->getLoopLatch(); |
4021 | |
4022 | // Start with the conditional branch and walk up the block. |
4023 | Worklist.push_back(Latch->getTerminator()->getOperand(0)); |
4024 | |
4025 | // Also add all consecutive pointer values; these values will be uniform |
4026 | // after vectorization (and subsequent cleanup) and, until revectorization is |
4027 | // supported, all dependencies must also be uniform. |
4028 | for (Loop::block_iterator B = TheLoop->block_begin(), |
4029 | BE = TheLoop->block_end(); B != BE; ++B) |
4030 | for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end(); |
4031 | I != IE; ++I) |
4032 | if (I->getType()->isPointerTy() && isConsecutivePtr(I)) |
4033 | Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); |
4034 | |
4035 | while (Worklist.size()) { |
4036 | Instruction *I = dyn_cast<Instruction>(Worklist.back()); |
4037 | Worklist.pop_back(); |
4038 | |
4039 | // Look at instructions inside this loop. |
4040 | // Stop when reaching PHI nodes. |
4041 | // TODO: we need to follow values all over the loop, not only in this block. |
4042 | if (!I || !TheLoop->contains(I) || isa<PHINode>(I)) |
4043 | continue; |
4044 | |
4045 | // This is a known uniform. |
4046 | Uniforms.insert(I); |
4047 | |
4048 | // Insert all operands. |
4049 | Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); |
4050 | } |
4051 | } |
4052 | |
4053 | namespace { |
4054 | /// \brief Analyses memory accesses in a loop. |
4055 | /// |
4056 | /// Checks whether run time pointer checks are needed and builds sets for data |
4057 | /// dependence checking. |
4058 | class AccessAnalysis { |
4059 | public: |
4060 | /// \brief Read or write access location. |
4061 | typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; |
4062 | typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet; |
4063 | |
4064 | /// \brief Set of potential dependent memory accesses. |
4065 | typedef EquivalenceClasses<MemAccessInfo> DepCandidates; |
4066 | |
4067 | AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) : |
4068 | DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {} |
4069 | |
4070 | /// \brief Register a load and whether it is only read from. |
4071 | void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) { |
4072 | Value *Ptr = const_cast<Value*>(Loc.Ptr); |
4073 | AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags); |
4074 | Accesses.insert(MemAccessInfo(Ptr, false)); |
4075 | if (IsReadOnly) |
4076 | ReadOnlyPtr.insert(Ptr); |
4077 | } |
4078 | |
4079 | /// \brief Register a store. |
4080 | void addStore(AliasAnalysis::Location &Loc) { |
4081 | Value *Ptr = const_cast<Value*>(Loc.Ptr); |
4082 | AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags); |
4083 | Accesses.insert(MemAccessInfo(Ptr, true)); |
4084 | } |
4085 | |
4086 | /// \brief Check whether we can check the pointers at runtime for |
4087 | /// non-intersection. |
4088 | bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck, |
4089 | unsigned &NumComparisons, ScalarEvolution *SE, |
4090 | Loop *TheLoop, ValueToValueMap &Strides, |
4091 | bool ShouldCheckStride = false); |
4092 | |
4093 | /// \brief Goes over all memory accesses, checks whether a RT check is needed |
4094 | /// and builds sets of dependent accesses. |
4095 | void buildDependenceSets() { |
4096 | processMemAccesses(); |
4097 | } |
4098 | |
4099 | bool isRTCheckNeeded() { return IsRTCheckNeeded; } |
4100 | |
4101 | bool isDependencyCheckNeeded() { return !CheckDeps.empty(); } |
4102 | void resetDepChecks() { CheckDeps.clear(); } |
4103 | |
4104 | MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; } |
4105 | |
4106 | private: |
4107 | typedef SetVector<MemAccessInfo> PtrAccessSet; |
4108 | |
4109 | /// \brief Go over all memory accesses and check whether runtime pointer
4110 | /// checks are needed, and build sets of dependency check candidates.
4111 | void processMemAccesses(); |
4112 | |
4113 | /// Set of all accesses. |
4114 | PtrAccessSet Accesses; |
4115 | |
4116 | /// Set of accesses that need a further dependence check. |
4117 | MemAccessInfoSet CheckDeps; |
4118 | |
4119 | /// Set of pointers that are read only. |
4120 | SmallPtrSet<Value*, 16> ReadOnlyPtr; |
4121 | |
4122 | const DataLayout *DL; |
4123 | |
4124 | /// An alias set tracker to partition the access set by underlying object and |
4125 | /// intrinsic property (such as TBAA metadata).
4126 | AliasSetTracker AST; |
4127 | |
4128 | /// Sets of potentially dependent accesses - members of one set share an |
4129 | /// underlying pointer. The set "CheckDeps" identifies which sets really need a
4130 | /// dependence check. |
4131 | DepCandidates &DepCands; |
4132 | |
4133 | bool IsRTCheckNeeded; |
4134 | }; |
4135 | |
4136 | } // end anonymous namespace |
4137 | |
4138 | /// \brief Check whether a pointer can participate in a runtime bounds check. |
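 | /// A pointer qualifies when its SCEV (after replacing symbolic strides) is
 | /// an affine add recurrence, e.g. {%base,+,4}<%loop>.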
4139 | static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides, |
4140 | Value *Ptr) { |
4141 | const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); |
4142 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); |
4143 | if (!AR) |
4144 | return false; |
4145 | |
4146 | return AR->isAffine(); |
4147 | } |
4148 | |
4149 | /// \brief Check the stride of the pointer and ensure that it does not wrap in |
4150 | /// the address space. |
4151 | static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr, |
4152 | const Loop *Lp, ValueToValueMap &StridesMap); |
4153 | |
4154 | bool AccessAnalysis::canCheckPtrAtRT( |
4155 | LoopVectorizationLegality::RuntimePointerCheck &RtCheck, |
4156 | unsigned &NumComparisons, ScalarEvolution *SE, Loop *TheLoop, |
4157 | ValueToValueMap &StridesMap, bool ShouldCheckStride) { |
4158 | // Find pointers with computable bounds. We are going to use this information |
4159 | // to place a runtime bound check. |
4160 | bool CanDoRT = true; |
4161 | |
4162 | bool IsDepCheckNeeded = isDependencyCheckNeeded(); |
4163 | NumComparisons = 0; |
4164 | |
4165 | // We assign a consecutive id to accesses from different alias sets.
4166 | // Accesses between different groups don't need to be checked.
4167 | unsigned ASId = 1; |
4168 | for (auto &AS : AST) { |
4169 | unsigned NumReadPtrChecks = 0; |
4170 | unsigned NumWritePtrChecks = 0; |
4171 | |
4172 | // We assign consecutive ids to accesses from different dependence sets.
4173 | // Accesses within the same set don't need a runtime check.
4174 | unsigned RunningDepId = 1; |
4175 | DenseMap<Value *, unsigned> DepSetId; |
4176 | |
4177 | for (auto A : AS) { |
4178 | Value *Ptr = A.getValue(); |
4179 | bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true)); |
4180 | MemAccessInfo Access(Ptr, IsWrite); |
4181 | |
4182 | if (IsWrite) |
4183 | ++NumWritePtrChecks; |
4184 | else |
4185 | ++NumReadPtrChecks; |
4186 | |
4187 | if (hasComputableBounds(SE, StridesMap, Ptr) && |
4188 | // When we run after a failing dependency check we have to make sure we |
4189 | // don't have wrapping pointers. |
4190 | (!ShouldCheckStride || |
4191 | isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) { |
4192 | // The id of the dependence set. |
4193 | unsigned DepId; |
4194 | |
4195 | if (IsDepCheckNeeded) { |
4196 | Value *Leader = DepCands.getLeaderValue(Access).getPointer(); |
4197 | unsigned &LeaderId = DepSetId[Leader]; |
4198 | if (!LeaderId) |
4199 | LeaderId = RunningDepId++; |
4200 | DepId = LeaderId; |
4201 | } else |
4202 | // Each access has its own dependence set. |
4203 | DepId = RunningDepId++; |
4204 | |
4205 | RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap); |
4206 | |
4207 | DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n'; } } while (0); |
4208 | } else { |
4209 | CanDoRT = false; |
4210 | } |
4211 | } |
4212 | |
4213 | if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2) |
4214 | NumComparisons += 0; // Only one dependence set. |
4215 | else { |
4216 | NumComparisons += (NumWritePtrChecks * (NumReadPtrChecks + |
4217 | NumWritePtrChecks - 1)); |
4218 | } |
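 | // E.g. an alias set with 2 writes and 3 reads in distinct dependence sets
 | // adds 2 * (3 + 2 - 1) == 8 to NumComparisons.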
4219 | |
4220 | ++ASId; |
4221 | } |
4222 | |
4223 | // If the pointers that we would use for the bounds comparison have different |
4224 | // address spaces, assume the values aren't directly comparable, so we can't |
4225 | // use them for the runtime check. We also have to assume they could |
4226 | // overlap. In the future there should be metadata for whether address spaces |
4227 | // are disjoint. |
4228 | unsigned NumPointers = RtCheck.Pointers.size(); |
4229 | for (unsigned i = 0; i < NumPointers; ++i) { |
4230 | for (unsigned j = i + 1; j < NumPointers; ++j) { |
4231 | // Only need to check pointers between two different dependency sets. |
4232 | if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j]) |
4233 | continue; |
4234 | // Only need to check pointers in the same alias set. |
4235 | if (RtCheck.AliasSetId[i] != RtCheck.AliasSetId[j]) |
4236 | continue; |
4237 | |
4238 | Value *PtrI = RtCheck.Pointers[i]; |
4239 | Value *PtrJ = RtCheck.Pointers[j]; |
4240 | |
4241 | unsigned ASi = PtrI->getType()->getPointerAddressSpace(); |
4242 | unsigned ASj = PtrJ->getType()->getPointerAddressSpace(); |
4243 | if (ASi != ASj) { |
4244 | DEBUG(dbgs() << "LV: Runtime check would require comparison between"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Runtime check would require comparison between" " different address spaces\n"; } } while (0) |
4245 | " different address spaces\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Runtime check would require comparison between" " different address spaces\n"; } } while (0); |
4246 | return false; |
4247 | } |
4248 | } |
4249 | } |
4250 | |
4251 | return CanDoRT; |
4252 | } |
4253 | |
4254 | void AccessAnalysis::processMemAccesses() { |
4255 | // We process the set twice: first we process read-write pointers, last we |
4256 | // process read-only pointers. This allows us to skip dependence tests for |
4257 | // read-only pointers. |
4258 | |
4259 | DEBUG(dbgs() << "LV: Processing memory accesses...\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Processing memory accesses...\n" ; } } while (0); |
4260 | DEBUG(dbgs() << " AST: "; AST.dump())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << " AST: "; AST.dump(); } } while (0); |
4261 | DEBUG(dbgs() << "LV: Accesses:\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Accesses:\n"; } } while (0); |
4262 | DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0) |
4263 | for (auto A : Accesses)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0) |
4264 | dbgs() << "\t" << *A.getPointer() << " (" <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0) |
4265 | (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ?do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0) |
4266 | "read-only" : "read")) << ")\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0) |
4267 | })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0); |
4268 | |
4269 | // The AliasSetTracker has nicely partitioned our pointers by metadata |
4270 | // compatibility and potential for underlying-object overlap. As a result, we |
4271 | // only need to check for potential pointer dependencies within each alias |
4272 | // set. |
4273 | for (auto &AS : AST) { |
4274 | // Note that both the alias-set tracker and the alias sets themselves use
4275 | // linked lists internally and so the iteration order here is deterministic |
4276 | // (matching the original instruction order within each set). |
4277 | |
4278 | bool SetHasWrite = false; |
4279 | |
4280 | // Map of pointers to last access encountered. |
4281 | typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap; |
4282 | UnderlyingObjToAccessMap ObjToLastAccess; |
4283 | |
4284 | // Set of accesses to check after all writes have been processed.
4285 | PtrAccessSet DeferredAccesses; |
4286 | |
4287 | // Iterate over each alias set twice, once to process read/write pointers, |
4288 | // and then to process read-only pointers. |
4289 | for (int SetIteration = 0; SetIteration < 2; ++SetIteration) { |
4290 | bool UseDeferred = SetIteration > 0; |
4291 | PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses; |
4292 | |
4293 | for (auto A : AS) { |
4294 | Value *Ptr = A.getValue(); |
4295 | bool IsWrite = S.count(MemAccessInfo(Ptr, true)); |
4296 | |
4297 | // If we're using the deferred access set, then it contains only reads. |
4298 | bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite; |
4299 | if (UseDeferred && !IsReadOnlyPtr) |
4300 | continue; |
4301 | // Otherwise, the pointer must be in the PtrAccessSet, either as a read |
4302 | // or a write. |
4303 | assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
4304 | S.count(MemAccessInfo(Ptr, false))) &&
4305 | "Alias-set pointer not in the access set?");
4306 | |
4307 | MemAccessInfo Access(Ptr, IsWrite); |
4308 | DepCands.insert(Access); |
4309 | |
4310 | // Memorize read-only pointers for later processing and skip them in the |
4311 | // first round (they need to be checked after we have seen all write |
4312 | // pointers). Note: we also mark pointers that are not consecutive as
4313 | // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need |
4314 | // the second check for "!IsWrite". |
4315 | if (!UseDeferred && IsReadOnlyPtr) { |
4316 | DeferredAccesses.insert(Access); |
4317 | continue; |
4318 | } |
4319 | |
4320 | // If this is a write, check other reads and writes for conflicts. If
4321 | // this is a read, only check other writes for conflicts (but only if
4322 | // there is no other write to the ptr - this is an optimization to |
4323 | // catch "a[i] = a[i] + " without having to do a dependence check). |
4324 | if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) { |
4325 | CheckDeps.insert(Access); |
4326 | IsRTCheckNeeded = true; |
4327 | } |
4328 | |
4329 | if (IsWrite) |
4330 | SetHasWrite = true; |
4331 | |
4332 | // Create sets of pointers connected by a shared alias set and |
4333 | // underlying object. |
4334 | typedef SmallVector<Value *, 16> ValueVector; |
4335 | ValueVector TempObjects; |
4336 | GetUnderlyingObjects(Ptr, TempObjects, DL); |
4337 | for (Value *UnderlyingObj : TempObjects) { |
4338 | UnderlyingObjToAccessMap::iterator Prev = |
4339 | ObjToLastAccess.find(UnderlyingObj); |
4340 | if (Prev != ObjToLastAccess.end()) |
4341 | DepCands.unionSets(Access, Prev->second); |
4342 | |
4343 | ObjToLastAccess[UnderlyingObj] = Access; |
4344 | } |
4345 | } |
4346 | } |
4347 | } |
4348 | } |
4349 | |
4350 | namespace { |
4351 | /// \brief Checks memory dependences among accesses to the same underlying |
4352 | /// object to determine whether vectorization is legal or not (and at
4353 | /// which vectorization factor). |
4354 | /// |
4355 | /// This class works under the assumption that we already checked that memory |
4356 | /// locations with different underlying pointers are "must-not alias". |
4357 | /// We use the ScalarEvolution framework to symbolically evaluate pairs of
4358 | /// access functions. Since we currently don't restructure the loop we can rely
4359 | /// on the program order of memory accesses to determine their safety. |
4360 | /// At the moment we will only deem accesses as safe for: |
4361 | /// * A negative constant distance assuming program order. |
4362 | /// |
4363 | /// Safe: tmp = a[i + 1]; OR a[i + 1] = x; |
4364 | /// a[i] = tmp; y = a[i]; |
4365 | /// |
4366 | /// The latter case is safe because later checks guarantee that there can't
4367 | /// be a cycle through a phi node (that is, we check that "x" and "y" are not
4368 | /// the same variable: a header phi can only be an induction or a reduction, a |
4369 | /// reduction can't have a memory sink, an induction can't have a memory |
4370 | /// source). This is important and must not be violated (or we have to |
4371 | /// resort to checking for cycles through memory). |
4372 | /// |
4373 | /// * A positive constant distance assuming program order that is bigger |
4374 | /// than the biggest memory access. |
4375 | /// |
4376 | /// tmp = a[i] OR b[i] = x |
4377 | /// a[i+2] = tmp y = b[i+2]; |
4378 | /// |
4379 | /// Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively. |
4380 | /// |
4381 | /// * Zero distances and all accesses have the same size. |
4382 | /// |
4383 | class MemoryDepChecker { |
4384 | public: |
4385 | typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; |
4386 | typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet; |
4387 | |
4388 | MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L) |
4389 | : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0), |
4390 | ShouldRetryWithRuntimeCheck(false) {} |
4391 | |
4392 | /// \brief Register the location (instructions are given increasing numbers) |
4393 | /// of a write access. |
4394 | void addAccess(StoreInst *SI) { |
4395 | Value *Ptr = SI->getPointerOperand(); |
4396 | Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx); |
4397 | InstMap.push_back(SI); |
4398 | ++AccessIdx; |
4399 | } |
4400 | |
4401 | /// \brief Register the location (instructions are given increasing numbers) |
4402 | /// of a read access.
4403 | void addAccess(LoadInst *LI) { |
4404 | Value *Ptr = LI->getPointerOperand(); |
4405 | Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx); |
4406 | InstMap.push_back(LI); |
4407 | ++AccessIdx; |
4408 | } |
4409 | |
4410 | /// \brief Check whether the dependencies between the accesses are safe. |
4411 | /// |
4412 | /// Only checks sets with elements in \p CheckDeps. |
4413 | bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, |
4414 | MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides); |
4415 | |
4416 | /// \brief The maximum number of bytes of a vector register we can vectorize |
4417 | /// the accesses safely with. |
4418 | unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } |
4419 | |
4420 | /// \brief In some cases, when the dependency check fails, we can still
4421 | /// vectorize the loop with a dynamic array access check. |
4422 | bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; } |
4423 | |
4424 | private: |
4425 | ScalarEvolution *SE; |
4426 | const DataLayout *DL; |
4427 | const Loop *InnermostLoop; |
4428 | |
4429 | /// \brief Maps access locations (ptr, read/write) to program order. |
4430 | DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses; |
4431 | |
4432 | /// \brief Memory access instructions in program order. |
4433 | SmallVector<Instruction *, 16> InstMap; |
4434 | |
4435 | /// \brief The program order index to be used for the next instruction. |
4436 | unsigned AccessIdx; |
4437 | |
4438 | // We can access this many bytes in parallel safely. |
4439 | unsigned MaxSafeDepDistBytes; |
4440 | |
4441 | /// \brief If we see a non-constant dependence distance we can still try to |
4442 | /// vectorize this loop with runtime checks. |
4443 | bool ShouldRetryWithRuntimeCheck; |
4444 | |
4445 | /// \brief Check whether there is a plausible dependence between the two |
4446 | /// accesses. |
4447 | /// |
4448 | /// Access \p A must happen before \p B in program order. The two indices |
4449 | /// identify the index into the program order map. |
4450 | /// |
4451 | /// This function checks whether there is a plausible dependence (or the |
4452 | /// absence of such can't be proved) between the two accesses. If there is a |
4453 | /// plausible dependence but the dependence distance is bigger than one |
4454 | /// element access, it records this distance in \p MaxSafeDepDistBytes (if this
4455 | /// distance is smaller than any other distance encountered so far). |
4456 | /// Otherwise, this function returns true signaling a possible dependence. |
4457 | bool isDependent(const MemAccessInfo &A, unsigned AIdx, |
4458 | const MemAccessInfo &B, unsigned BIdx, |
4459 | ValueToValueMap &Strides); |
4460 | |
4461 | /// \brief Check whether the data dependence could prevent store-load |
4462 | /// forwarding. |
4463 | bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize); |
4464 | }; |
4465 | |
4466 | } // end anonymous namespace |
4467 | |
4468 | static bool isInBoundsGep(Value *Ptr) { |
4469 | if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) |
4470 | return GEP->isInBounds(); |
4471 | return false; |
4472 | } |
4473 | |
4474 | /// \brief Check whether the access through \p Ptr has a constant stride. |
4475 | static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr, |
4476 | const Loop *Lp, ValueToValueMap &StridesMap) { |
4477 | const Type *Ty = Ptr->getType(); |
4478 | assert(Ty->isPointerTy() && "Unexpected non-ptr");
4479 | |
4480 | // Make sure that the pointer does not point to aggregate types. |
4481 | const PointerType *PtrTy = cast<PointerType>(Ty); |
4482 | if (PtrTy->getElementType()->isAggregateType()) { |
4483 | DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr << "\n"; } } while (0) |
4484 | "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr << "\n"; } } while (0); |
4485 | return 0; |
4486 | } |
4487 | |
4488 | const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Ptr); |
4489 | |
4490 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); |
4491 | if (!AR) { |
4492 | DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not an AddRecExpr pointer " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0) |
4493 | << *Ptr << " SCEV: " << *PtrScev << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not an AddRecExpr pointer " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0); |
4494 | return 0; |
4495 | } |
4496 | |
4497 | // The access function must stride over the innermost loop.
4498 | if (Lp != AR->getLoop()) { |
4499 | DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not striding over innermost loop " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0) |
4500 | *Ptr << " SCEV: " << *PtrScev << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not striding over innermost loop " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0); |
4501 | } |
4502 | |
4503 | // The address calculation must not wrap. Otherwise, a dependence could be |
4504 | // inverted. |
4505 | // An inbounds getelementptr that is an AddRec with a unit stride
4506 | // cannot wrap by definition. The unit stride requirement is checked later.
4507 | // A getelementptr without an inbounds attribute and with unit stride would
4508 | // have to access the pointer value "0", which is undefined behavior in
4509 | // address space 0; therefore we can also vectorize this case.
4510 | bool IsInBoundsGEP = isInBoundsGep(Ptr); |
4511 | bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask); |
4512 | bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0; |
4513 | if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) { |
4514 | DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Pointer may wrap in the address space " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0) |
4515 | << *Ptr << " SCEV: " << *PtrScev << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Pointer may wrap in the address space " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0); |
4516 | return 0; |
4517 | } |
4518 | |
4519 | // Check the step is constant. |
4520 | const SCEV *Step = AR->getStepRecurrence(*SE); |
4521 | |
4522 | // Calculate the pointer stride and check if it is consecutive. |
4523 | const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); |
4524 | if (!C) { |
4525 | DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0) |
4526 | " SCEV: " << *PtrScev << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0); |
4527 | return 0; |
4528 | } |
4529 | |
4530 | int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType()); |
4531 | const APInt &APStepVal = C->getValue()->getValue(); |
4532 | |
4533 | // Huge step value - give up. |
4534 | if (APStepVal.getBitWidth() > 64) |
4535 | return 0; |
4536 | |
4537 | int64_t StepVal = APStepVal.getSExtValue(); |
4538 | |
4539 | // Strided access. |
4540 | int64_t Stride = StepVal / Size; |
4541 | int64_t Rem = StepVal % Size; |
4542 | if (Rem) |
4543 | return 0; |
4544 | |
4545 | // If the SCEV could wrap but we have an inbounds gep with a unit stride we |
4546 | // know we can't "wrap around the address space". In case of address space |
4547 | // zero we know that this won't happen without triggering undefined behavior. |
4548 | if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) && |
4549 | Stride != 1 && Stride != -1) |
4550 | return 0; |
4551 | |
4552 | return Stride; |
4553 | } |
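     |
     | // Illustrative sketch (not part of the original source): the stride
     | // computed above reduces to StepVal / Size with an exact-division
     | // requirement. The values in the comments are hypothetical.
     | static int64_t exampleStride(int64_t StepVal, int64_t Size) {
     |   if (StepVal % Size)    // mixed granularity: not a strided access
     |     return 0;
     |   return StepVal / Size; // e.g. a 16-byte step over double (8 bytes) -> 2
     | }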
4554 | |
4555 | bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance, |
4556 | unsigned TypeByteSize) { |
4557 | // If loads occur at a distance that is not a multiple of a feasible vector
4558 | // factor, store-load forwarding does not take place.
4559 | // Positive dependences might cause trouble because vectorizing them might
4560 | // prevent store-load forwarding, making vectorized code run a lot slower.
4561 | // a[i] = a[i-3] ^ a[i-8];
4562 | // The stores to a[i:i+1] don't align with the loads from a[i-3:i-2] and
4563 | // hence on your typical architecture store-load forwarding does not take
4564 | // place. Vectorizing in such cases does not make sense.
4565 | // Store-load forwarding distance. |
4566 | const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize; |
4567 | // Maximum vector factor. |
4568 | unsigned MaxVFWithoutSLForwardIssues = MaxVectorWidth*TypeByteSize; |
4569 | if (MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues)
4570 | MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes; |
4571 | |
4572 | for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues; |
4573 | vf *= 2) { |
4574 | if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) { |
4575 | MaxVFWithoutSLForwardIssues = (vf >>= 1);
4576 | break; |
4577 | } |
4578 | } |
4579 | |
4580 | if (MaxVFWithoutSLForwardIssues < 2*TypeByteSize) {
4581 | DEBUG(dbgs() << "LV: Distance " << Distance <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Distance " << Distance << " that could cause a store-load forwarding conflict\n" ; } } while (0) |
4582 | " that could cause a store-load forwarding conflict\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Distance " << Distance << " that could cause a store-load forwarding conflict\n" ; } } while (0); |
4583 | return true; |
4584 | } |
4585 | |
4586 | if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes && |
4587 | MaxVFWithoutSLForwardIssues != MaxVectorWidth*TypeByteSize) |
4588 | MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues; |
4589 | return false; |
4590 | } |
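     |
     | // Illustrative sketch (not part of the original source) mirroring the scan
     | // above. With hypothetical inputs Distance == 12 and TypeByteSize == 4 (the
     | // a[i] = a[i-3] ^ ... case), vf == 8 trips the test: 12 % 8 != 0 and
     | // 12 / 8 == 1 < 8 * 4, leaving no room for even a two-element vector.
     | static bool exampleForwardingConflict(unsigned Distance, unsigned TypeByteSize,
     |                                       unsigned MaxVFBytes) {
     |   const unsigned Cycles = 8 * TypeByteSize; // forwarding distance in cycles
     |   for (unsigned vf = 2 * TypeByteSize; vf <= MaxVFBytes; vf *= 2)
     |     if (Distance % vf && Distance / vf < Cycles)
     |       return (vf >> 1) < 2 * TypeByteSize; // conflict if capped below 2-wide
     |   return false;
     | }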
4591 | |
4592 | bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, |
4593 | const MemAccessInfo &B, unsigned BIdx, |
4594 | ValueToValueMap &Strides) { |
4595 | assert(AIdx < BIdx && "Must pass arguments in program order");
4596 | |
4597 | Value *APtr = A.getPointer(); |
4598 | Value *BPtr = B.getPointer(); |
4599 | bool AIsWrite = A.getInt(); |
4600 | bool BIsWrite = B.getInt(); |
4601 | |
4602 | // Two reads are independent. |
4603 | if (!AIsWrite && !BIsWrite) |
4604 | return false; |
4605 | |
4606 | // We cannot check pointers in different address spaces. |
4607 | if (APtr->getType()->getPointerAddressSpace() != |
4608 | BPtr->getType()->getPointerAddressSpace()) |
4609 | return true; |
4610 | |
4611 | const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr); |
4612 | const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr); |
4613 | |
4614 | int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop, Strides); |
4615 | int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop, Strides); |
4616 | |
4617 | const SCEV *Src = AScev; |
4618 | const SCEV *Sink = BScev; |
4619 | |
4620 | // If the induction step is negative we have to invert source and sink of the |
4621 | // dependence. |
4622 | if (StrideAPtr < 0) { |
4623 | //Src = BScev; |
4624 | //Sink = AScev; |
4625 | std::swap(APtr, BPtr); |
4626 | std::swap(Src, Sink); |
4627 | std::swap(AIsWrite, BIsWrite); |
4628 | std::swap(AIdx, BIdx); |
4629 | std::swap(StrideAPtr, StrideBPtr); |
4630 | } |
4631 | |
4632 | const SCEV *Dist = SE->getMinusSCEV(Sink, Src); |
4633 | |
4634 | DEBUG(dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sinkdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink << "(Induction step: " << StrideAPtr << ")\n"; } } while (0) |
4635 | << "(Induction step: " << StrideAPtr << ")\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink << "(Induction step: " << StrideAPtr << ")\n"; } } while (0); |
4636 | DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to " << *InstMap[BIdx] << ": " << *Dist << "\n"; } } while (0) |
4637 | << *InstMap[BIdx] << ": " << *Dist << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to " << *InstMap[BIdx] << ": " << *Dist << "\n"; } } while (0); |
4638 | |
4639 | // Need consecutive accesses. We don't want to vectorize |
4640 | // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in |
4641 | // the address space. |
4642 | if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) {
4643 | DEBUG(dbgs() << "Non-consecutive pointer access\n");
4644 | return true; |
4645 | } |
4646 | |
4647 | const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist); |
4648 | if (!C) { |
4649 | DEBUG(dbgs() << "LV: Dependence because of non-constant distance\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Dependence because of non-constant distance\n" ; } } while (0); |
4650 | ShouldRetryWithRuntimeCheck = true; |
4651 | return true; |
4652 | } |
4653 | |
4654 | Type *ATy = APtr->getType()->getPointerElementType(); |
4655 | Type *BTy = BPtr->getType()->getPointerElementType(); |
4656 | unsigned TypeByteSize = DL->getTypeAllocSize(ATy); |
4657 | |
4658 | // Negative distances are not plausible dependencies. |
4659 | const APInt &Val = C->getValue()->getValue(); |
4660 | if (Val.isNegative()) { |
4661 | bool IsTrueDataDependence = (AIsWrite && !BIsWrite); |
4662 | if (IsTrueDataDependence && |
4663 | (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) || |
4664 | ATy != BTy)) |
4665 | return true; |
4666 | |
4667 | DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Dependence is negative: NoDep\n" ; } } while (0); |
4668 | return false; |
4669 | } |
4670 | |
4671 | // Write to the same location with the same size. |
4672 | // Could be improved to assert type sizes are the same (i32 == float, etc). |
4673 | if (Val == 0) { |
4674 | if (ATy == BTy) |
4675 | return false; |
4676 | DEBUG(dbgs() << "LV: Zero dependence difference but different types\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Zero dependence difference but different types\n" ; } } while (0); |
4677 | return true; |
4678 | } |
4679 | |
4680 | assert(Val.isStrictlyPositive() && "Expect a positive value");
4681 | |
4682 | // Positive distance bigger than max vectorization factor. |
4683 | if (ATy != BTy) { |
4684 | DEBUG(dbgs() <<
4685 | "LV: ReadWrite-Write positive dependency with different types\n");
4686 | return false; |
4687 | } |
4688 | |
4689 | unsigned Distance = (unsigned) Val.getZExtValue(); |
4690 | |
4691 | // Bail out early if passed-in parameters make vectorization not feasible. |
4692 | unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1; |
4693 | unsigned ForcedUnroll = VectorizationInterleave ? VectorizationInterleave : 1; |
4694 | |
4695 | // The distance must be bigger than the size needed for a vectorized version |
4696 | // of the operation and the size of the vectorized operation must not be |
4697 | // bigger than the current maximum size.
4698 | if (Distance < 2*TypeByteSize || |
4699 | 2*TypeByteSize > MaxSafeDepDistBytes || |
4700 | Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { |
4701 | DEBUG(dbgs() << "LV: Failure because of Positive distance "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Failure because of Positive distance " << Val.getSExtValue() << '\n'; } } while (0) |
4702 | << Val.getSExtValue() << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Failure because of Positive distance " << Val.getSExtValue() << '\n'; } } while (0); |
4703 | return true; |
4704 | } |
4705 | |
4706 | MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? |
4707 | Distance : MaxSafeDepDistBytes; |
4708 | |
4709 | bool IsTrueDataDependence = (!AIsWrite && BIsWrite); |
4710 | if (IsTrueDataDependence && |
4711 | couldPreventStoreLoadForward(Distance, TypeByteSize)) |
4712 | return true; |
4713 | |
4714 | DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Positive distance " << Val.getSExtValue() << " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n'; } } while ( 0) |
4715 | " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Positive distance " << Val.getSExtValue() << " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n'; } } while ( 0); |
4716 | |
4717 | return false; |
4718 | } |
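     |
     | // Illustrative sketch (not part of the original source) of the final
     | // positive-distance test above, assuming no user-forced vectorization
     | // factor or interleave (both treated as 1). For 4-byte ints, a distance of
     | // 4 bytes (a[i+1] = a[i]) is rejected, while 32 bytes passes and later caps
     | // the safe vectorization factor at 8 elements.
     | static bool exampleDistanceUnsafe(unsigned Distance, unsigned TypeByteSize,
     |                                   unsigned MaxSafeBytes) {
     |   return Distance < 2 * TypeByteSize ||     // no room for a 2-wide vector
     |          2 * TypeByteSize > MaxSafeBytes || // already capped below 2-wide
     |          Distance < TypeByteSize * 1 * 1;   // ForcedUnroll * ForcedFactor
     | }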
4719 | |
4720 | bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, |
4721 | MemAccessInfoSet &CheckDeps, |
4722 | ValueToValueMap &Strides) { |
4723 | |
4724 | MaxSafeDepDistBytes = -1U; |
4725 | while (!CheckDeps.empty()) { |
4726 | MemAccessInfo CurAccess = *CheckDeps.begin(); |
4727 | |
4728 | // Get the relevant memory access set. |
4729 | EquivalenceClasses<MemAccessInfo>::iterator I = |
4730 | AccessSets.findValue(AccessSets.getLeaderValue(CurAccess)); |
4731 | |
4732 | // Check accesses within this set. |
4733 | EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE; |
4734 | AI = AccessSets.member_begin(I), AE = AccessSets.member_end(); |
4735 | |
4736 | // Check every access pair. |
4737 | while (AI != AE) { |
4738 | CheckDeps.erase(*AI); |
4739 | EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI); |
4740 | while (OI != AE) { |
4741 | // Check every accessing instruction pair in program order. |
4742 | for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(), |
4743 | I1E = Accesses[*AI].end(); I1 != I1E; ++I1) |
4744 | for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(), |
4745 | I2E = Accesses[*OI].end(); I2 != I2E; ++I2) { |
4746 | if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2, Strides)) |
4747 | return false; |
4748 | if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1, Strides)) |
4749 | return false; |
4750 | } |
4751 | ++OI; |
4752 | } |
4753 | ++AI;
4754 | } |
4755 | } |
4756 | return true; |
4757 | } |
4758 | |
4759 | bool LoopVectorizationLegality::canVectorizeMemory() { |
4760 | |
4761 | typedef SmallVector<Value*, 16> ValueVector; |
4762 | typedef SmallPtrSet<Value*, 16> ValueSet; |
4763 | |
4764 | // Holds the Load and Store *instructions*. |
4765 | ValueVector Loads; |
4766 | ValueVector Stores; |
4767 | |
4768 | // Holds all the different accesses in the loop. |
4769 | unsigned NumReads = 0; |
4770 | unsigned NumReadWrites = 0; |
4771 | |
4772 | PtrRtCheck.Pointers.clear(); |
4773 | PtrRtCheck.Need = false; |
4774 | |
4775 | const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); |
4776 | MemoryDepChecker DepChecker(SE, DL, TheLoop); |
4777 | |
4778 | // For each block. |
4779 | for (Loop::block_iterator bb = TheLoop->block_begin(), |
4780 | be = TheLoop->block_end(); bb != be; ++bb) { |
4781 | |
4782 | // Scan the BB and collect legal loads and stores. |
4783 | for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; |
4784 | ++it) { |
4785 | |
4786 | // If this is a load, save it. If this instruction can read from memory |
4787 | // but is not a load, then we quit. Notice that we don't handle function |
4788 | // calls that read or write. |
4789 | if (it->mayReadFromMemory()) { |
4790 | // Many math library functions read the rounding mode. We will only |
4791 | // vectorize a loop if it contains known function calls that don't set |
4792 | // the flag. Therefore, it is safe to ignore this read from memory. |
4793 | CallInst *Call = dyn_cast<CallInst>(it); |
4794 | if (Call && getIntrinsicIDForCall(Call, TLI)) |
4795 | continue; |
4796 | |
4797 | LoadInst *Ld = dyn_cast<LoadInst>(it); |
4798 | if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) { |
4799 | emitAnalysis(Report(Ld) |
4800 | << "read with atomic ordering or volatile read"); |
4801 | DEBUG(dbgs() << "LV: Found a non-simple load.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a non-simple load.\n" ; } } while (0); |
4802 | return false; |
4803 | } |
4804 | NumLoads++; |
4805 | Loads.push_back(Ld); |
4806 | DepChecker.addAccess(Ld); |
4807 | continue; |
4808 | } |
4809 | |
4810 | // Save 'store' instructions. Abort if other instructions write to memory. |
4811 | if (it->mayWriteToMemory()) { |
4812 | StoreInst *St = dyn_cast<StoreInst>(it); |
4813 | if (!St) { |
4814 | emitAnalysis(Report(it) << "instruction cannot be vectorized"); |
4815 | return false; |
4816 | } |
4817 | if (!St->isSimple() && !IsAnnotatedParallel) { |
4818 | emitAnalysis(Report(St) |
4819 | << "write with atomic ordering or volatile write"); |
4820 | DEBUG(dbgs() << "LV: Found a non-simple store.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a non-simple store.\n" ; } } while (0); |
4821 | return false; |
4822 | } |
4823 | NumStores++; |
4824 | Stores.push_back(St); |
4825 | DepChecker.addAccess(St); |
4826 | } |
4827 | } // Next instr. |
4828 | } // Next block. |
4829 | |
4830 | // Now we have two lists that hold the loads and the stores. |
4831 | // Next, we find the pointers that they use. |
4832 | |
4833 | // Check if we see any stores. If there are no stores, then we don't |
4834 | // care if the pointers are *restrict*. |
4835 | if (!Stores.size()) { |
4836 | DEBUG(dbgs() << "LV: Found a read-only loop!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a read-only loop!\n" ; } } while (0); |
4837 | return true; |
4838 | } |
4839 | |
4840 | AccessAnalysis::DepCandidates DependentAccesses; |
4841 | AccessAnalysis Accesses(DL, AA, DependentAccesses); |
4842 | |
4843 | // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects |
4844 | // multiple times on the same object. If the ptr is accessed twice, once |
4845 | // for read and once for write, it will only appear once (on the write |
4846 | // list). This is okay, since we are going to check for conflicts between |
4847 | // writes and between reads and writes, but not between reads and reads. |
4848 | ValueSet Seen; |
4849 | |
4850 | ValueVector::iterator I, IE; |
4851 | for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { |
4852 | StoreInst *ST = cast<StoreInst>(*I); |
4853 | Value* Ptr = ST->getPointerOperand(); |
4854 | |
4855 | if (isUniform(Ptr)) { |
4856 | emitAnalysis( |
4857 | Report(ST) |
4858 | << "write to a loop invariant address could not be vectorized"); |
4859 | DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We don't allow storing to uniform addresses\n" ; } } while (0); |
4860 | return false; |
4861 | } |
4862 | |
4863 | // If we did *not* see this pointer before, insert it to the read-write |
4864 | // list. At this phase it is only a 'write' list. |
4865 | if (Seen.insert(Ptr).second) { |
4866 | ++NumReadWrites; |
4867 | |
4868 | AliasAnalysis::Location Loc = AA->getLocation(ST); |
4869 | // The TBAA metadata could have a control dependency on the predication |
4870 | // condition, so we cannot rely on it when determining whether or not we |
4871 | // need runtime pointer checks. |
4872 | if (blockNeedsPredication(ST->getParent())) |
4873 | Loc.AATags.TBAA = nullptr; |
4874 | |
4875 | Accesses.addStore(Loc); |
4876 | } |
4877 | } |
4878 | |
4879 | if (IsAnnotatedParallel) { |
4880 | DEBUG(dbgs()
4881 | << "LV: A loop annotated parallel, ignore memory dependency "
4882 | << "checks.\n");
4883 | return true; |
4884 | } |
4885 | |
4886 | for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { |
4887 | LoadInst *LD = cast<LoadInst>(*I); |
4888 | Value* Ptr = LD->getPointerOperand(); |
4889 | // If we did *not* see this pointer before, insert it to the |
4890 | // read list. If we *did* see it before, then it is already in |
4891 | // the read-write list. This allows us to vectorize expressions |
4892 | // such as A[i] += x, because the address of A[i] is a read-write
4893 | // pointer. This only works if the index of A[i] is consecutive.
4894 | // If the index is unknown (for example A[B[i]]) then we may
4895 | // read a few words, modify, and write a few words, and some of the |
4896 | // words may be written to the same address. |
4897 | bool IsReadOnlyPtr = false; |
4898 | if (Seen.insert(Ptr).second || |
4899 | !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) { |
4900 | ++NumReads; |
4901 | IsReadOnlyPtr = true; |
4902 | } |
4903 | |
4904 | AliasAnalysis::Location Loc = AA->getLocation(LD); |
4905 | // The TBAA metadata could have a control dependency on the predication |
4906 | // condition, so we cannot rely on it when determining whether or not we |
4907 | // need runtime pointer checks. |
4908 | if (blockNeedsPredication(LD->getParent())) |
4909 | Loc.AATags.TBAA = nullptr; |
4910 | |
4911 | Accesses.addLoad(Loc, IsReadOnlyPtr); |
4912 | } |
4913 | |
4914 | // If we write (or read-write) to a single destination and there are no |
4915 | // other reads in this loop then it is safe to vectorize.
4916 | if (NumReadWrites == 1 && NumReads == 0) { |
4917 | DEBUG(dbgs() << "LV: Found a write-only loop!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a write-only loop!\n" ; } } while (0); |
4918 | return true; |
4919 | } |
4920 | |
4921 | // Build dependence sets and check whether we need a runtime pointer bounds |
4922 | // check. |
4923 | Accesses.buildDependenceSets(); |
4924 | bool NeedRTCheck = Accesses.isRTCheckNeeded(); |
4925 | |
4926 | // Find pointers with computable bounds. We are going to use this information |
4927 | // to place a runtime bound check. |
4928 | unsigned NumComparisons = 0; |
4929 | bool CanDoRT = false; |
4930 | if (NeedRTCheck) |
4931 | CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop, |
4932 | Strides); |
4933 | |
4934 | DEBUG(dbgs() << "LV: We need to do " << NumComparisons <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We need to do " << NumComparisons << " pointer comparisons.\n"; } } while (0) |
4935 | " pointer comparisons.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We need to do " << NumComparisons << " pointer comparisons.\n"; } } while (0); |
4936 | |
4937 | // If there is only one set of pointers among which to check dependences,
4938 | // we don't need a runtime check.
4939 | if (NumComparisons == 0 && NeedRTCheck) |
4940 | NeedRTCheck = false; |
4941 | |
4942 | // Check that we did not collect too many pointers or found an unsizeable |
4943 | // pointer. |
4944 | if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { |
4945 | PtrRtCheck.reset(); |
4946 | CanDoRT = false; |
4947 | } |
4948 | |
4949 | if (CanDoRT) { |
4950 | DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can perform a memory runtime check if needed.\n" ; } } while (0); |
4951 | } |
4952 | |
4953 | if (NeedRTCheck && !CanDoRT) { |
4954 | emitAnalysis(Report() << "cannot identify array bounds"); |
4955 | DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can't vectorize because we can't find " << "the array bounds.\n"; } } while (0) |
4956 | "the array bounds.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can't vectorize because we can't find " << "the array bounds.\n"; } } while (0); |
4957 | PtrRtCheck.reset(); |
4958 | return false; |
4959 | } |
4960 | |
4961 | PtrRtCheck.Need = NeedRTCheck; |
4962 | |
4963 | bool CanVecMem = true; |
4964 | if (Accesses.isDependencyCheckNeeded()) { |
4965 | DEBUG(dbgs() << "LV: Checking memory dependencies\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Checking memory dependencies\n" ; } } while (0); |
4966 | CanVecMem = DepChecker.areDepsSafe( |
4967 | DependentAccesses, Accesses.getDependenciesToCheck(), Strides); |
4968 | MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes(); |
4969 | |
4970 | if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) { |
4971 | DEBUG(dbgs() << "LV: Retrying with memory checks\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Retrying with memory checks\n" ; } } while (0); |
4972 | NeedRTCheck = true; |
4973 | |
4974 | // Clear the dependency checks. We assume they are not needed. |
4975 | Accesses.resetDepChecks(); |
4976 | |
4977 | PtrRtCheck.reset(); |
4978 | PtrRtCheck.Need = true; |
4979 | |
4980 | CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, |
4981 | TheLoop, Strides, true); |
4982 | // Check that we did not collect too many pointers or found an unsizeable |
4983 | // pointer. |
4984 | if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { |
4985 | if (!CanDoRT && NumComparisons > 0) |
4986 | emitAnalysis(Report() |
4987 | << "cannot check memory dependencies at runtime"); |
4988 | else |
4989 | emitAnalysis(Report() |
4990 | << NumComparisons << " exceeds limit of " |
4991 | << RuntimeMemoryCheckThreshold |
4992 | << " dependent memory operations checked at runtime"); |
4993 | DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Can't vectorize with memory checks\n" ; } } while (0); |
4994 | PtrRtCheck.reset(); |
4995 | return false; |
4996 | } |
4997 | |
4998 | CanVecMem = true; |
4999 | } |
5000 | } |
5001 | |
5002 | if (!CanVecMem) |
5003 | emitAnalysis(Report() << "unsafe dependent memory operations in loop"); |
5004 | |
5005 | DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") << " need a runtime memory check.\n"; } } while (0) |
5006 | " need a runtime memory check.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") << " need a runtime memory check.\n"; } } while (0); |
5007 | |
5008 | return CanVecMem; |
5009 | } |
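     |
     | // Illustrative sketch (not part of the original source): a loop shape that
     | // takes the runtime-check path above when 'a' and 'b' cannot be statically
     | // disambiguated. The names are hypothetical.
     | static void exampleNeedsRuntimeCheck(float *a, const float *b, int n) {
     |   for (int i = 0; i < n; ++i)
     |     a[i] = b[i] * 2.0f; // vectorized behind an emitted a/b overlap check
     | }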
5010 | |
5011 | static bool hasMultipleUsesOf(Instruction *I, |
5012 | SmallPtrSetImpl<Instruction *> &Insts) { |
5013 | unsigned NumUses = 0; |
5014 | for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) {
5015 | if (Insts.count(dyn_cast<Instruction>(*Use))) |
5016 | ++NumUses; |
5017 | if (NumUses > 1) |
5018 | return true; |
5019 | } |
5020 | |
5021 | return false; |
5022 | } |
5023 | |
5024 | static bool areAllUsesIn(Instruction *I, SmallPtrSetImpl<Instruction *> &Set) { |
5025 | for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use)
5026 | if (!Set.count(dyn_cast<Instruction>(*Use))) |
5027 | return false; |
5028 | return true; |
5029 | } |
5030 | |
5031 | bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, |
5032 | ReductionKind Kind) { |
5033 | if (Phi->getNumIncomingValues() != 2) |
5034 | return false; |
5035 | |
5036 | // Reduction variables are only found in the loop header block. |
5037 | if (Phi->getParent() != TheLoop->getHeader()) |
5038 | return false; |
5039 | |
5040 | // Obtain the reduction start value from the value that comes from the loop |
5041 | // preheader. |
5042 | Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()); |
5043 | |
5044 | // ExitInstruction is the single value which is used outside the loop. |
5045 | // We only allow for a single reduction value to be used outside the loop. |
5046 | // This includes users of the reduction variables (which form a cycle
5047 | // that ends in the phi node).
5048 | Instruction *ExitInstruction = nullptr; |
5049 | // Indicates that we found a reduction operation in our scan. |
5050 | bool FoundReduxOp = false; |
5051 | |
5052 | // We start with the PHI node and scan for all of the users of this |
5053 | // instruction. All users must be instructions that can be used as reduction |
5054 | // variables (such as ADD). We must have a single out-of-block user. The cycle |
5055 | // must include the original PHI. |
5056 | bool FoundStartPHI = false; |
5057 | |
5058 | // To recognize min/max patterns formed by an icmp/select sequence, we store
5059 | // the number of instructions we saw from the recognized min/max pattern,
5060 | // to make sure we only see exactly the two instructions.
5061 | unsigned NumCmpSelectPatternInst = 0; |
5062 | ReductionInstDesc ReduxDesc(false, nullptr); |
5063 | |
5064 | SmallPtrSet<Instruction *, 8> VisitedInsts; |
5065 | SmallVector<Instruction *, 8> Worklist; |
5066 | Worklist.push_back(Phi); |
5067 | VisitedInsts.insert(Phi); |
5068 | |
5069 | // A value in the reduction can be used: |
5070 | // - By the reduction: |
5071 | // - Reduction operation: |
5072 | // - One use of reduction value (safe). |
5073 | // - Multiple use of reduction value (not safe). |
5074 | // - PHI: |
5075 | // - All uses of the PHI must be the reduction (safe). |
5076 | // - Otherwise, not safe. |
5077 | // - By one instruction outside of the loop (safe). |
5078 | // - By further instructions outside of the loop (not safe). |
5079 | // - By an instruction that is not part of the reduction (not safe). |
5080 | // This is either: |
5081 | // * An instruction type other than PHI or the reduction operation. |
5082 | // * A PHI in the header other than the initial PHI. |
5083 | while (!Worklist.empty()) { |
5084 | Instruction *Cur = Worklist.back(); |
5085 | Worklist.pop_back(); |
5086 | |
5087 | // No Users. |
5088 | // If the instruction has no users then this is a broken chain and can't be |
5089 | // a reduction variable. |
5090 | if (Cur->use_empty()) |
5091 | return false; |
5092 | |
5093 | bool IsAPhi = isa<PHINode>(Cur); |
5094 | |
5095 | // A header PHI use other than the original PHI. |
5096 | if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent()) |
5097 | return false; |
5098 | |
5099 | // Reductions of instructions such as Div and Sub are only possible if the
5100 | // LHS is the reduction variable.
5101 | if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) && |
5102 | !isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) && |
5103 | !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0)))) |
5104 | return false; |
5105 | |
5106 | // Any reduction instruction must be of one of the allowed kinds. |
5107 | ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc); |
5108 | if (!ReduxDesc.IsReduction) |
5109 | return false; |
5110 | |
5111 | // A reduction operation must only have one use of the reduction value. |
5112 | if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax && |
5113 | hasMultipleUsesOf(Cur, VisitedInsts)) |
5114 | return false; |
5115 | |
5116 | // All inputs to a PHI node must be a reduction value. |
5117 | if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
5118 | return false; |
5119 | |
5120 | if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(Cur) || |
5121 | isa<SelectInst>(Cur))) |
5122 | ++NumCmpSelectPatternInst; |
5123 | if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) || |
5124 | isa<SelectInst>(Cur))) |
5125 | ++NumCmpSelectPatternInst; |
5126 | |
5127 | // Check whether we found a reduction operator. |
5128 | FoundReduxOp |= !IsAPhi; |
5129 | |
5130 | // Process users of current instruction. Push non-PHI nodes after PHI nodes |
5131 | // onto the stack. This way we are going to have seen all inputs to PHI |
5132 | // nodes once we get to them. |
5133 | SmallVector<Instruction *, 8> NonPHIs; |
5134 | SmallVector<Instruction *, 8> PHIs; |
5135 | for (User *U : Cur->users()) { |
5136 | Instruction *UI = cast<Instruction>(U); |
5137 | |
5138 | // Check if we found the exit user. |
5139 | BasicBlock *Parent = UI->getParent(); |
5140 | if (!TheLoop->contains(Parent)) { |
5141 | // Exit if you find multiple outside users or if the header phi node is |
5142 | // being used. In this case the user uses the value of the previous |
5143 | // iteration, in which case we would lose "VF-1" iterations of the
5144 | // reduction operation if we vectorize. |
5145 | if (ExitInstruction != nullptr || Cur == Phi) |
5146 | return false; |
5147 | |
5148 | // The instruction used by an outside user must be the last instruction |
5149 | // before we feed back to the reduction phi. Otherwise, we lose VF-1
5150 | // operations on the value. |
5151 | if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end()) |
5152 | return false; |
5153 | |
5154 | ExitInstruction = Cur; |
5155 | continue; |
5156 | } |
5157 | |
5158 | // Process instructions only once (termination). Each reduction cycle |
5159 | // value must only be used once, except by phi nodes and min/max |
5160 | // reductions which are represented as a cmp followed by a select. |
5161 | ReductionInstDesc IgnoredVal(false, nullptr); |
5162 | if (VisitedInsts.insert(UI).second) { |
5163 | if (isa<PHINode>(UI)) |
5164 | PHIs.push_back(UI); |
5165 | else |
5166 | NonPHIs.push_back(UI); |
5167 | } else if (!isa<PHINode>(UI) && |
5168 | ((!isa<FCmpInst>(UI) && |
5169 | !isa<ICmpInst>(UI) && |
5170 | !isa<SelectInst>(UI)) || |
5171 | !isMinMaxSelectCmpPattern(UI, IgnoredVal).IsReduction)) |
5172 | return false; |
5173 | |
5174 | // Remember that we completed the cycle. |
5175 | if (UI == Phi) |
5176 | FoundStartPHI = true; |
5177 | } |
5178 | Worklist.append(PHIs.begin(), PHIs.end()); |
5179 | Worklist.append(NonPHIs.begin(), NonPHIs.end()); |
5180 | } |
5181 | |
5182 | // This means we have seen one but not the other instruction of the |
5183 | // pattern or more than just a select and cmp. |
5184 | if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) && |
5185 | NumCmpSelectPatternInst != 2) |
5186 | return false; |
5187 | |
5188 | if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) |
5189 | return false; |
5190 | |
5191 | // We found a reduction var if we have reached the original phi node and we |
5192 | // only have a single instruction with out-of-loop users. |
5193 | |
5194 | // This instruction is allowed to have out-of-loop users. |
5195 | AllowedExit.insert(ExitInstruction); |
5196 | |
5197 | // Save the description of this reduction variable. |
5198 | ReductionDescriptor RD(RdxStart, ExitInstruction, Kind, |
5199 | ReduxDesc.MinMaxKind); |
5200 | Reductions[Phi] = RD; |
5201 | // We've ended the cycle. This is a reduction variable if we have an |
5202 | // outside user and it has a binary op. |
5203 | |
5204 | return true; |
5205 | } |
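     |
     | // Illustrative sketch (not part of the original source): a loop whose header
     | // phi the search above classifies as an RK_IntegerAdd reduction. The names
     | // are hypothetical.
     | static int exampleAddReduction(const int *a, int n) {
     |   int Sum = 0;   // the preheader value, i.e. RdxStart
     |   for (int i = 0; i < n; ++i)
     |     Sum += a[i]; // the single reduction operation in the cycle
     |   return Sum;    // the one allowed out-of-loop user (ExitInstruction)
     | }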
5206 | |
5207 | /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction |
5208 | /// pattern corresponding to a min(X, Y) or max(X, Y). |
5209 | LoopVectorizationLegality::ReductionInstDesc |
5210 | LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, |
5211 | ReductionInstDesc &Prev) { |
5212 | |
5213 | assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) &&
5214 | "Expect a select instruction");
5215 | Instruction *Cmp = nullptr; |
5216 | SelectInst *Select = nullptr; |
5217 | |
5218 | // We must handle the select(cmp()) as a single instruction. Advance to the |
5219 | // select. |
5220 | if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) { |
5221 | if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->user_begin()))) |
5222 | return ReductionInstDesc(false, I); |
5223 | return ReductionInstDesc(Select, Prev.MinMaxKind); |
5224 | } |
5225 | |
5226 | // Only handle single use cases for now. |
5227 | if (!(Select = dyn_cast<SelectInst>(I))) |
5228 | return ReductionInstDesc(false, I); |
5229 | if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) && |
5230 | !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0)))) |
5231 | return ReductionInstDesc(false, I); |
5232 | if (!Cmp->hasOneUse()) |
5233 | return ReductionInstDesc(false, I); |
5234 | |
5235 | Value *CmpLeft; |
5236 | Value *CmpRight; |
5237 | |
5238 | // Look for a min/max pattern. |
5239 | if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5240 | return ReductionInstDesc(Select, MRK_UIntMin); |
5241 | else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5242 | return ReductionInstDesc(Select, MRK_UIntMax); |
5243 | else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5244 | return ReductionInstDesc(Select, MRK_SIntMax); |
5245 | else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5246 | return ReductionInstDesc(Select, MRK_SIntMin); |
5247 | else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5248 | return ReductionInstDesc(Select, MRK_FloatMin); |
5249 | else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5250 | return ReductionInstDesc(Select, MRK_FloatMax); |
5251 | else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5252 | return ReductionInstDesc(Select, MRK_FloatMin); |
5253 | else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5254 | return ReductionInstDesc(Select, MRK_FloatMax); |
5255 | |
5256 | return ReductionInstDesc(false, I); |
5257 | } |
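     |
     | // Illustrative sketch (not part of the original source): source that lowers
     | // to the icmp/select pair matched above as MRK_SIntMin. The names are
     | // hypothetical.
     | static int exampleSMinReduction(const int *a, int n) {
     |   int Min = a[0];
     |   for (int i = 1; i < n; ++i)
     |     Min = a[i] < Min ? a[i] : Min; // icmp slt feeding a single-use select
     |   return Min;
     | }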
5258 | |
5259 | LoopVectorizationLegality::ReductionInstDesc |
5260 | LoopVectorizationLegality::isReductionInstr(Instruction *I, |
5261 | ReductionKind Kind, |
5262 | ReductionInstDesc &Prev) { |
5263 | bool FP = I->getType()->isFloatingPointTy(); |
5264 | bool FastMath = FP && I->hasUnsafeAlgebra(); |
5265 | switch (I->getOpcode()) { |
5266 | default: |
5267 | return ReductionInstDesc(false, I); |
5268 | case Instruction::PHI: |
5269 | if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd && |
5270 | Kind != RK_FloatMinMax)) |
5271 | return ReductionInstDesc(false, I); |
5272 | return ReductionInstDesc(I, Prev.MinMaxKind); |
5273 | case Instruction::Sub: |
5274 | case Instruction::Add: |
5275 | return ReductionInstDesc(Kind == RK_IntegerAdd, I); |
5276 | case Instruction::Mul: |
5277 | return ReductionInstDesc(Kind == RK_IntegerMult, I); |
5278 | case Instruction::And: |
5279 | return ReductionInstDesc(Kind == RK_IntegerAnd, I); |
5280 | case Instruction::Or: |
5281 | return ReductionInstDesc(Kind == RK_IntegerOr, I); |
5282 | case Instruction::Xor: |
5283 | return ReductionInstDesc(Kind == RK_IntegerXor, I); |
5284 | case Instruction::FMul: |
5285 | return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I); |
5286 | case Instruction::FSub: |
5287 | case Instruction::FAdd: |
5288 | return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I); |
5289 | case Instruction::FCmp: |
5290 | case Instruction::ICmp: |
5291 | case Instruction::Select: |
5292 | if (Kind != RK_IntegerMinMax && |
5293 | (!HasFunNoNaNAttr || Kind != RK_FloatMinMax)) |
5294 | return ReductionInstDesc(false, I); |
5295 | return isMinMaxSelectCmpPattern(I, Prev); |
5296 | } |
5297 | } |
5298 | |
5299 | LoopVectorizationLegality::InductionKind |
5300 | LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { |
5301 | Type *PhiTy = Phi->getType(); |
5302 | // We only handle integer and pointer induction variables.
5303 | if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) |
5304 | return IK_NoInduction; |
5305 | |
5306 | // Check that the PHI is consecutive. |
5307 | const SCEV *PhiScev = SE->getSCEV(Phi); |
5308 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); |
5309 | if (!AR) { |
5310 | DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: PHI is not a poly recurrence.\n" ; } } while (0); |
5311 | return IK_NoInduction; |
5312 | } |
5313 | const SCEV *Step = AR->getStepRecurrence(*SE); |
5314 | |
5315 | // Integer inductions need to have a stride of one. |
5316 | if (PhiTy->isIntegerTy()) { |
5317 | if (Step->isOne()) |
5318 | return IK_IntInduction; |
5319 | if (Step->isAllOnesValue()) |
5320 | return IK_ReverseIntInduction; |
5321 | return IK_NoInduction; |
5322 | } |
5323 | |
5324 | // Calculate the pointer stride and check if it is consecutive. |
5325 | const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); |
5326 | if (!C) |
5327 | return IK_NoInduction; |
5328 | |
5329 | assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
5330 | Type *PointerElementType = PhiTy->getPointerElementType(); |
5331 | // The pointer stride cannot be determined if the pointer element type is not |
5332 | // sized. |
5333 | if (!PointerElementType->isSized()) |
5334 | return IK_NoInduction; |
5335 | |
5336 | uint64_t Size = DL->getTypeAllocSize(PointerElementType); |
5337 | if (C->getValue()->equalsInt(Size)) |
5338 | return IK_PtrInduction; |
5339 | else if (C->getValue()->equalsInt(0 - Size)) |
5340 | return IK_ReversePtrInduction; |
5341 | |
5342 | return IK_NoInduction; |
5343 | } |
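     |
     | // Illustrative sketch (not part of the original source) mirroring the
     | // classification above, assuming the step recurrence is a known constant.
     | static const char *exampleClassifyInduction(int64_t Step, int64_t ElemSize,
     |                                             bool IsPointer) {
     |   if (!IsPointer) { // integer inductions need a stride of exactly +/-1
     |     if (Step == 1)  return "IK_IntInduction";
     |     if (Step == -1) return "IK_ReverseIntInduction";
     |     return "IK_NoInduction";
     |   }
     |   if (Step == ElemSize)  return "IK_PtrInduction";        // e.g. ++p
     |   if (Step == -ElemSize) return "IK_ReversePtrInduction"; // e.g. --p
     |   return "IK_NoInduction";
     | }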
5344 | |
5345 | bool LoopVectorizationLegality::isInductionVariable(const Value *V) { |
5346 | Value *In0 = const_cast<Value*>(V); |
5347 | PHINode *PN = dyn_cast_or_null<PHINode>(In0); |
5348 | if (!PN) |
5349 | return false; |
5350 | |
5351 | return Inductions.count(PN); |
5352 | } |
5353 | |
5354 | bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { |
5355 | assert(TheLoop->contains(BB) && "Unknown block used")((TheLoop->contains(BB) && "Unknown block used") ? static_cast<void> (0) : __assert_fail ("TheLoop->contains(BB) && \"Unknown block used\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn224456/lib/Transforms/Vectorize/LoopVectorize.cpp" , 5355, __PRETTY_FUNCTION__)); |
5356 | |
5357 | // Blocks that do not dominate the latch need predication. |
5358 | BasicBlock* Latch = TheLoop->getLoopLatch(); |
5359 | return !DT->dominates(BB, Latch); |
5360 | } |
5361 | |
5362 | bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, |
5363 | SmallPtrSetImpl<Value *> &SafePtrs) { |
5364 | |
5365 | for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { |
5366 | // Check that we don't have a constant expression that can trap as operand. |
5367 | for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end(); |
5368 | OI != OE; ++OI) { |
5369 | if (Constant *C = dyn_cast<Constant>(*OI)) |
5370 | if (C->canTrap()) |
5371 | return false; |
5372 | } |
5373 | // We might be able to hoist the load. |
5374 | if (it->mayReadFromMemory()) { |
5375 | LoadInst *LI = dyn_cast<LoadInst>(it); |
5376 | if (!LI) |
5377 | return false; |
5378 | if (!SafePtrs.count(LI->getPointerOperand())) { |
5379 | if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) { |
5380 | MaskedOp.insert(LI); |
5381 | continue; |
5382 | } |
5383 | return false; |
5384 | } |
5385 | } |
5386 | |
5387 | // We don't predicate stores at the moment. |
5388 | if (it->mayWriteToMemory()) { |
5389 | StoreInst *SI = dyn_cast<StoreInst>(it); |
5390 | // We only support predication of stores in basic blocks with one |
5391 | // predecessor. |
5392 | if (!SI) |
5393 | return false; |
5394 | |
5395 | bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0); |
5396 | bool isSinglePredecessor = SI->getParent()->getSinglePredecessor(); |
5397 | |
5398 | if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr || |
5399 | !isSinglePredecessor) { |
5400 | // Build a masked store if it is legal for the target, otherwise scalarize |
5401 | // the block. |
5402 | bool isLegalMaskedOp = |
5403 | isLegalMaskedStore(SI->getValueOperand()->getType(), |
5404 | SI->getPointerOperand()); |
5405 | if (isLegalMaskedOp) { |
5406 | --NumPredStores; |
5407 | MaskedOp.insert(SI); |
5408 | continue; |
5409 | } |
5410 | return false; |
5411 | } |
5412 | } |
5413 | if (it->mayThrow()) |
5414 | return false; |
5415 | |
5416 | // The instructions below can trap. |
5417 | switch (it->getOpcode()) { |
5418 | default: continue; |
5419 | case Instruction::UDiv: |
5420 | case Instruction::SDiv: |
5421 | case Instruction::URem: |
5422 | case Instruction::SRem: |
5423 | return false; |
5424 | } |
5425 | } |
5426 | |
5427 | return true; |
5428 | } |
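     |
     | // Illustrative sketch (not part of the original source): the conditional
     | // store shape the predication logic above reasons about. The names are
     | // hypothetical.
     | static void examplePredicatedStore(int *a, const int *c, int n) {
     |   for (int i = 0; i < n; ++i)
     |     if (c[i])   // this block doesn't dominate the latch: needs predication
     |       a[i] = 0; // becomes a masked store if legal for the target
     | }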
5429 | |
5430 | LoopVectorizationCostModel::VectorizationFactor |
5431 | LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { |
5432 | // Width 1 means no vectorization.
5433 | VectorizationFactor Factor = { 1U, 0U }; |
5434 | if (OptForSize && Legal->getRuntimePointerCheck()->Need) { |
5435 | emitAnalysis(Report() << "runtime pointer checks needed. Enable vectorization of this loop with '#pragma clang loop vectorize(enable)' when compiling with -Os"); |
5436 | DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n" ; } } while (0); |
5437 | return Factor; |
5438 | } |
5439 | |
5440 | if (!EnableCondStoresVectorization && Legal->NumPredStores) { |
5441 | emitAnalysis(Report() << "store that is conditionally executed prevents vectorization"); |
5442 | DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: No vectorization. There are conditional stores.\n" ; } } while (0); |
5443 | return Factor; |
5444 | } |
5445 | |
5446 | // Find the trip count. |
5447 | unsigned TC = SE->getSmallConstantTripCount(TheLoop); |
5448 | DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found trip count: " << TC << '\n'; } } while (0); |
5449 | |
5450 | unsigned WidestType = getWidestType(); |
5451 | unsigned WidestRegister = TTI.getRegisterBitWidth(true); |
5452 | unsigned MaxSafeDepDist = -1U; |
5453 | if (Legal->getMaxSafeDepDistBytes() != -1U) |
5454 | MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; |
5455 | WidestRegister = ((WidestRegister < MaxSafeDepDist) ? |
5456 | WidestRegister : MaxSafeDepDist); |
5457 | unsigned MaxVectorSize = WidestRegister / WidestType; |
5458 | DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"; } } while (0); |
5459 | DEBUG(dbgs() << "LV: The Widest register is: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: The Widest register is: " << WidestRegister << " bits.\n"; } } while (0) |
5460 | << WidestRegister << " bits.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: The Widest register is: " << WidestRegister << " bits.\n"; } } while (0); |
5461 | |
5462 | if (MaxVectorSize == 0) { |
5463 | DEBUG(dbgs() << "LV: The target has no vector registers.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: The target has no vector registers.\n" ; } } while (0); |
5464 | MaxVectorSize = 1; |
5465 | } |
5466 | |
5467 | assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
5468 | " into one vector!");
5469 | |
5470 | unsigned VF = MaxVectorSize; |
5471 | |
5472 | // If we optimize the program for size, avoid creating the tail loop. |
5473 | if (OptForSize) { |
5474 | // If we are unable to calculate the trip count then don't try to vectorize. |
5475 | if (TC < 2) { |
5476 | emitAnalysis(Report() << "unable to calculate the loop count due to complex control flow"); |
5477 | DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Aborting. A tail loop is required in Os.\n" ; } } while (0); |
5478 | return Factor; |
5479 | } |
5480 | |
5481 | // Find the maximum SIMD width that can fit within the trip count. |
5482 | VF = TC % MaxVectorSize; |
5483 | |
5484 | if (VF == 0) |
5485 | VF = MaxVectorSize; |
5486 | |
5487 | // If the trip count that we found modulo the vectorization factor is not |
5488 | // zero then we require a tail. |
5489 | if (VF < 2) { |
5490 | emitAnalysis(Report() << "cannot optimize for size and vectorize at the " |
5491 | "same time. Enable vectorization of this loop " |
5492 | "with '#pragma clang loop vectorize(enable)' " |
5493 | "when compiling with -Os"); |
5494 | DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Aborting. A tail loop is required in Os.\n" ; } } while (0); |
5495 | return Factor; |
5496 | } |
5497 | } |
5498 | |
5499 | int UserVF = Hints->getWidth(); |
5500 | if (UserVF != 0) { |
5501 | assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
5502 | DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
5503 | |
5504 | Factor.Width = UserVF; |
5505 | return Factor; |
5506 | } |
5507 | |
5508 | float Cost = expectedCost(1); |
5509 | #ifndef NDEBUG |
5510 | const float ScalarCost = Cost; |
5511 | #endif /* NDEBUG */ |
5512 | unsigned Width = 1; |
5513 | DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5514 | |
5515 | bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; |
5516 | // Ignore scalar width, because the user explicitly wants vectorization. |
5517 | if (ForceVectorization && VF > 1) { |
5518 | Width = 2; |
5519 | Cost = expectedCost(Width) / (float)Width; |
5520 | } |
5521 | |
5522 | for (unsigned i=2; i <= VF; i*=2) { |
5523 | // Notice that the vector loop needs to be executed fewer times, so
5524 | // we need to divide the cost of the vector loop by the width of
5525 | // the vector elements.
5526 | float VectorCost = expectedCost(i) / (float)i; |
5527 | DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
5528 | (int)VectorCost << ".\n");
5529 | if (VectorCost < Cost) { |
5530 | Cost = VectorCost; |
5531 | Width = i; |
5532 | } |
5533 | } |
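     | // For illustration (hypothetical costs): if the scalar loop costs 8 and
     | // expectedCost(4) returns 20, the per-lane cost is 20 / 4 = 5 < 8, so
     | // Width becomes 4; widths whose per-lane cost is not below the current
     | // best leave the selection unchanged.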
5534 | |
5535 | DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5536 | << "LV: Vectorization seems to be not beneficial, "
5537 | << "but was forced by a user.\n");
5538 | DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5539 | Factor.Width = Width; |
5540 | Factor.Cost = Width * Cost; |
5541 | return Factor; |
5542 | } |
5543 | |
5544 | unsigned LoopVectorizationCostModel::getWidestType() { |
5545 | unsigned MaxWidth = 8; |
5546 | |
5547 | // For each block. |
5548 | for (Loop::block_iterator bb = TheLoop->block_begin(), |
5549 | be = TheLoop->block_end(); bb != be; ++bb) { |
5550 | BasicBlock *BB = *bb; |
5551 | |
5552 | // For each instruction in the loop. |
5553 | for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { |
5554 | Type *T = it->getType(); |
5555 | |
5556 | // Ignore ephemeral values. |
5557 | if (EphValues.count(it)) |
5558 | continue; |
5559 | |
5560 | // Only examine Loads, Stores and PHINodes. |
5561 | if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it)) |
5562 | continue; |
5563 | |
5564 | // Examine PHI nodes that are reduction variables. |
5565 | if (PHINode *PN = dyn_cast<PHINode>(it)) |
5566 | if (!Legal->getReductionVars()->count(PN)) |
5567 | continue; |
5568 | |
5569 | // Examine the stored values. |
5570 | if (StoreInst *ST = dyn_cast<StoreInst>(it)) |
5571 | T = ST->getValueOperand()->getType(); |
5572 | |
5573 | // Ignore loaded pointer types and stored pointer types that are not |
5574 | // consecutive. However, we do want to take consecutive stores/loads of |
5575 | // pointer vectors into account. |
5576 | if (T->isPointerTy() && !isConsecutiveLoadOrStore(it)) |
5577 | continue; |
5578 | |
5579 | MaxWidth = std::max(MaxWidth, |
5580 | (unsigned)DL->getTypeSizeInBits(T->getScalarType())); |
5581 | } |
5582 | } |
5583 | |
5584 | return MaxWidth; |
5585 | } |
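     | // For illustration: a loop that loads i8 elements but stores i32 results
     | // reports a widest type of 32 bits; with a hypothetical 128-bit widest
     | // register this would bound the vectorization factor at 128 / 32 = 4
     | // lanes.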
5586 | |
5587 | unsigned |
5588 | LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, |
5589 | unsigned VF, |
5590 | unsigned LoopCost) { |
5591 | |
5592 | // -- The unroll heuristics -- |
5593 | // We unroll the loop in order to expose ILP and reduce the loop overhead. |
5594 | // There are many micro-architectural considerations that we can't predict |
5595 | // at this level. For example, frontend pressure (on decode or fetch) due to |
5596 | // code size, or the number and capabilities of the execution ports. |
5597 | // |
5598 | // We use the following heuristics to select the unroll factor: |
5599 | // 1. If the code has reductions, then we unroll in order to break the cross |
5600 | // iteration dependency. |
5601 | // 2. If the loop is really small, then we unroll in order to reduce the loop |
5602 | // overhead. |
5603 | // 3. We don't unroll if we think that we will spill registers to memory due |
5604 | // to the increased register pressure. |
5605 | |
5606 | // Use the user preference, unless 'auto' is selected. |
5607 | int UserUF = Hints->getInterleave(); |
5608 | if (UserUF != 0) |
5609 | return UserUF; |
5610 | |
5611 | // When we optimize for size, we don't unroll. |
5612 | if (OptForSize) |
5613 | return 1; |
5614 | |
5615 | // If a safe dependence distance constrained the vectorization factor, do not unroll.
5616 | if (Legal->getMaxSafeDepDistBytes() != -1U) |
5617 | return 1; |
5618 | |
5619 | // Do not unroll loops with a relatively small trip count. |
5620 | unsigned TC = SE->getSmallConstantTripCount(TheLoop); |
5621 | if (TC > 1 && TC < TinyTripCountUnrollThreshold) |
5622 | return 1; |
5623 | |
5624 | unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); |
5625 | DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters <<
5626 | " registers\n");
5627 | |
5628 | if (VF == 1) { |
5629 | if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) |
5630 | TargetNumRegisters = ForceTargetNumScalarRegs; |
5631 | } else { |
5632 | if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) |
5633 | TargetNumRegisters = ForceTargetNumVectorRegs; |
5634 | } |
5635 | |
5636 | LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage(); |
5637 | // We divide by these values below, so assume that we have at least one
5638 | // instruction that uses at least one register.
5639 | R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); |
5640 | R.NumInstructions = std::max(R.NumInstructions, 1U); |
5641 | |
5642 | // We calculate the unroll factor using the following formula. |
5643 | // Subtract the number of loop invariants from the number of available |
5644 | // registers. These registers are used by all of the unrolled instances. |
5645 | // Next, divide the remaining registers by the number of registers that is |
5646 | // required by the loop, in order to estimate how many parallel instances |
5647 | // fit without causing spills. All of this is rounded down if necessary to be |
5648 | // a power of two. We want power of two unroll factors to simplify any |
5649 | // addressing operations or alignment considerations. |
5650 | unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / |
5651 | R.MaxLocalUsers); |
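     | // For illustration (hypothetical numbers): with 16 target registers,
     | // 2 loop-invariant values, and a peak of 3 simultaneously live in-loop
     | // values, UF = PowerOf2Floor((16 - 2) / 3) = PowerOf2Floor(4) = 4.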
5652 | |
5653 | // Don't count the induction variable as unrolled. |
5654 | if (EnableIndVarRegisterHeur) |
5655 | UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / |
5656 | std::max(1U, (R.MaxLocalUsers - 1))); |
5657 | |
5658 | // Clamp the unroll factor ranges to reasonable factors. |
5659 | unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor(); |
5660 | |
5661 | // Check if the user has overridden the unroll max. |
5662 | if (VF == 1) { |
5663 | if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) |
5664 | MaxInterleaveSize = ForceTargetMaxScalarInterleaveFactor; |
5665 | } else { |
5666 | if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) |
5667 | MaxInterleaveSize = ForceTargetMaxVectorInterleaveFactor; |
5668 | } |
5669 | |
5670 | // If we did not calculate the cost for VF (because the user selected the VF) |
5671 | // then we calculate the cost of VF here. |
5672 | if (LoopCost == 0) |
5673 | LoopCost = expectedCost(VF); |
5674 | |
5675 | // Clamp the calculated UF to be between 1 and the max unroll factor
5676 | // that the target allows. |
5677 | if (UF > MaxInterleaveSize) |
5678 | UF = MaxInterleaveSize; |
5679 | else if (UF < 1) |
5680 | UF = 1; |
5681 | |
5682 | // Unroll if we vectorized this loop and there is a reduction that could |
5683 | // benefit from unrolling. |
5684 | if (VF > 1 && Legal->getReductionVars()->size()) { |
5685 | DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
5686 | return UF; |
5687 | } |
5688 | |
5689 | // Note that if we've already vectorized the loop we will have done the |
5690 | // runtime check and so unrolling won't require further checks. |
5691 | bool UnrollingRequiresRuntimePointerCheck = |
5692 | (VF == 1 && Legal->getRuntimePointerCheck()->Need); |
5693 | |
5694 | // We want to unroll small loops in order to reduce the loop overhead and |
5695 | // potentially expose ILP opportunities. |
5696 | DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5697 | if (!UnrollingRequiresRuntimePointerCheck && |
5698 | LoopCost < SmallLoopCost) { |
5699 | // We assume that the cost overhead is 1 and we use the cost model |
5700 | // to estimate the cost of the loop and unroll until the cost of the |
5701 | // loop overhead is about 5% of the cost of the loop. |
5702 | unsigned SmallUF = std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); |
5703 | |
5704 | // Unroll until store/load ports (estimated by max unroll factor) are |
5705 | // saturated. |
5706 | unsigned StoresUF = UF / (Legal->NumStores ? Legal->NumStores : 1); |
5707 | unsigned LoadsUF = UF / (Legal->NumLoads ? Legal->NumLoads : 1); |
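     | // For illustration (hypothetical numbers): with UF = 8, 2 stores, and
     | // 4 loads, StoresUF = 8 / 2 = 4 and LoadsUF = 8 / 4 = 2, so the
     | // port-saturation check below would propose a factor of 4.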
5708 | |
5709 | // If we have a scalar reduction (vector reductions are already dealt with
5710 | // by this point), we can increase the critical path length if the loop
5711 | // we're unrolling is inside another loop. Limit the factor, by default, to
5712 | // 2, so the critical path only gets increased by one reduction operation.
5713 | if (Legal->getReductionVars()->size() && |
5714 | TheLoop->getLoopDepth() > 1) { |
5715 | unsigned F = static_cast<unsigned>(MaxNestedScalarReductionUF); |
5716 | SmallUF = std::min(SmallUF, F); |
5717 | StoresUF = std::min(StoresUF, F); |
5718 | LoadsUF = std::min(LoadsUF, F); |
5719 | } |
5720 | |
5721 | if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) { |
5722 | DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n");
5723 | return std::max(StoresUF, LoadsUF); |
5724 | } |
5725 | |
5726 | DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
5727 | return SmallUF; |
5728 | } |
5729 | |
5730 | DEBUG(dbgs() << "LV: Not Unrolling.\n");
5731 | return 1; |
5732 | } |
5733 | |
5734 | LoopVectorizationCostModel::RegisterUsage |
5735 | LoopVectorizationCostModel::calculateRegisterUsage() { |
5736 | // This function calculates the register usage by measuring the highest number |
5737 | // of values that are alive at a single location. Obviously, this is a very |
5738 | // rough estimation. We scan the loop in topological order and assign a
5739 | // number to each instruction. We use RPO to ensure that defs are
5740 | // met before their users. We assume that each instruction that has in-loop |
5741 | // users starts an interval. We record every time that an in-loop value is |
5742 | // used, so we have a list of the first and last occurrences of each |
5743 | // instruction. Next, we transpose this data structure into a multi map that |
5744 | // holds the list of intervals that *end* at a specific location. This multi |
5745 | // map allows us to perform a linear search. We scan the instructions linearly |
5746 | // and record each time that a new interval starts, by placing it in a set. |
5747 | // If we find this value in the multi-map then we remove it from the set. |
5748 | // The max register usage is the maximum size of the set. |
5749 | // We also search for instructions that are defined outside the loop, but are |
5750 | // used inside the loop. We need this number separately from the max-interval |
5751 | // usage number because when we unroll, loop-invariant values do not take |
5752 | // more registers.
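     | // For illustration (hypothetical fragment): if %a and %b are loaded
     | // early in the body and only consumed by a later add, both intervals
     | // stay open across the instructions in between, so MaxLocalUsers is at
     | // least 2; a value defined before the loop but used inside it is
     | // counted in LoopInvariantRegs instead.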
5753 | LoopBlocksDFS DFS(TheLoop); |
5754 | DFS.perform(LI); |
5755 | |
5756 | RegisterUsage R; |
5757 | R.NumInstructions = 0; |
5758 | |
5759 | // Each 'key' in the map opens a new interval. The values |
5760 | // of the map are the index of the 'last seen' usage of the |
5761 | // instruction that is the key. |
5762 | typedef DenseMap<Instruction*, unsigned> IntervalMap; |
5763 | // Maps an index to its instruction.
5764 | DenseMap<unsigned, Instruction*> IdxToInstr; |
5765 | // Marks the end of each interval. |
5766 | IntervalMap EndPoint; |
5767 | // Saves the set of instructions that have in-loop users.
5768 | SmallSet<Instruction*, 8> Ends; |
5769 | // Saves the list of values that are used in the loop but are |
5770 | // defined outside the loop, such as arguments and constants. |
5771 | SmallPtrSet<Value*, 8> LoopInvariants; |
5772 | |
5773 | unsigned Index = 0; |
5774 | for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), |
5775 | be = DFS.endRPO(); bb != be; ++bb) { |
5776 | R.NumInstructions += (*bb)->size(); |
5777 | for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; |
5778 | ++it) { |
5779 | Instruction *I = it; |
5780 | IdxToInstr[Index++] = I; |
5781 | |
5782 | // Save the end location of each USE. |
5783 | for (unsigned i = 0; i < I->getNumOperands(); ++i) { |
5784 | Value *U = I->getOperand(i); |
5785 | Instruction *Instr = dyn_cast<Instruction>(U); |
5786 | |
5787 | // Ignore non-instruction values such as arguments, constants, etc. |
5788 | if (!Instr) continue; |
5789 | |
5790 | // If this instruction is outside the loop then record it and continue. |
5791 | if (!TheLoop->contains(Instr)) { |
5792 | LoopInvariants.insert(Instr); |
5793 | continue; |
5794 | } |
5795 | |
5796 | // Overwrite previous end points. |
5797 | EndPoint[Instr] = Index; |
5798 | Ends.insert(Instr); |
5799 | } |
5800 | } |
5801 | } |
5802 | |
5803 | // Saves the list of intervals that end with the index in 'key'. |
5804 | typedef SmallVector<Instruction*, 2> InstrList; |
5805 | DenseMap<unsigned, InstrList> TransposeEnds; |
5806 | |
5807 | // Transpose the EndPoints to a list of values that end at each index. |
5808 | for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end(); |
5809 | it != e; ++it) |
5810 | TransposeEnds[it->second].push_back(it->first); |
5811 | |
5812 | SmallSet<Instruction*, 8> OpenIntervals; |
5813 | unsigned MaxUsage = 0; |
5814 | |
5815 | |
5816 | DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5817 | for (unsigned int i = 0; i < Index; ++i) { |
5818 | Instruction *I = IdxToInstr[i]; |
5819 | // Ignore instructions that are never used within the loop. |
5820 | if (!Ends.count(I)) continue; |
5821 | |
5822 | // Ignore ephemeral values. |
5823 | if (EphValues.count(I)) |
5824 | continue; |
5825 | |
5826 | // Remove all of the instructions that end at this location. |
5827 | InstrList &List = TransposeEnds[i]; |
5828 | for (unsigned int j=0, e = List.size(); j < e; ++j) |
5829 | OpenIntervals.erase(List[j]); |
5830 | |
5831 | // Count the number of live intervals.
5832 | MaxUsage = std::max(MaxUsage, OpenIntervals.size()); |
5833 | |
5834 | DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
5835 | OpenIntervals.size() << '\n');
5836 | |
5837 | // Add the current instruction to the list of open intervals. |
5838 | OpenIntervals.insert(I); |
5839 | } |
5840 | |
5841 | unsigned Invariant = LoopInvariants.size(); |
5842 | DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n');
5843 | DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
5844 | DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');
5845 | |
5846 | R.LoopInvariantRegs = Invariant; |
5847 | R.MaxLocalUsers = MaxUsage; |
5848 | return R; |
5849 | } |
5850 | |
5851 | unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { |
5852 | unsigned Cost = 0; |
5853 | |
5854 | // For each block. |
5855 | for (Loop::block_iterator bb = TheLoop->block_begin(), |
5856 | be = TheLoop->block_end(); bb != be; ++bb) { |
5857 | unsigned BlockCost = 0; |
5858 | BasicBlock *BB = *bb; |
5859 | |
5860 | // For each instruction in the old loop. |
5861 | for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { |
5862 | // Skip dbg intrinsics. |
5863 | if (isa<DbgInfoIntrinsic>(it)) |
5864 | continue; |
5865 | |
5866 | // Ignore ephemeral values. |
5867 | if (EphValues.count(it)) |
5868 | continue; |
5869 | |
5870 | unsigned C = getInstructionCost(it, VF); |
5871 | |
5872 | // Check if we should override the cost. |
5873 | if (ForceTargetInstructionCost.getNumOccurrences() > 0) |
5874 | C = ForceTargetInstructionCost; |
5875 | |
5876 | BlockCost += C; |
5877 | DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
5878 | VF << " For instruction: " << *it << '\n');
5879 | } |
5880 | |
5881 | // We assume that if-converted blocks have a 50% chance of being executed.
5882 | // When the code is scalar, some of the blocks are avoided due to control
5883 | // flow. When the code is vectorized, we execute all code paths.
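     | // For illustration: a predicated block whose instructions sum to a cost
     | // of 10 contributes 10 / 2 = 5 to the scalar (VF == 1) estimate, while
     | // the vectorized estimate charges the full 10.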
5884 | if (VF == 1 && Legal->blockNeedsPredication(*bb)) |
5885 | BlockCost /= 2; |
5886 | |
5887 | Cost += BlockCost; |
5888 | } |
5889 | |
5890 | return Cost; |
5891 | } |
5892 | |
5893 | /// \brief Check whether the address computation for a non-consecutive memory |
5894 | /// access looks like an unlikely candidate for being merged into the indexing |
5895 | /// mode. |
5896 | /// |
5897 | /// We look for a GEP which has one index that is an induction variable and all |
5898 | /// other indices are loop invariant. If the stride of this access is also |
5899 | /// within a small bound we decide that this address computation can likely be |
5900 | /// merged into the addressing mode. |
5901 | /// In all other cases, we identify the address computation as complex. |
5902 | static bool isLikelyComplexAddressComputation(Value *Ptr, |
5903 | LoopVectorizationLegality *Legal, |
5904 | ScalarEvolution *SE, |
5905 | const Loop *TheLoop) { |
5906 | GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); |
5907 | if (!Gep) |
5908 | return true; |
5909 | |
5910 | // We are looking for a gep with all loop invariant indices except for one |
5911 | // which should be an induction variable. |
5912 | unsigned NumOperands = Gep->getNumOperands(); |
5913 | for (unsigned i = 1; i < NumOperands; ++i) { |
5914 | Value *Opd = Gep->getOperand(i); |
5915 | if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && |
5916 | !Legal->isInductionVariable(Opd)) |
5917 | return true; |
5918 | } |
5919 | |
5920 | // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step |
5921 | // can likely be merged into the address computation. |
5922 | unsigned MaxMergeDistance = 64; |
5923 | |
5924 | const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr)); |
5925 | if (!AddRec) |
5926 | return true; |
5927 | |
5928 | // Check the step is constant. |
5929 | const SCEV *Step = AddRec->getStepRecurrence(*SE); |
5930 | // Calculate the pointer stride and check if it is consecutive. |
5931 | const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); |
5932 | if (!C) |
5933 | return true; |
5934 | |
5935 | const APInt &APStepVal = C->getValue()->getValue(); |
5936 | |
5937 | // Huge step value - give up. |
5938 | if (APStepVal.getBitWidth() > 64) |
5939 | return true; |
5940 | |
5941 | int64_t StepVal = APStepVal.getSExtValue(); |
5942 | |
5943 | return StepVal > MaxMergeDistance; |
5944 | } |
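     | // For illustration (hypothetical IR): an access such as
     | //   %gep = getelementptr i32, i32* %A, i64 %iv
     | // advances by a small constant step (4 bytes) and is likely folded into
     | // the addressing mode, whereas a non-affine pointer or a step larger
     | // than MaxMergeDistance (64) is reported as a complex computation.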
5945 | |
5946 | static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { |
5947 | if (Legal->hasStride(I->getOperand(0)) || Legal->hasStride(I->getOperand(1))) |
5948 | return true; |
5949 | return false; |
5950 | } |
5951 | |
5952 | unsigned |
5953 | LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { |
5954 | // If we know that this instruction will remain uniform, check the cost of |
5955 | // the scalar version. |
5956 | if (Legal->isUniformAfterVectorization(I)) |
5957 | VF = 1; |
5958 | |
5959 | Type *RetTy = I->getType(); |
5960 | Type *VectorTy = ToVectorTy(RetTy, VF); |
5961 | |
5962 | // TODO: We need to estimate the cost of intrinsic calls. |
5963 | switch (I->getOpcode()) { |
5964 | case Instruction::GetElementPtr: |
5965 | // We mark this instruction as zero-cost because the cost of GEPs in |
5966 | // vectorized code depends on whether the corresponding memory instruction |
5967 | // is scalarized or not. Therefore, we handle GEPs with the memory |
5968 | // instruction cost. |
5969 | return 0; |
5970 | case Instruction::Br: { |
5971 | return TTI.getCFInstrCost(I->getOpcode()); |
5972 | } |
5973 | case Instruction::PHI: |
5974 | // TODO: IF-converted IFs become selects.
5975 | return 0; |
5976 | case Instruction::Add: |
5977 | case Instruction::FAdd: |
5978 | case Instruction::Sub: |
5979 | case Instruction::FSub: |
5980 | case Instruction::Mul: |
5981 | case Instruction::FMul: |
5982 | case Instruction::UDiv: |
5983 | case Instruction::SDiv: |
5984 | case Instruction::FDiv: |
5985 | case Instruction::URem: |
5986 | case Instruction::SRem: |
5987 | case Instruction::FRem: |
5988 | case Instruction::Shl: |
5989 | case Instruction::LShr: |
5990 | case Instruction::AShr: |
5991 | case Instruction::And: |
5992 | case Instruction::Or: |
5993 | case Instruction::Xor: { |
5994 | // Since we will replace the stride by 1, the multiplication should go away.
5995 | if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) |
5996 | return 0; |
5997 | // Certain instructions can be cheaper to vectorize if they have a constant |
5998 | // second vector operand. One example of this are shifts on x86. |
5999 | TargetTransformInfo::OperandValueKind Op1VK = |
6000 | TargetTransformInfo::OK_AnyValue; |
6001 | TargetTransformInfo::OperandValueKind Op2VK = |
6002 | TargetTransformInfo::OK_AnyValue; |
6003 | TargetTransformInfo::OperandValueProperties Op1VP = |
6004 | TargetTransformInfo::OP_None; |
6005 | TargetTransformInfo::OperandValueProperties Op2VP = |
6006 | TargetTransformInfo::OP_None; |
6007 | Value *Op2 = I->getOperand(1); |
6008 | |
6009 | // Check for a splat of a constant or for a non-uniform vector of constants.
6010 | if (isa<ConstantInt>(Op2)) { |
6011 | ConstantInt *CInt = cast<ConstantInt>(Op2); |
6012 | if (CInt && CInt->getValue().isPowerOf2()) |
6013 | Op2VP = TargetTransformInfo::OP_PowerOf2; |
6014 | Op2VK = TargetTransformInfo::OK_UniformConstantValue; |
6015 | } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) { |
6016 | Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; |
6017 | Constant *SplatValue = cast<Constant>(Op2)->getSplatValue(); |
6018 | if (SplatValue) { |
6019 | ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue); |
6020 | if (CInt && CInt->getValue().isPowerOf2()) |
6021 | Op2VP = TargetTransformInfo::OP_PowerOf2; |
6022 | Op2VK = TargetTransformInfo::OK_UniformConstantValue; |
6023 | } |
6024 | } |
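     | // For illustration: a shift whose second operand is a splat of the
     | // constant 4 (a power of two) is classified as OK_UniformConstantValue
     | // with OP_PowerOf2, allowing targets such as x86 to report a cheaper
     | // cost than for a variable shift amount.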
6025 | |
6026 | return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK, |
6027 | Op1VP, Op2VP); |
6028 | } |
6029 | case Instruction::Select: { |
6030 | SelectInst *SI = cast<SelectInst>(I); |
6031 | const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); |
6032 | bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); |
6033 | Type *CondTy = SI->getCondition()->getType(); |
6034 | if (!ScalarCond) |
6035 | CondTy = VectorType::get(CondTy, VF); |
6036 | |
6037 | return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); |
6038 | } |
6039 | case Instruction::ICmp: |
6040 | case Instruction::FCmp: { |
6041 | Type *ValTy = I->getOperand(0)->getType(); |
6042 | VectorTy = ToVectorTy(ValTy, VF); |
6043 | return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); |
6044 | } |
6045 | case Instruction::Store: |
6046 | case Instruction::Load: { |
6047 | StoreInst *SI = dyn_cast<StoreInst>(I); |
6048 | LoadInst *LI = dyn_cast<LoadInst>(I); |
6049 | Type *ValTy = (SI ? SI->getValueOperand()->getType() : |
6050 | LI->getType()); |
6051 | VectorTy = ToVectorTy(ValTy, VF); |
6052 | |
6053 | unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment(); |
6054 | unsigned AS = SI ? SI->getPointerAddressSpace() : |
6055 | LI->getPointerAddressSpace(); |
6056 | Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand(); |
6057 | // We add the cost of address computation here instead of with the gep
6058 | // instruction because only here do we know whether the operation is
6059 | // scalarized.
6060 | if (VF == 1) |
6061 | return TTI.getAddressComputationCost(VectorTy) + |
6062 | TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); |
6063 | |
6064 | // Scalarized loads/stores. |
6065 | int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); |
6066 | bool Reverse = ConsecutiveStride < 0; |
6067 | unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy); |
6068 | unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF; |
6069 | if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) { |
6070 | bool IsComplexComputation = |
6071 | isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop); |
6072 | unsigned Cost = 0; |
6073 | // The cost of extracting from the value vector and pointer vector. |
6074 | Type *PtrTy = ToVectorTy(Ptr->getType(), VF); |
6075 | for (unsigned i = 0; i < VF; ++i) { |
6076 | // The cost of extracting the pointer operand. |
6077 | Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i); |
6078 | // In case of STORE, the cost of ExtractElement from the vector. |
6079 | // In case of LOAD, the cost of InsertElement into the returned |
6080 | // vector. |
6081 | Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement : |
6082 | Instruction::InsertElement, |
6083 | VectorTy, i); |
6084 | } |
6085 | |
6086 | // The cost of the scalar loads/stores. |
6087 | Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation); |
6088 | Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), |
6089 | Alignment, AS); |
6090 | return Cost; |
6091 | } |
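     | // For illustration (hypothetical unit costs): scalarizing a load at
     | // VF = 4 with extract/insert costs of 1 pays 4 * (1 + 1) = 8 for moving
     | // elements, plus 4 scalar address computations and 4 scalar memory
     | // operations, which is why the wide (consecutive) path below is
     | // strongly preferred.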
6092 | |
6093 | // Wide load/stores. |
6094 | unsigned Cost = TTI.getAddressComputationCost(VectorTy); |
6095 | Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); |
6096 | |
6097 | if (Reverse) |
6098 | Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, |
6099 | VectorTy, 0); |
6100 | return Cost; |
6101 | } |
6102 | case Instruction::ZExt: |
6103 | case Instruction::SExt: |
6104 | case Instruction::FPToUI: |
6105 | case Instruction::FPToSI: |
6106 | case Instruction::FPExt: |
6107 | case Instruction::PtrToInt: |
6108 | case Instruction::IntToPtr: |
6109 | case Instruction::SIToFP: |
6110 | case Instruction::UIToFP: |
6111 | case Instruction::Trunc: |
6112 | case Instruction::FPTrunc: |
6113 | case Instruction::BitCast: { |
6114 | // We optimize the truncation of induction variables.
6115 | // The cost of these is the same as the scalar operation. |
6116 | if (I->getOpcode() == Instruction::Trunc && |
6117 | Legal->isInductionVariable(I->getOperand(0))) |
6118 | return TTI.getCastInstrCost(I->getOpcode(), I->getType(), |
6119 | I->getOperand(0)->getType()); |
6120 | |
6121 | Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); |
6122 | return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); |
6123 | } |
6124 | case Instruction::Call: { |
6125 | CallInst *CI = cast<CallInst>(I); |
6126 | Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); |
6127 | assert(ID && "Not an intrinsic call!");
6128 | Type *RetTy = ToVectorTy(CI->getType(), VF); |
6129 | SmallVector<Type*, 4> Tys; |
6130 | for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) |
6131 | Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF)); |
6132 | return TTI.getIntrinsicInstrCost(ID, RetTy, Tys); |
6133 | } |
6134 | default: { |
6135 | // We are scalarizing the instruction. Return the cost of the scalar |
6136 | // instruction, plus the cost of insert and extract into vector |
6137 | // elements, times the vector width. |
6138 | unsigned Cost = 0; |
6139 | |
6140 | if (!RetTy->isVoidTy() && VF != 1) { |
6141 | unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement, |
6142 | VectorTy); |
6143 | unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement, |
6144 | VectorTy); |
6145 | |
6146 | // The cost of inserting the results plus extracting each one of the |
6147 | // operands. |
6148 | Cost += VF * (InsCost + ExtCost * I->getNumOperands()); |
6149 | } |
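     | // For illustration (hypothetical unit costs): an unknown two-operand
     | // instruction at VF = 4 with InsCost = ExtCost = 1 adds
     | // 4 * (1 + 1 * 2) = 12, in addition to VF times the cost assumed for
     | // the unknown operation itself (modeled as a 'mul').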
6150 | |
6151 | // The cost of executing VF copies of the scalar instruction. This opcode |
6152 | // is unknown. Assume that it is the same as 'mul'. |
6153 | Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy); |
6154 | return Cost; |
6155 | } |
6156 | }// end of switch. |
6157 | } |
6158 | |
6159 | Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { |
6160 | if (Scalar->isVoidTy() || VF == 1) |
6161 | return Scalar; |
6162 | return VectorType::get(Scalar, VF); |
6163 | } |
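     | // For illustration: ToVectorTy(i32, 4) yields <4 x i32>, while
     | // ToVectorTy(i32, 1) and ToVectorTy(void, VF) return the type unchanged.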
6164 | |
6165 | char LoopVectorize::ID = 0; |
6166 | static const char lv_name[] = "Loop Vectorization"; |
6167 | INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6168 | INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
6169 | INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
6170 | INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
6171 | INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)
6172 | INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6173 | INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
6174 | INITIALIZE_PASS_DEPENDENCY(LCSSA)
6175 | INITIALIZE_PASS_DEPENDENCY(LoopInfo)
6176 | INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
6177 | INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6178 | |
6179 | namespace llvm { |
6180 | Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) { |
6181 | return new LoopVectorize(NoUnrolling, AlwaysVectorize); |
6182 | } |
6183 | } |
6184 | |
6185 | bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { |
6186 | // Check for a store. |
6187 | if (StoreInst *ST = dyn_cast<StoreInst>(Inst)) |
6188 | return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0; |
6189 | |
6190 | // Check for a load. |
6191 | if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) |
6192 | return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0; |
6193 | |
6194 | return false; |
6195 | } |
6196 | |
6197 | |
6198 | void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, |
6199 | bool IfPredicateStore) { |
6200 | assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
6201 | // Holds vector parameters or scalars, in case of uniform vals. |
6202 | SmallVector<VectorParts, 4> Params; |
6203 | |
6204 | setDebugLocFromInst(Builder, Instr); |
6205 | |
6206 | // Find all of the vectorized parameters. |
6207 | for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { |
6208 | Value *SrcOp = Instr->getOperand(op); |
6209 | |
6210 | // If we are accessing the old induction variable, use the new one. |
6211 | if (SrcOp == OldInduction) { |
6212 | Params.push_back(getVectorValue(SrcOp)); |
6213 | continue; |
6214 | } |
6215 | |
6216 | // Try using previously calculated values. |
6217 | Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); |
6218 | |
6219 | // If the src is an instruction that appeared earlier in the basic block |
6220 | // then it should already be vectorized. |
6221 | if (SrcInst && OrigLoop->contains(SrcInst)) { |
6222 | assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
6223 | // The parameter is a vector value from earlier. |
6224 | Params.push_back(WidenMap.get(SrcInst)); |
6225 | } else { |
6226 | // The parameter is a scalar from outside the loop. Maybe even a constant. |
6227 | VectorParts Scalars; |
6228 | Scalars.append(UF, SrcOp); |
6229 | Params.push_back(Scalars); |
6230 | } |
6231 | } |
6232 | |
6233 | assert(Params.size() == Instr->getNumOperands() &&
6234 | "Invalid number of operands");
6235 | |
6236 | // Does this instruction return a value?
6237 | bool IsVoidRetTy = Instr->getType()->isVoidTy(); |
6238 | |
6239 | Value *UndefVec = IsVoidRetTy ? nullptr : |
6240 | UndefValue::get(Instr->getType()); |
6241 | // Create a new entry in the WidenMap and initialize it to Undef or Null. |
6242 | VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); |
6243 | |
6244 | Instruction *InsertPt = Builder.GetInsertPoint(); |
6245 | BasicBlock *IfBlock = Builder.GetInsertBlock(); |
6246 | BasicBlock *CondBlock = nullptr; |
6247 | |
6248 | VectorParts Cond; |
6249 | Loop *VectorLp = nullptr; |
6250 | if (IfPredicateStore) { |
6251 | assert(Instr->getParent()->getSinglePredecessor() &&
6252 | "Only support single predecessor blocks");
6253 | Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), |
6254 | Instr->getParent()); |
6255 | VectorLp = LI->getLoopFor(IfBlock); |
6256 | assert(VectorLp && "Must have a loop for this block");
6257 | } |
6258 | |
6259 | // For each vector unroll 'part': |
6260 | for (unsigned Part = 0; Part < UF; ++Part) { |
6261 | // For each scalar that we create: |
6262 | |
6263 | // Start an "if (pred) a[i] = ..." block. |
6264 | Value *Cmp = nullptr; |
6265 | if (IfPredicateStore) { |
6266 | if (Cond[Part]->getType()->isVectorTy()) |
6267 | Cond[Part] = |
6268 | Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0)); |
6269 | Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part], |
6270 | ConstantInt::get(Cond[Part]->getType(), 1)); |
6271 | CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); |
6272 | LoopVectorBody.push_back(CondBlock); |
6273 | VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase()); |
6274 | // Update Builder with newly created basic block. |
6275 | Builder.SetInsertPoint(InsertPt); |
6276 | } |
6277 | |
6278 | Instruction *Cloned = Instr->clone(); |
6279 | if (!IsVoidRetTy) |
6280 | Cloned->setName(Instr->getName() + ".cloned"); |
6281 | // Replace the operands of the cloned instructions with extracted scalars. |
6282 | for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { |
6283 | Value *Op = Params[op][Part]; |
6284 | Cloned->setOperand(op, Op); |
6285 | } |
6286 | |
6287 | // Place the cloned scalar in the new loop. |
6288 | Builder.Insert(Cloned); |
6289 | |
6290 | // If the original scalar returns a value we need to place it in a vector |
6291 | // so that future users will be able to use it. |
6292 | if (!IsVoidRetTy) |
6293 | VecResults[Part] = Cloned; |
6294 | |
6295 | // End if-block. |
6296 | if (IfPredicateStore) { |
6297 | BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); |
6298 | LoopVectorBody.push_back(NewIfBlock); |
6299 | VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase()); |
6300 | Builder.SetInsertPoint(InsertPt); |
6301 | Instruction *OldBr = IfBlock->getTerminator(); |
6302 | BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); |
6303 | OldBr->eraseFromParent(); |
6304 | IfBlock = NewIfBlock; |
6305 | } |
6306 | } |
6307 | } |
6308 | |
6309 | void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { |
6310 | StoreInst *SI = dyn_cast<StoreInst>(Instr); |
6311 | bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent())); |
6312 | |
6313 | return scalarizeInstruction(Instr, IfPredicateStore); |
6314 | } |
6315 | |
6316 | Value *InnerLoopUnroller::reverseVector(Value *Vec) { |
6317 | return Vec; |
6318 | } |
6319 | |
6320 | Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { |
6321 | return V; |
6322 | } |
6323 | |
6324 | Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx, |
6325 | bool Negate) { |
6326 | // When unrolling and the VF is 1, we only need to add a simple scalar. |
6327 | Type *ITy = Val->getType(); |
6328 | assert(!ITy->isVectorTy() && "Val must be a scalar");
6329 | Constant *C = ConstantInt::get(ITy, StartIdx, Negate); |
6330 | return Builder.CreateAdd(Val, C, "induction"); |
6331 | } |