File: | lib/Transforms/Vectorize/LoopVectorize.cpp |
Location: | line 1199, column 5 |
Description: | Value stored to 'LoopID' is never read |
1 | //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// |
2 | // |
3 | // The LLVM Compiler Infrastructure |
4 | // |
5 | // This file is distributed under the University of Illinois Open Source |
6 | // License. See LICENSE.TXT for details. |
7 | // |
8 | //===----------------------------------------------------------------------===// |
9 | // |
10 | // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops |
11 | // and generates target-independent LLVM-IR. |
12 | // The vectorizer uses the TargetTransformInfo analysis to estimate the costs |
13 | // of instructions in order to estimate the profitability of vectorization. |
14 | // |
15 | // The loop vectorizer combines consecutive loop iterations into a single |
16 | // 'wide' iteration. After this transformation the index is incremented |
17 | // by the SIMD vector width, and not by one. |
18 | // |
19 | // This pass has four parts: |
20 | // 1. The main loop pass that drives the different parts. |
21 | // 2. LoopVectorizationLegality - A unit that checks for the legality |
22 | // of the vectorization. |
23 | // 3. InnerLoopVectorizer - A unit that performs the actual |
24 | // widening of instructions. |
25 | // 4. LoopVectorizationCostModel - A unit that checks for the profitability |
26 | // of vectorization. It decides on the optimal vector width, which |
27 | // can be one, if vectorization is not profitable. |
28 | // |
29 | //===----------------------------------------------------------------------===// |
30 | // |
31 | // The reduction-variable vectorization is based on the paper: |
32 | // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. |
33 | // |
34 | // Variable uniformity checks are inspired by: |
35 | // Karrenberg, R. and Hack, S. Whole Function Vectorization. |
36 | // |
37 | // Other ideas/concepts are from: |
38 | // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. |
39 | // |
40 | // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of |
41 | // Vectorizing Compilers. |
42 | // |
43 | //===----------------------------------------------------------------------===// |
44 | |
45 | #include "llvm/Transforms/Vectorize.h" |
46 | #include "llvm/ADT/DenseMap.h" |
47 | #include "llvm/ADT/EquivalenceClasses.h" |
48 | #include "llvm/ADT/Hashing.h" |
49 | #include "llvm/ADT/MapVector.h" |
50 | #include "llvm/ADT/SetVector.h" |
51 | #include "llvm/ADT/SmallPtrSet.h" |
52 | #include "llvm/ADT/SmallSet.h" |
53 | #include "llvm/ADT/SmallVector.h" |
54 | #include "llvm/ADT/Statistic.h" |
55 | #include "llvm/ADT/StringExtras.h" |
56 | #include "llvm/Analysis/AliasAnalysis.h" |
57 | #include "llvm/Analysis/AliasSetTracker.h" |
58 | #include "llvm/Analysis/AssumptionTracker.h" |
59 | #include "llvm/Analysis/BlockFrequencyInfo.h" |
60 | #include "llvm/Analysis/CodeMetrics.h" |
61 | #include "llvm/Analysis/LoopInfo.h" |
62 | #include "llvm/Analysis/LoopIterator.h" |
63 | #include "llvm/Analysis/LoopPass.h" |
64 | #include "llvm/Analysis/ScalarEvolution.h" |
65 | #include "llvm/Analysis/ScalarEvolutionExpander.h" |
66 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
67 | #include "llvm/Analysis/TargetTransformInfo.h" |
68 | #include "llvm/Analysis/ValueTracking.h" |
69 | #include "llvm/IR/Constants.h" |
70 | #include "llvm/IR/DataLayout.h" |
71 | #include "llvm/IR/DebugInfo.h" |
72 | #include "llvm/IR/DerivedTypes.h" |
73 | #include "llvm/IR/DiagnosticInfo.h" |
74 | #include "llvm/IR/Dominators.h" |
75 | #include "llvm/IR/Function.h" |
76 | #include "llvm/IR/IRBuilder.h" |
77 | #include "llvm/IR/Instructions.h" |
78 | #include "llvm/IR/IntrinsicInst.h" |
79 | #include "llvm/IR/LLVMContext.h" |
80 | #include "llvm/IR/Module.h" |
81 | #include "llvm/IR/PatternMatch.h" |
82 | #include "llvm/IR/Type.h" |
83 | #include "llvm/IR/Value.h" |
84 | #include "llvm/IR/ValueHandle.h" |
85 | #include "llvm/IR/Verifier.h" |
86 | #include "llvm/Pass.h" |
87 | #include "llvm/Support/BranchProbability.h" |
88 | #include "llvm/Support/CommandLine.h" |
89 | #include "llvm/Support/Debug.h" |
90 | #include "llvm/Support/raw_ostream.h" |
91 | #include "llvm/Transforms/Scalar.h" |
92 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
93 | #include "llvm/Transforms/Utils/Local.h" |
94 | #include "llvm/Transforms/Utils/VectorUtils.h" |
95 | #include <algorithm> |
96 | #include <map> |
97 | #include <tuple> |
98 | |
99 | using namespace llvm; |
100 | using namespace llvm::PatternMatch; |
101 | |
102 | #define LV_NAME"loop-vectorize" "loop-vectorize" |
103 | #define DEBUG_TYPE"loop-vectorize" LV_NAME"loop-vectorize" |
104 | |
105 | STATISTIC(LoopsVectorized, "Number of loops vectorized")static llvm::Statistic LoopsVectorized = { "loop-vectorize", "Number of loops vectorized" , 0, 0 }; |
106 | STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization")static llvm::Statistic LoopsAnalyzed = { "loop-vectorize", "Number of loops analyzed for vectorization" , 0, 0 }; |
107 | |
108 | static cl::opt<unsigned> |
109 | VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, |
110 | cl::desc("Sets the SIMD width. Zero is autoselect.")); |
111 | |
112 | static cl::opt<unsigned> |
113 | VectorizationInterleave("force-vector-interleave", cl::init(0), cl::Hidden, |
114 | cl::desc("Sets the vectorization interleave count. " |
115 | "Zero is autoselect.")); |
116 | |
117 | static cl::opt<bool> |
118 | EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, |
119 | cl::desc("Enable if-conversion during vectorization.")); |
120 | |
121 | /// We don't vectorize loops with a known constant trip count below this number. |
122 | static cl::opt<unsigned> |
123 | TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), |
124 | cl::Hidden, |
125 | cl::desc("Don't vectorize loops with a constant " |
126 | "trip count that is smaller than this " |
127 | "value.")); |
128 | |
129 | /// This enables versioning on the strides of symbolically striding memory |
130 | /// accesses in code like the following. |
131 | /// for (i = 0; i < N; ++i) |
132 | /// A[i * Stride1] += B[i * Stride2] ... |
133 | /// |
134 | /// Will be roughly translated to |
135 | /// if (Stride1 == 1 && Stride2 == 1) { |
136 | /// for (i = 0; i < N; i+=4) |
137 | /// A[i:i+3] += ... |
138 | /// } else |
139 | /// ... |
140 | static cl::opt<bool> EnableMemAccessVersioning( |
141 | "enable-mem-access-versioning", cl::init(true), cl::Hidden, |
142 | cl::desc("Enable symblic stride memory access versioning")); |
143 | |
144 | /// We don't unroll loops with a known constant trip count below this number. |
145 | static const unsigned TinyTripCountUnrollThreshold = 128; |
146 | |
147 | /// When performing memory disambiguation checks at runtime do not make more |
148 | /// than this number of comparisons. |
149 | static const unsigned RuntimeMemoryCheckThreshold = 8; |
150 | |
151 | /// Maximum simd width. |
152 | static const unsigned MaxVectorWidth = 64; |
153 | |
154 | static cl::opt<unsigned> ForceTargetNumScalarRegs( |
155 | "force-target-num-scalar-regs", cl::init(0), cl::Hidden, |
156 | cl::desc("A flag that overrides the target's number of scalar registers.")); |
157 | |
158 | static cl::opt<unsigned> ForceTargetNumVectorRegs( |
159 | "force-target-num-vector-regs", cl::init(0), cl::Hidden, |
160 | cl::desc("A flag that overrides the target's number of vector registers.")); |
161 | |
162 | /// Maximum vectorization interleave count. |
163 | static const unsigned MaxInterleaveFactor = 16; |
164 | |
165 | static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( |
166 | "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, |
167 | cl::desc("A flag that overrides the target's max interleave factor for " |
168 | "scalar loops.")); |
169 | |
170 | static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( |
171 | "force-target-max-vector-interleave", cl::init(0), cl::Hidden, |
172 | cl::desc("A flag that overrides the target's max interleave factor for " |
173 | "vectorized loops.")); |
174 | |
175 | static cl::opt<unsigned> ForceTargetInstructionCost( |
176 | "force-target-instruction-cost", cl::init(0), cl::Hidden, |
177 | cl::desc("A flag that overrides the target's expected cost for " |
178 | "an instruction to a single constant value. Mostly " |
179 | "useful for getting consistent testing.")); |
180 | |
181 | static cl::opt<unsigned> SmallLoopCost( |
182 | "small-loop-cost", cl::init(20), cl::Hidden, |
183 | cl::desc("The cost of a loop that is considered 'small' by the unroller.")); |
184 | |
185 | static cl::opt<bool> LoopVectorizeWithBlockFrequency( |
186 | "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden, |
187 | cl::desc("Enable the use of the block frequency analysis to access PGO " |
188 | "heuristics minimizing code growth in cold regions and being more " |
189 | "aggressive in hot regions.")); |
190 | |
191 | // Runtime unroll loops for load/store throughput. |
192 | static cl::opt<bool> EnableLoadStoreRuntimeUnroll( |
193 | "enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden, |
194 | cl::desc("Enable runtime unrolling until load/store ports are saturated")); |
195 | |
196 | /// The number of stores in a loop that are allowed to need predication. |
197 | static cl::opt<unsigned> NumberOfStoresToPredicate( |
198 | "vectorize-num-stores-pred", cl::init(1), cl::Hidden, |
199 | cl::desc("Max number of stores to be predicated behind an if.")); |
200 | |
201 | static cl::opt<bool> EnableIndVarRegisterHeur( |
202 | "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, |
203 | cl::desc("Count the induction variable only once when unrolling")); |
204 | |
205 | static cl::opt<bool> EnableCondStoresVectorization( |
206 | "enable-cond-stores-vec", cl::init(false), cl::Hidden, |
207 | cl::desc("Enable if predication of stores during vectorization.")); |
208 | |
209 | static cl::opt<unsigned> MaxNestedScalarReductionUF( |
210 | "max-nested-scalar-reduction-unroll", cl::init(2), cl::Hidden, |
211 | cl::desc("The maximum unroll factor to use when unrolling a scalar " |
212 | "reduction in a nested loop.")); |
213 | |
214 | namespace { |
215 | |
216 | // Forward declarations. |
217 | class LoopVectorizationLegality; |
218 | class LoopVectorizationCostModel; |
219 | class LoopVectorizeHints; |
220 | |
221 | /// Optimization analysis message produced during vectorization. Messages inform |
222 | /// the user why vectorization did not occur. |
223 | class Report { |
224 | std::string Message; |
225 | raw_string_ostream Out; |
226 | Instruction *Instr; |
227 | |
228 | public: |
229 | Report(Instruction *I = nullptr) : Out(Message), Instr(I) { |
230 | Out << "loop not vectorized: "; |
231 | } |
232 | |
233 | template <typename A> Report &operator<<(const A &Value) { |
234 | Out << Value; |
235 | return *this; |
236 | } |
237 | |
238 | Instruction *getInstr() { return Instr; } |
239 | |
240 | std::string &str() { return Out.str(); } |
241 | operator Twine() { return Out.str(); } |
242 | }; |
243 | |
244 | /// InnerLoopVectorizer vectorizes loops which contain only one basic |
245 | /// block to a specified vectorization factor (VF). |
246 | /// This class performs the widening of scalars into vectors, or multiple |
247 | /// scalars. This class also implements the following features: |
248 | /// * It inserts an epilogue loop for handling loops that don't have iteration |
249 | /// counts that are known to be a multiple of the vectorization factor. |
250 | /// * It handles the code generation for reduction variables. |
251 | /// * Scalarization (implementation using scalars) of un-vectorizable |
252 | /// instructions. |
253 | /// InnerLoopVectorizer does not perform any vectorization-legality |
254 | /// checks, and relies on the caller to check for the different legality |
255 | /// aspects. The InnerLoopVectorizer relies on the |
256 | /// LoopVectorizationLegality class to provide information about the induction |
257 | /// and reduction variables that were found to a given vectorization factor. |
258 | class InnerLoopVectorizer { |
259 | public: |
260 | InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, |
261 | DominatorTree *DT, const DataLayout *DL, |
262 | const TargetLibraryInfo *TLI, unsigned VecWidth, |
263 | unsigned UnrollFactor) |
264 | : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI), |
265 | VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), |
266 | Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor), |
267 | Legal(nullptr) {} |
268 | |
269 | // Perform the actual loop widening (vectorization). |
270 | void vectorize(LoopVectorizationLegality *L) { |
271 | Legal = L; |
272 | // Create a new empty loop. Unlink the old loop and connect the new one. |
273 | createEmptyLoop(); |
274 | // Widen each instruction in the old loop to a new one in the new loop. |
275 | // Use the Legality module to find the induction and reduction variables. |
276 | vectorizeLoop(); |
277 | // Register the new loop and update the analysis passes. |
278 | updateAnalysis(); |
279 | } |
280 | |
281 | virtual ~InnerLoopVectorizer() {} |
282 | |
283 | protected: |
284 | /// A small list of PHINodes. |
285 | typedef SmallVector<PHINode*, 4> PhiVector; |
286 | /// When we unroll loops we have multiple vector values for each scalar. |
287 | /// This data structure holds the unrolled and vectorized values that |
288 | /// originated from one scalar instruction. |
289 | typedef SmallVector<Value*, 2> VectorParts; |
290 | |
291 | // When we if-convert we need create edge masks. We have to cache values so |
292 | // that we don't end up with exponential recursion/IR. |
293 | typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>, |
294 | VectorParts> EdgeMaskCache; |
295 | |
296 | /// \brief Add code that checks at runtime if the accessed arrays overlap. |
297 | /// |
298 | /// Returns a pair of instructions where the first element is the first |
299 | /// instruction generated in possibly a sequence of instructions and the |
300 | /// second value is the final comparator value or NULL if no check is needed. |
301 | std::pair<Instruction *, Instruction *> addRuntimeCheck(Instruction *Loc); |
302 | |
303 | /// \brief Add checks for strides that where assumed to be 1. |
304 | /// |
305 | /// Returns the last check instruction and the first check instruction in the |
306 | /// pair as (first, last). |
307 | std::pair<Instruction *, Instruction *> addStrideCheck(Instruction *Loc); |
308 | |
309 | /// Create an empty loop, based on the loop ranges of the old loop. |
310 | void createEmptyLoop(); |
311 | /// Copy and widen the instructions from the old loop. |
312 | virtual void vectorizeLoop(); |
313 | |
314 | /// \brief The Loop exit block may have single value PHI nodes where the |
315 | /// incoming value is 'Undef'. While vectorizing we only handled real values |
316 | /// that were defined inside the loop. Here we fix the 'undef case'. |
317 | /// See PR14725. |
318 | void fixLCSSAPHIs(); |
319 | |
320 | /// A helper function that computes the predicate of the block BB, assuming |
321 | /// that the header block of the loop is set to True. It returns the *entry* |
322 | /// mask for the block BB. |
323 | VectorParts createBlockInMask(BasicBlock *BB); |
324 | /// A helper function that computes the predicate of the edge between SRC |
325 | /// and DST. |
326 | VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); |
327 | |
328 | /// A helper function to vectorize a single BB within the innermost loop. |
329 | void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV); |
330 | |
331 | /// Vectorize a single PHINode in a block. This method handles the induction |
332 | /// variable canonicalization. It supports both VF = 1 for unrolled loops and |
333 | /// arbitrary length vectors. |
334 | void widenPHIInstruction(Instruction *PN, VectorParts &Entry, |
335 | unsigned UF, unsigned VF, PhiVector *PV); |
336 | |
337 | /// Insert the new loop to the loop hierarchy and pass manager |
338 | /// and update the analysis passes. |
339 | void updateAnalysis(); |
340 | |
341 | /// This instruction is un-vectorizable. Implement it as a sequence |
342 | /// of scalars. If \p IfPredicateStore is true we need to 'hide' each |
343 | /// scalarized instruction behind an if block predicated on the control |
344 | /// dependence of the instruction. |
345 | virtual void scalarizeInstruction(Instruction *Instr, |
346 | bool IfPredicateStore=false); |
347 | |
348 | /// Vectorize Load and Store instructions, |
349 | virtual void vectorizeMemoryInstruction(Instruction *Instr); |
350 | |
351 | /// Create a broadcast instruction. This method generates a broadcast |
352 | /// instruction (shuffle) for loop invariant values and for the induction |
353 | /// value. If this is the induction variable then we extend it to N, N+1, ... |
354 | /// this is needed because each iteration in the loop corresponds to a SIMD |
355 | /// element. |
356 | virtual Value *getBroadcastInstrs(Value *V); |
357 | |
358 | /// This function adds 0, 1, 2 ... to each vector element, starting at zero. |
359 | /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...). |
360 | /// The sequence starts at StartIndex. |
361 | virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); |
362 | |
363 | /// When we go over instructions in the basic block we rely on previous |
364 | /// values within the current basic block or on loop invariant values. |
365 | /// When we widen (vectorize) values we place them in the map. If the values |
366 | /// are not within the map, they have to be loop invariant, so we simply |
367 | /// broadcast them into a vector. |
368 | VectorParts &getVectorValue(Value *V); |
369 | |
370 | /// Generate a shuffle sequence that will reverse the vector Vec. |
371 | virtual Value *reverseVector(Value *Vec); |
372 | |
373 | /// This is a helper class that holds the vectorizer state. It maps scalar |
374 | /// instructions to vector instructions. When the code is 'unrolled' then |
375 | /// then a single scalar value is mapped to multiple vector parts. The parts |
376 | /// are stored in the VectorPart type. |
377 | struct ValueMap { |
378 | /// C'tor. UnrollFactor controls the number of vectors ('parts') that |
379 | /// are mapped. |
380 | ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {} |
381 | |
382 | /// \return True if 'Key' is saved in the Value Map. |
383 | bool has(Value *Key) const { return MapStorage.count(Key); } |
384 | |
385 | /// Initializes a new entry in the map. Sets all of the vector parts to the |
386 | /// save value in 'Val'. |
387 | /// \return A reference to a vector with splat values. |
388 | VectorParts &splat(Value *Key, Value *Val) { |
389 | VectorParts &Entry = MapStorage[Key]; |
390 | Entry.assign(UF, Val); |
391 | return Entry; |
392 | } |
393 | |
394 | ///\return A reference to the value that is stored at 'Key'. |
395 | VectorParts &get(Value *Key) { |
396 | VectorParts &Entry = MapStorage[Key]; |
397 | if (Entry.empty()) |
398 | Entry.resize(UF); |
399 | assert(Entry.size() == UF)((Entry.size() == UF) ? static_cast<void> (0) : __assert_fail ("Entry.size() == UF", "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 399, __PRETTY_FUNCTION__)); |
400 | return Entry; |
401 | } |
402 | |
403 | private: |
404 | /// The unroll factor. Each entry in the map stores this number of vector |
405 | /// elements. |
406 | unsigned UF; |
407 | |
408 | /// Map storage. We use std::map and not DenseMap because insertions to a |
409 | /// dense map invalidates its iterators. |
410 | std::map<Value *, VectorParts> MapStorage; |
411 | }; |
412 | |
413 | /// The original loop. |
414 | Loop *OrigLoop; |
415 | /// Scev analysis to use. |
416 | ScalarEvolution *SE; |
417 | /// Loop Info. |
418 | LoopInfo *LI; |
419 | /// Dominator Tree. |
420 | DominatorTree *DT; |
421 | /// Alias Analysis. |
422 | AliasAnalysis *AA; |
423 | /// Data Layout. |
424 | const DataLayout *DL; |
425 | /// Target Library Info. |
426 | const TargetLibraryInfo *TLI; |
427 | |
428 | /// The vectorization SIMD factor to use. Each vector will have this many |
429 | /// vector elements. |
430 | unsigned VF; |
431 | |
432 | protected: |
433 | /// The vectorization unroll factor to use. Each scalar is vectorized to this |
434 | /// many different vector instructions. |
435 | unsigned UF; |
436 | |
437 | /// The builder that we use |
438 | IRBuilder<> Builder; |
439 | |
440 | // --- Vectorization state --- |
441 | |
442 | /// The vector-loop preheader. |
443 | BasicBlock *LoopVectorPreHeader; |
444 | /// The scalar-loop preheader. |
445 | BasicBlock *LoopScalarPreHeader; |
446 | /// Middle Block between the vector and the scalar. |
447 | BasicBlock *LoopMiddleBlock; |
448 | ///The ExitBlock of the scalar loop. |
449 | BasicBlock *LoopExitBlock; |
450 | ///The vector loop body. |
451 | SmallVector<BasicBlock *, 4> LoopVectorBody; |
452 | ///The scalar loop body. |
453 | BasicBlock *LoopScalarBody; |
454 | /// A list of all bypass blocks. The first block is the entry of the loop. |
455 | SmallVector<BasicBlock *, 4> LoopBypassBlocks; |
456 | |
457 | /// The new Induction variable which was added to the new block. |
458 | PHINode *Induction; |
459 | /// The induction variable of the old basic block. |
460 | PHINode *OldInduction; |
461 | /// Holds the extended (to the widest induction type) start index. |
462 | Value *ExtendedIdx; |
463 | /// Maps scalars to widened vectors. |
464 | ValueMap WidenMap; |
465 | EdgeMaskCache MaskCache; |
466 | |
467 | LoopVectorizationLegality *Legal; |
468 | }; |
469 | |
470 | class InnerLoopUnroller : public InnerLoopVectorizer { |
471 | public: |
472 | InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, |
473 | DominatorTree *DT, const DataLayout *DL, |
474 | const TargetLibraryInfo *TLI, unsigned UnrollFactor) : |
475 | InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { } |
476 | |
477 | private: |
478 | void scalarizeInstruction(Instruction *Instr, |
479 | bool IfPredicateStore = false) override; |
480 | void vectorizeMemoryInstruction(Instruction *Instr) override; |
481 | Value *getBroadcastInstrs(Value *V) override; |
482 | Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate) override; |
483 | Value *reverseVector(Value *Vec) override; |
484 | }; |
485 | |
486 | /// \brief Look for a meaningful debug location on the instruction or it's |
487 | /// operands. |
488 | static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { |
489 | if (!I) |
490 | return I; |
491 | |
492 | DebugLoc Empty; |
493 | if (I->getDebugLoc() != Empty) |
494 | return I; |
495 | |
496 | for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) { |
497 | if (Instruction *OpInst = dyn_cast<Instruction>(*OI)) |
498 | if (OpInst->getDebugLoc() != Empty) |
499 | return OpInst; |
500 | } |
501 | |
502 | return I; |
503 | } |
504 | |
505 | /// \brief Set the debug location in the builder using the debug location in the |
506 | /// instruction. |
507 | static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { |
508 | if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) |
509 | B.SetCurrentDebugLocation(Inst->getDebugLoc()); |
510 | else |
511 | B.SetCurrentDebugLocation(DebugLoc()); |
512 | } |
513 | |
514 | #ifndef NDEBUG |
515 | /// \return string containing a file name and a line # for the given loop. |
516 | static std::string getDebugLocString(const Loop *L) { |
517 | std::string Result; |
518 | if (L) { |
519 | raw_string_ostream OS(Result); |
520 | const DebugLoc LoopDbgLoc = L->getStartLoc(); |
521 | if (!LoopDbgLoc.isUnknown()) |
522 | LoopDbgLoc.print(L->getHeader()->getContext(), OS); |
523 | else |
524 | // Just print the module name. |
525 | OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); |
526 | OS.flush(); |
527 | } |
528 | return Result; |
529 | } |
530 | #endif |
531 | |
532 | /// \brief Propagate known metadata from one instruction to another. |
533 | static void propagateMetadata(Instruction *To, const Instruction *From) { |
534 | SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata; |
535 | From->getAllMetadataOtherThanDebugLoc(Metadata); |
536 | |
537 | for (auto M : Metadata) { |
538 | unsigned Kind = M.first; |
539 | |
540 | // These are safe to transfer (this is safe for TBAA, even when we |
541 | // if-convert, because should that metadata have had a control dependency |
542 | // on the condition, and thus actually aliased with some other |
543 | // non-speculated memory access when the condition was false, this would be |
544 | // caught by the runtime overlap checks). |
545 | if (Kind != LLVMContext::MD_tbaa && |
546 | Kind != LLVMContext::MD_alias_scope && |
547 | Kind != LLVMContext::MD_noalias && |
548 | Kind != LLVMContext::MD_fpmath) |
549 | continue; |
550 | |
551 | To->setMetadata(Kind, M.second); |
552 | } |
553 | } |
554 | |
555 | /// \brief Propagate known metadata from one instruction to a vector of others. |
556 | static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *From) { |
557 | for (Value *V : To) |
558 | if (Instruction *I = dyn_cast<Instruction>(V)) |
559 | propagateMetadata(I, From); |
560 | } |
561 | |
562 | /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and |
563 | /// to what vectorization factor. |
564 | /// This class does not look at the profitability of vectorization, only the |
565 | /// legality. This class has two main kinds of checks: |
566 | /// * Memory checks - The code in canVectorizeMemory checks if vectorization |
567 | /// will change the order of memory accesses in a way that will change the |
568 | /// correctness of the program. |
569 | /// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory |
570 | /// checks for a number of different conditions, such as the availability of a |
571 | /// single induction variable, that all types are supported and vectorize-able, |
572 | /// etc. This code reflects the capabilities of InnerLoopVectorizer. |
573 | /// This class is also used by InnerLoopVectorizer for identifying |
574 | /// induction variable and the different reduction variables. |
575 | class LoopVectorizationLegality { |
576 | public: |
577 | unsigned NumLoads; |
578 | unsigned NumStores; |
579 | unsigned NumPredStores; |
580 | |
581 | LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL, |
582 | DominatorTree *DT, TargetLibraryInfo *TLI, |
583 | AliasAnalysis *AA, Function *F) |
584 | : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL), |
585 | DT(DT), TLI(TLI), AA(AA), TheFunction(F), Induction(nullptr), |
586 | WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) { |
587 | } |
588 | |
589 | /// This enum represents the kinds of reductions that we support. |
590 | enum ReductionKind { |
591 | RK_NoReduction, ///< Not a reduction. |
592 | RK_IntegerAdd, ///< Sum of integers. |
593 | RK_IntegerMult, ///< Product of integers. |
594 | RK_IntegerOr, ///< Bitwise or logical OR of numbers. |
595 | RK_IntegerAnd, ///< Bitwise or logical AND of numbers. |
596 | RK_IntegerXor, ///< Bitwise or logical XOR of numbers. |
597 | RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()). |
598 | RK_FloatAdd, ///< Sum of floats. |
599 | RK_FloatMult, ///< Product of floats. |
600 | RK_FloatMinMax ///< Min/max implemented in terms of select(cmp()). |
601 | }; |
602 | |
603 | /// This enum represents the kinds of inductions that we support. |
604 | enum InductionKind { |
605 | IK_NoInduction, ///< Not an induction variable. |
606 | IK_IntInduction, ///< Integer induction variable. Step = 1. |
607 | IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1. |
608 | IK_PtrInduction, ///< Pointer induction var. Step = sizeof(elem). |
609 | IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem). |
610 | }; |
611 | |
612 | // This enum represents the kind of minmax reduction. |
613 | enum MinMaxReductionKind { |
614 | MRK_Invalid, |
615 | MRK_UIntMin, |
616 | MRK_UIntMax, |
617 | MRK_SIntMin, |
618 | MRK_SIntMax, |
619 | MRK_FloatMin, |
620 | MRK_FloatMax |
621 | }; |
622 | |
623 | /// This struct holds information about reduction variables. |
624 | struct ReductionDescriptor { |
625 | ReductionDescriptor() : StartValue(nullptr), LoopExitInstr(nullptr), |
626 | Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {} |
627 | |
628 | ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K, |
629 | MinMaxReductionKind MK) |
630 | : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {} |
631 | |
632 | // The starting value of the reduction. |
633 | // It does not have to be zero! |
634 | TrackingVH<Value> StartValue; |
635 | // The instruction who's value is used outside the loop. |
636 | Instruction *LoopExitInstr; |
637 | // The kind of the reduction. |
638 | ReductionKind Kind; |
639 | // If this a min/max reduction the kind of reduction. |
640 | MinMaxReductionKind MinMaxKind; |
641 | }; |
642 | |
643 | /// This POD struct holds information about a potential reduction operation. |
644 | struct ReductionInstDesc { |
645 | ReductionInstDesc(bool IsRedux, Instruction *I) : |
646 | IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {} |
647 | |
648 | ReductionInstDesc(Instruction *I, MinMaxReductionKind K) : |
649 | IsReduction(true), PatternLastInst(I), MinMaxKind(K) {} |
650 | |
651 | // Is this instruction a reduction candidate. |
652 | bool IsReduction; |
653 | // The last instruction in a min/max pattern (select of the select(icmp()) |
654 | // pattern), or the current reduction instruction otherwise. |
655 | Instruction *PatternLastInst; |
656 | // If this is a min/max pattern the comparison predicate. |
657 | MinMaxReductionKind MinMaxKind; |
658 | }; |
659 | |
660 | /// This struct holds information about the memory runtime legality |
661 | /// check that a group of pointers do not overlap. |
662 | struct RuntimePointerCheck { |
663 | RuntimePointerCheck() : Need(false) {} |
664 | |
665 | /// Reset the state of the pointer runtime information. |
666 | void reset() { |
667 | Need = false; |
668 | Pointers.clear(); |
669 | Starts.clear(); |
670 | Ends.clear(); |
671 | IsWritePtr.clear(); |
672 | DependencySetId.clear(); |
673 | AliasSetId.clear(); |
674 | } |
675 | |
676 | /// Insert a pointer and calculate the start and end SCEVs. |
677 | void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, |
678 | unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides); |
679 | |
680 | /// This flag indicates if we need to add the runtime check. |
681 | bool Need; |
682 | /// Holds the pointers that we need to check. |
683 | SmallVector<TrackingVH<Value>, 2> Pointers; |
684 | /// Holds the pointer value at the beginning of the loop. |
685 | SmallVector<const SCEV*, 2> Starts; |
686 | /// Holds the pointer value at the end of the loop. |
687 | SmallVector<const SCEV*, 2> Ends; |
688 | /// Holds the information if this pointer is used for writing to memory. |
689 | SmallVector<bool, 2> IsWritePtr; |
690 | /// Holds the id of the set of pointers that could be dependent because of a |
691 | /// shared underlying object. |
692 | SmallVector<unsigned, 2> DependencySetId; |
693 | /// Holds the id of the disjoint alias set to which this pointer belongs. |
694 | SmallVector<unsigned, 2> AliasSetId; |
695 | }; |
696 | |
697 | /// A struct for saving information about induction variables. |
698 | struct InductionInfo { |
699 | InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} |
700 | InductionInfo() : StartValue(nullptr), IK(IK_NoInduction) {} |
701 | /// Start value. |
702 | TrackingVH<Value> StartValue; |
703 | /// Induction kind. |
704 | InductionKind IK; |
705 | }; |
706 | |
707 | /// ReductionList contains the reduction descriptors for all |
708 | /// of the reductions that were found in the loop. |
709 | typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList; |
710 | |
711 | /// InductionList saves induction variables and maps them to the |
712 | /// induction descriptor. |
713 | typedef MapVector<PHINode*, InductionInfo> InductionList; |
714 | |
715 | /// Returns true if it is legal to vectorize this loop. |
716 | /// This does not mean that it is profitable to vectorize this |
717 | /// loop, only that it is legal to do so. |
718 | bool canVectorize(); |
719 | |
720 | /// Returns the Induction variable. |
721 | PHINode *getInduction() { return Induction; } |
722 | |
723 | /// Returns the reduction variables found in the loop. |
724 | ReductionList *getReductionVars() { return &Reductions; } |
725 | |
726 | /// Returns the induction variables found in the loop. |
727 | InductionList *getInductionVars() { return &Inductions; } |
728 | |
729 | /// Returns the widest induction type. |
730 | Type *getWidestInductionType() { return WidestIndTy; } |
731 | |
732 | /// Returns True if V is an induction variable in this loop. |
733 | bool isInductionVariable(const Value *V); |
734 | |
735 | /// Return true if the block BB needs to be predicated in order for the loop |
736 | /// to be vectorized. |
737 | bool blockNeedsPredication(BasicBlock *BB); |
738 | |
739 | /// Check if this pointer is consecutive when vectorizing. This happens |
740 | /// when the last index of the GEP is the induction variable, or that the |
741 | /// pointer itself is an induction variable. |
742 | /// This check allows us to vectorize A[idx] into a wide load/store. |
743 | /// Returns: |
744 | /// 0 - Stride is unknown or non-consecutive. |
745 | /// 1 - Address is consecutive. |
746 | /// -1 - Address is consecutive, and decreasing. |
747 | int isConsecutivePtr(Value *Ptr); |
748 | |
749 | /// Returns true if the value V is uniform within the loop. |
750 | bool isUniform(Value *V); |
751 | |
752 | /// Returns true if this instruction will remain scalar after vectorization. |
753 | bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); } |
754 | |
755 | /// Returns the information that we collected about runtime memory check. |
756 | RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; } |
757 | |
758 | /// This function returns the identity element (or neutral element) for |
759 | /// the operation K. |
760 | static Constant *getReductionIdentity(ReductionKind K, Type *Tp); |
761 | |
762 | unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } |
763 | |
764 | bool hasStride(Value *V) { return StrideSet.count(V); } |
765 | bool mustCheckStrides() { return !StrideSet.empty(); } |
766 | SmallPtrSet<Value *, 8>::iterator strides_begin() { |
767 | return StrideSet.begin(); |
768 | } |
769 | SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); } |
770 | |
771 | private: |
772 | /// Check if a single basic block loop is vectorizable. |
773 | /// At this point we know that this is a loop with a constant trip count |
774 | /// and we only need to check individual instructions. |
775 | bool canVectorizeInstrs(); |
776 | |
777 | /// When we vectorize loops we may change the order in which |
778 | /// we read and write from memory. This method checks if it is |
779 | /// legal to vectorize the code, considering only memory constraints. |
780 | /// Returns true if the loop is vectorizable |
781 | bool canVectorizeMemory(); |
782 | |
783 | /// Return true if we can vectorize this loop using the IF-conversion |
784 | /// transformation. |
785 | bool canVectorizeWithIfConvert(); |
786 | |
787 | /// Collect the variables that need to stay uniform after vectorization. |
788 | void collectLoopUniforms(); |
789 | |
790 | /// Return true if all of the instructions in the block can be speculatively |
791 | /// executed. \p SafePtrs is a list of addresses that are known to be legal |
792 | /// and we know that we can read from them without segfault. |
793 | bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs); |
794 | |
795 | /// Returns True, if 'Phi' is the kind of reduction variable for type |
796 | /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. |
797 | bool AddReductionVar(PHINode *Phi, ReductionKind Kind); |
798 | /// Returns a struct describing if the instruction 'I' can be a reduction |
799 | /// variable of type 'Kind'. If the reduction is a min/max pattern of |
800 | /// select(icmp()) this function advances the instruction pointer 'I' from the |
801 | /// compare instruction to the select instruction and stores this pointer in |
802 | /// 'PatternLastInst' member of the returned struct. |
803 | ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind, |
804 | ReductionInstDesc &Desc); |
805 | /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction |
806 | /// pattern corresponding to a min(X, Y) or max(X, Y). |
807 | static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I, |
808 | ReductionInstDesc &Prev); |
809 | /// Returns the induction kind of Phi. This function may return NoInduction |
810 | /// if the PHI is not an induction variable. |
811 | InductionKind isInductionVariable(PHINode *Phi); |
812 | |
813 | /// \brief Collect memory access with loop invariant strides. |
814 | /// |
815 | /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop |
816 | /// invariant. |
817 | void collectStridedAcccess(Value *LoadOrStoreInst); |
818 | |
819 | /// Report an analysis message to assist the user in diagnosing loops that are |
820 | /// not vectorized. |
821 | void emitAnalysis(Report &Message) { |
822 | DebugLoc DL = TheLoop->getStartLoc(); |
823 | if (Instruction *I = Message.getInstr()) |
824 | DL = I->getDebugLoc(); |
825 | emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE"loop-vectorize", |
826 | *TheFunction, DL, Message.str()); |
827 | } |
828 | |
829 | /// The loop that we evaluate. |
830 | Loop *TheLoop; |
831 | /// Scev analysis. |
832 | ScalarEvolution *SE; |
833 | /// DataLayout analysis. |
834 | const DataLayout *DL; |
835 | /// Dominators. |
836 | DominatorTree *DT; |
837 | /// Target Library Info. |
838 | TargetLibraryInfo *TLI; |
839 | /// Alias analysis. |
840 | AliasAnalysis *AA; |
841 | /// Parent function |
842 | Function *TheFunction; |
843 | |
844 | // --- vectorization state --- // |
845 | |
846 | /// Holds the integer induction variable. This is the counter of the |
847 | /// loop. |
848 | PHINode *Induction; |
849 | /// Holds the reduction variables. |
850 | ReductionList Reductions; |
851 | /// Holds all of the induction variables that we found in the loop. |
852 | /// Notice that inductions don't need to start at zero and that induction |
853 | /// variables can be pointers. |
854 | InductionList Inductions; |
855 | /// Holds the widest induction type encountered. |
856 | Type *WidestIndTy; |
857 | |
858 | /// Allowed outside users. This holds the reduction |
859 | /// vars which can be accessed from outside the loop. |
860 | SmallPtrSet<Value*, 4> AllowedExit; |
861 | /// This set holds the variables which are known to be uniform after |
862 | /// vectorization. |
863 | SmallPtrSet<Instruction*, 4> Uniforms; |
864 | /// We need to check that all of the pointers in this list are disjoint |
865 | /// at runtime. |
866 | RuntimePointerCheck PtrRtCheck; |
867 | /// Can we assume the absence of NaNs. |
868 | bool HasFunNoNaNAttr; |
869 | |
870 | unsigned MaxSafeDepDistBytes; |
871 | |
872 | ValueToValueMap Strides; |
873 | SmallPtrSet<Value *, 8> StrideSet; |
874 | }; |
875 | |
876 | /// LoopVectorizationCostModel - estimates the expected speedups due to |
877 | /// vectorization. |
878 | /// In many cases vectorization is not profitable. This can happen because of |
879 | /// a number of reasons. In this class we mainly attempt to predict the |
880 | /// expected speedup/slowdowns due to the supported instruction set. We use the |
881 | /// TargetTransformInfo to query the different backends for the cost of |
882 | /// different operations. |
883 | class LoopVectorizationCostModel { |
884 | public: |
885 | LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, |
886 | LoopVectorizationLegality *Legal, |
887 | const TargetTransformInfo &TTI, |
888 | const DataLayout *DL, const TargetLibraryInfo *TLI, |
889 | AssumptionTracker *AT, const Function *F, |
890 | const LoopVectorizeHints *Hints) |
891 | : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI), |
892 | TheFunction(F), Hints(Hints) { |
893 | CodeMetrics::collectEphemeralValues(L, AT, EphValues); |
894 | } |
895 | |
896 | /// Information about vectorization costs |
897 | struct VectorizationFactor { |
898 | unsigned Width; // Vector width with best cost |
899 | unsigned Cost; // Cost of the loop with that width |
900 | }; |
901 | /// \return The most profitable vectorization factor and the cost of that VF. |
902 | /// This method checks every power of two up to VF. If UserVF is not ZERO |
903 | /// then this vectorization factor will be selected if vectorization is |
904 | /// possible. |
905 | VectorizationFactor selectVectorizationFactor(bool OptForSize); |
906 | |
907 | /// \return The size (in bits) of the widest type in the code that |
908 | /// needs to be vectorized. We ignore values that remain scalar such as |
909 | /// 64 bit loop indices. |
910 | unsigned getWidestType(); |
911 | |
912 | /// \return The most profitable unroll factor. |
913 | /// If UserUF is non-zero then this method finds the best unroll-factor |
914 | /// based on register pressure and other parameters. |
915 | /// VF and LoopCost are the selected vectorization factor and the cost of the |
916 | /// selected VF. |
917 | unsigned selectUnrollFactor(bool OptForSize, unsigned VF, unsigned LoopCost); |
918 | |
919 | /// \brief A struct that represents some properties of the register usage |
920 | /// of a loop. |
921 | struct RegisterUsage { |
922 | /// Holds the number of loop invariant values that are used in the loop. |
923 | unsigned LoopInvariantRegs; |
924 | /// Holds the maximum number of concurrent live intervals in the loop. |
925 | unsigned MaxLocalUsers; |
926 | /// Holds the number of instructions in the loop. |
927 | unsigned NumInstructions; |
928 | }; |
929 | |
930 | /// \return information about the register usage of the loop. |
931 | RegisterUsage calculateRegisterUsage(); |
932 | |
933 | private: |
934 | /// Returns the expected execution cost. The unit of the cost does |
935 | /// not matter because we use the 'cost' units to compare different |
936 | /// vector widths. The cost that is returned is *not* normalized by |
937 | /// the factor width. |
938 | unsigned expectedCost(unsigned VF); |
939 | |
940 | /// Returns the execution time cost of an instruction for a given vector |
941 | /// width. Vector width of one means scalar. |
942 | unsigned getInstructionCost(Instruction *I, unsigned VF); |
943 | |
944 | /// A helper function for converting Scalar types to vector types. |
945 | /// If the incoming type is void, we return void. If the VF is 1, we return |
946 | /// the scalar type. |
947 | static Type* ToVectorTy(Type *Scalar, unsigned VF); |
948 | |
949 | /// Returns whether the instruction is a load or store and will be a emitted |
950 | /// as a vector operation. |
951 | bool isConsecutiveLoadOrStore(Instruction *I); |
952 | |
953 | /// Report an analysis message to assist the user in diagnosing loops that are |
954 | /// not vectorized. |
955 | void emitAnalysis(Report &Message) { |
956 | DebugLoc DL = TheLoop->getStartLoc(); |
957 | if (Instruction *I = Message.getInstr()) |
958 | DL = I->getDebugLoc(); |
959 | emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE"loop-vectorize", |
960 | *TheFunction, DL, Message.str()); |
961 | } |
962 | |
963 | /// Values used only by @llvm.assume calls. |
964 | SmallPtrSet<const Value *, 32> EphValues; |
965 | |
966 | /// The loop that we evaluate. |
967 | Loop *TheLoop; |
968 | /// Scev analysis. |
969 | ScalarEvolution *SE; |
970 | /// Loop Info analysis. |
971 | LoopInfo *LI; |
972 | /// Vectorization legality. |
973 | LoopVectorizationLegality *Legal; |
974 | /// Vector target information. |
975 | const TargetTransformInfo &TTI; |
976 | /// Target data layout information. |
977 | const DataLayout *DL; |
978 | /// Target Library Info. |
979 | const TargetLibraryInfo *TLI; |
980 | const Function *TheFunction; |
981 | // Loop Vectorize Hint. |
982 | const LoopVectorizeHints *Hints; |
983 | }; |
984 | |
985 | /// Utility class for getting and setting loop vectorizer hints in the form |
986 | /// of loop metadata. |
987 | /// This class keeps a number of loop annotations locally (as member variables) |
988 | /// and can, upon request, write them back as metadata on the loop. It will |
989 | /// initially scan the loop for existing metadata, and will update the local |
990 | /// values based on information in the loop. |
991 | /// We cannot write all values to metadata, as the mere presence of some info, |
992 | /// for example 'force', means a decision has been made. So, we need to be |
993 | /// careful NOT to add them if the user hasn't specifically asked so. |
994 | class LoopVectorizeHints { |
995 | enum HintKind { |
996 | HK_WIDTH, |
997 | HK_UNROLL, |
998 | HK_FORCE |
999 | }; |
1000 | |
1001 | /// Hint - associates name and validation with the hint value. |
1002 | struct Hint { |
1003 | const char * Name; |
1004 | unsigned Value; // This may have to change for non-numeric values. |
1005 | HintKind Kind; |
1006 | |
1007 | Hint(const char * Name, unsigned Value, HintKind Kind) |
1008 | : Name(Name), Value(Value), Kind(Kind) { } |
1009 | |
1010 | bool validate(unsigned Val) { |
1011 | switch (Kind) { |
1012 | case HK_WIDTH: |
1013 | return isPowerOf2_32(Val) && Val <= MaxVectorWidth; |
1014 | case HK_UNROLL: |
1015 | return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; |
1016 | case HK_FORCE: |
1017 | return (Val <= 1); |
1018 | } |
1019 | return false; |
1020 | } |
1021 | }; |
1022 | |
1023 | /// Vectorization width. |
1024 | Hint Width; |
1025 | /// Vectorization interleave factor. |
1026 | Hint Interleave; |
1027 | /// Vectorization forced |
1028 | Hint Force; |
1029 | |
1030 | /// Return the loop metadata prefix. |
1031 | static StringRef Prefix() { return "llvm.loop."; } |
1032 | |
1033 | public: |
1034 | enum ForceKind { |
1035 | FK_Undefined = -1, ///< Not selected. |
1036 | FK_Disabled = 0, ///< Forcing disabled. |
1037 | FK_Enabled = 1, ///< Forcing enabled. |
1038 | }; |
1039 | |
1040 | LoopVectorizeHints(const Loop *L, bool DisableInterleaving) |
1041 | : Width("vectorize.width", VectorizationFactor, HK_WIDTH), |
1042 | Interleave("interleave.count", DisableInterleaving, HK_UNROLL), |
1043 | Force("vectorize.enable", FK_Undefined, HK_FORCE), |
1044 | TheLoop(L) { |
1045 | // Populate values with existing loop metadata. |
1046 | getHintsFromMetadata(); |
1047 | |
1048 | // force-vector-interleave overrides DisableInterleaving. |
1049 | if (VectorizationInterleave.getNumOccurrences() > 0) |
1050 | Interleave.Value = VectorizationInterleave; |
1051 | |
1052 | DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { if (DisableInterleaving && Interleave .Value == 1) dbgs() << "LV: Interleaving disabled by the pass manager\n" ; } } while (0) |
1053 | << "LV: Interleaving disabled by the pass manager\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { if (DisableInterleaving && Interleave .Value == 1) dbgs() << "LV: Interleaving disabled by the pass manager\n" ; } } while (0); |
1054 | } |
1055 | |
1056 | /// Mark the loop L as already vectorized by setting the width to 1. |
1057 | void setAlreadyVectorized() { |
1058 | Width.Value = Interleave.Value = 1; |
1059 | Hint Hints[] = {Width, Interleave}; |
1060 | writeHintsToMetadata(Hints); |
1061 | } |
1062 | |
1063 | /// Dumps all the hint information. |
1064 | std::string emitRemark() const { |
1065 | Report R; |
1066 | if (Force.Value == LoopVectorizeHints::FK_Disabled) |
1067 | R << "vectorization is explicitly disabled"; |
1068 | else { |
1069 | R << "use -Rpass-analysis=loop-vectorize for more info"; |
1070 | if (Force.Value == LoopVectorizeHints::FK_Enabled) { |
1071 | R << " (Force=true"; |
1072 | if (Width.Value != 0) |
1073 | R << ", Vector Width=" << Width.Value; |
1074 | if (Interleave.Value != 0) |
1075 | R << ", Interleave Count=" << Interleave.Value; |
1076 | R << ")"; |
1077 | } |
1078 | } |
1079 | |
1080 | return R.str(); |
1081 | } |
1082 | |
1083 | unsigned getWidth() const { return Width.Value; } |
1084 | unsigned getInterleave() const { return Interleave.Value; } |
1085 | enum ForceKind getForce() const { return (ForceKind)Force.Value; } |
1086 | |
1087 | private: |
1088 | /// Find hints specified in the loop metadata and update local values. |
1089 | void getHintsFromMetadata() { |
1090 | MDNode *LoopID = TheLoop->getLoopID(); |
1091 | if (!LoopID) |
1092 | return; |
1093 | |
1094 | // First operand should refer to the loop id itself. |
1095 | assert(LoopID->getNumOperands() > 0 && "requires at least one operand")((LoopID->getNumOperands() > 0 && "requires at least one operand" ) ? static_cast<void> (0) : __assert_fail ("LoopID->getNumOperands() > 0 && \"requires at least one operand\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1095, __PRETTY_FUNCTION__)); |
1096 | assert(LoopID->getOperand(0) == LoopID && "invalid loop id")((LoopID->getOperand(0) == LoopID && "invalid loop id" ) ? static_cast<void> (0) : __assert_fail ("LoopID->getOperand(0) == LoopID && \"invalid loop id\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1096, __PRETTY_FUNCTION__)); |
1097 | |
1098 | for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { |
1099 | const MDString *S = nullptr; |
1100 | SmallVector<Value*, 4> Args; |
1101 | |
1102 | // The expected hint is either a MDString or a MDNode with the first |
1103 | // operand a MDString. |
1104 | if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) { |
1105 | if (!MD || MD->getNumOperands() == 0) |
1106 | continue; |
1107 | S = dyn_cast<MDString>(MD->getOperand(0)); |
1108 | for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) |
1109 | Args.push_back(MD->getOperand(i)); |
1110 | } else { |
1111 | S = dyn_cast<MDString>(LoopID->getOperand(i)); |
1112 | assert(Args.size() == 0 && "too many arguments for MDString")((Args.size() == 0 && "too many arguments for MDString" ) ? static_cast<void> (0) : __assert_fail ("Args.size() == 0 && \"too many arguments for MDString\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1112, __PRETTY_FUNCTION__)); |
1113 | } |
1114 | |
1115 | if (!S) |
1116 | continue; |
1117 | |
1118 | // Check if the hint starts with the loop metadata prefix. |
1119 | StringRef Name = S->getString(); |
1120 | if (Args.size() == 1) |
1121 | setHint(Name, Args[0]); |
1122 | } |
1123 | } |
1124 | |
1125 | /// Checks string hint with one operand and set value if valid. |
1126 | void setHint(StringRef Name, Value *Arg) { |
1127 | if (!Name.startswith(Prefix())) |
1128 | return; |
1129 | Name = Name.substr(Prefix().size(), StringRef::npos); |
1130 | |
1131 | const ConstantInt *C = dyn_cast<ConstantInt>(Arg); |
1132 | if (!C) return; |
1133 | unsigned Val = C->getZExtValue(); |
1134 | |
1135 | Hint *Hints[] = {&Width, &Interleave, &Force}; |
1136 | for (auto H : Hints) { |
1137 | if (Name == H->Name) { |
1138 | if (H->validate(Val)) |
1139 | H->Value = Val; |
1140 | else |
1141 | DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"; } } while (0); |
1142 | break; |
1143 | } |
1144 | } |
1145 | } |
1146 | |
1147 | /// Create a new hint from name / value pair. |
1148 | MDNode *createHintMetadata(StringRef Name, unsigned V) const { |
1149 | LLVMContext &Context = TheLoop->getHeader()->getContext(); |
1150 | Value *Vals[] = {MDString::get(Context, Name), |
1151 | ConstantInt::get(Type::getInt32Ty(Context), V)}; |
1152 | return MDNode::get(Context, Vals); |
1153 | } |
1154 | |
1155 | /// Matches metadata with hint name. |
1156 | bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) { |
1157 | MDString* Name = dyn_cast<MDString>(Node->getOperand(0)); |
1158 | if (!Name) |
1159 | return false; |
1160 | |
1161 | for (auto H : HintTypes) |
1162 | if (Name->getString().endswith(H.Name)) |
1163 | return true; |
1164 | return false; |
1165 | } |
1166 | |
1167 | /// Sets current hints into loop metadata, keeping other values intact. |
1168 | void writeHintsToMetadata(ArrayRef<Hint> HintTypes) { |
1169 | if (HintTypes.size() == 0) |
1170 | return; |
1171 | |
1172 | // Reserve the first element to LoopID (see below). |
1173 | SmallVector<Value*, 4> Vals(1); |
1174 | // If the loop already has metadata, then ignore the existing operands. |
1175 | MDNode *LoopID = TheLoop->getLoopID(); |
1176 | if (LoopID) { |
1177 | for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { |
1178 | MDNode *Node = cast<MDNode>(LoopID->getOperand(i)); |
1179 | // If node in update list, ignore old value. |
1180 | if (!matchesHintMetadataName(Node, HintTypes)) |
1181 | Vals.push_back(Node); |
1182 | } |
1183 | } |
1184 | |
1185 | // Now, add the missing hints. |
1186 | for (auto H : HintTypes) |
1187 | Vals.push_back( |
1188 | createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); |
1189 | |
1190 | // Replace current metadata node with new one. |
1191 | LLVMContext &Context = TheLoop->getHeader()->getContext(); |
1192 | MDNode *NewLoopID = MDNode::get(Context, Vals); |
1193 | // Set operand 0 to refer to the loop id itself. |
1194 | NewLoopID->replaceOperandWith(0, NewLoopID); |
1195 | |
1196 | TheLoop->setLoopID(NewLoopID); |
1197 | if (LoopID) |
1198 | LoopID->replaceAllUsesWith(NewLoopID); |
1199 | LoopID = NewLoopID; |
Value stored to 'LoopID' is never read | |
1200 | } |
1201 | |
1202 | /// The loop these hints belong to. |
1203 | const Loop *TheLoop; |
1204 | }; |
1205 | |
1206 | static void emitMissedWarning(Function *F, Loop *L, |
1207 | const LoopVectorizeHints &LH) { |
1208 | emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE"loop-vectorize", *F, |
1209 | L->getStartLoc(), LH.emitRemark()); |
1210 | |
1211 | if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { |
1212 | if (LH.getWidth() != 1) |
1213 | emitLoopVectorizeWarning( |
1214 | F->getContext(), *F, L->getStartLoc(), |
1215 | "failed explicitly specified loop vectorization"); |
1216 | else if (LH.getInterleave() != 1) |
1217 | emitLoopInterleaveWarning( |
1218 | F->getContext(), *F, L->getStartLoc(), |
1219 | "failed explicitly specified loop interleaving"); |
1220 | } |
1221 | } |
1222 | |
1223 | static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) { |
1224 | if (L.empty()) |
1225 | return V.push_back(&L); |
1226 | |
1227 | for (Loop *InnerL : L) |
1228 | addInnerLoop(*InnerL, V); |
1229 | } |
1230 | |
1231 | /// The LoopVectorize Pass. |
1232 | struct LoopVectorize : public FunctionPass { |
1233 | /// Pass identification, replacement for typeid |
1234 | static char ID; |
1235 | |
1236 | explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true) |
1237 | : FunctionPass(ID), |
1238 | DisableUnrolling(NoUnrolling), |
1239 | AlwaysVectorize(AlwaysVectorize) { |
1240 | initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); |
1241 | } |
1242 | |
1243 | ScalarEvolution *SE; |
1244 | const DataLayout *DL; |
1245 | LoopInfo *LI; |
1246 | TargetTransformInfo *TTI; |
1247 | DominatorTree *DT; |
1248 | BlockFrequencyInfo *BFI; |
1249 | TargetLibraryInfo *TLI; |
1250 | AliasAnalysis *AA; |
1251 | AssumptionTracker *AT; |
1252 | bool DisableUnrolling; |
1253 | bool AlwaysVectorize; |
1254 | |
1255 | BlockFrequency ColdEntryFreq; |
1256 | |
1257 | bool runOnFunction(Function &F) override { |
1258 | SE = &getAnalysis<ScalarEvolution>(); |
1259 | DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); |
1260 | DL = DLP ? &DLP->getDataLayout() : nullptr; |
1261 | LI = &getAnalysis<LoopInfo>(); |
1262 | TTI = &getAnalysis<TargetTransformInfo>(); |
1263 | DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
1264 | BFI = &getAnalysis<BlockFrequencyInfo>(); |
1265 | TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); |
1266 | AA = &getAnalysis<AliasAnalysis>(); |
1267 | AT = &getAnalysis<AssumptionTracker>(); |
1268 | |
1269 | // Compute some weights outside of the loop over the loops. Compute this |
1270 | // using a BranchProbability to re-use its scaling math. |
1271 | const BranchProbability ColdProb(1, 5); // 20% |
1272 | ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb; |
1273 | |
1274 | // If the target claims to have no vector registers don't attempt |
1275 | // vectorization. |
1276 | if (!TTI->getNumberOfRegisters(true)) |
1277 | return false; |
1278 | |
1279 | if (!DL) { |
1280 | DEBUG(dbgs() << "\nLV: Not vectorizing " << F.getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "\nLV: Not vectorizing " << F.getName() << ": Missing data layout\n"; } } while (0) |
1281 | << ": Missing data layout\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "\nLV: Not vectorizing " << F.getName() << ": Missing data layout\n"; } } while (0); |
1282 | return false; |
1283 | } |
1284 | |
1285 | // Build up a worklist of inner-loops to vectorize. This is necessary as |
1286 | // the act of vectorizing or partially unrolling a loop creates new loops |
1287 | // and can invalidate iterators across the loops. |
1288 | SmallVector<Loop *, 8> Worklist; |
1289 | |
1290 | for (Loop *L : *LI) |
1291 | addInnerLoop(*L, Worklist); |
1292 | |
1293 | LoopsAnalyzed += Worklist.size(); |
1294 | |
1295 | // Now walk the identified inner loops. |
1296 | bool Changed = false; |
1297 | while (!Worklist.empty()) |
1298 | Changed |= processLoop(Worklist.pop_back_val()); |
1299 | |
1300 | // Process each loop nest in the function. |
1301 | return Changed; |
1302 | } |
1303 | |
1304 | bool processLoop(Loop *L) { |
1305 | assert(L->empty() && "Only process inner loops.")((L->empty() && "Only process inner loops.") ? static_cast <void> (0) : __assert_fail ("L->empty() && \"Only process inner loops.\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1305, __PRETTY_FUNCTION__)); |
1306 | |
1307 | #ifndef NDEBUG |
1308 | const std::string DebugLocStr = getDebugLocString(L); |
1309 | #endif /* NDEBUG */ |
1310 | |
1311 | DEBUG(dbgs() << "\nLV: Checking a loop in \""do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "\nLV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\" from " << DebugLocStr << "\n"; } } while (0) |
1312 | << L->getHeader()->getParent()->getName() << "\" from "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "\nLV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\" from " << DebugLocStr << "\n"; } } while (0) |
1313 | << DebugLocStr << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "\nLV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\" from " << DebugLocStr << "\n"; } } while (0); |
1314 | |
1315 | LoopVectorizeHints Hints(L, DisableUnrolling); |
1316 | |
1317 | DEBUG(dbgs() << "LV: Loop hints:"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Loop hints:" << " force=" << (Hints.getForce() == LoopVectorizeHints:: FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints ::FK_Enabled ? "enabled" : "?")) << " width=" << Hints .getWidth() << " unroll=" << Hints.getInterleave( ) << "\n"; } } while (0) |
1318 | << " force="do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Loop hints:" << " force=" << (Hints.getForce() == LoopVectorizeHints:: FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints ::FK_Enabled ? "enabled" : "?")) << " width=" << Hints .getWidth() << " unroll=" << Hints.getInterleave( ) << "\n"; } } while (0) |
1319 | << (Hints.getForce() == LoopVectorizeHints::FK_Disableddo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Loop hints:" << " force=" << (Hints.getForce() == LoopVectorizeHints:: FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints ::FK_Enabled ? "enabled" : "?")) << " width=" << Hints .getWidth() << " unroll=" << Hints.getInterleave( ) << "\n"; } } while (0) |
1320 | ? "disabled"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Loop hints:" << " force=" << (Hints.getForce() == LoopVectorizeHints:: FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints ::FK_Enabled ? "enabled" : "?")) << " width=" << Hints .getWidth() << " unroll=" << Hints.getInterleave( ) << "\n"; } } while (0) |
1321 | : (Hints.getForce() == LoopVectorizeHints::FK_Enableddo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Loop hints:" << " force=" << (Hints.getForce() == LoopVectorizeHints:: FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints ::FK_Enabled ? "enabled" : "?")) << " width=" << Hints .getWidth() << " unroll=" << Hints.getInterleave( ) << "\n"; } } while (0) |
1322 | ? "enabled"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Loop hints:" << " force=" << (Hints.getForce() == LoopVectorizeHints:: FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints ::FK_Enabled ? "enabled" : "?")) << " width=" << Hints .getWidth() << " unroll=" << Hints.getInterleave( ) << "\n"; } } while (0) |
1323 | : "?")) << " width=" << Hints.getWidth()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Loop hints:" << " force=" << (Hints.getForce() == LoopVectorizeHints:: FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints ::FK_Enabled ? "enabled" : "?")) << " width=" << Hints .getWidth() << " unroll=" << Hints.getInterleave( ) << "\n"; } } while (0) |
1324 | << " unroll=" << Hints.getInterleave() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Loop hints:" << " force=" << (Hints.getForce() == LoopVectorizeHints:: FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints ::FK_Enabled ? "enabled" : "?")) << " width=" << Hints .getWidth() << " unroll=" << Hints.getInterleave( ) << "\n"; } } while (0); |
1325 | |
1326 | // Function containing loop |
1327 | Function *F = L->getHeader()->getParent(); |
1328 | |
1329 | // Looking at the diagnostic output is the only way to determine if a loop |
1330 | // was vectorized (other than looking at the IR or machine code), so it |
1331 | // is important to generate an optimization remark for each loop. Most of |
1332 | // these messages are generated by emitOptimizationRemarkAnalysis. Remarks |
1333 | // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are |
1334 | // less verbose reporting vectorized loops and unvectorized loops that may |
1335 | // benefit from vectorization, respectively. |
1336 | |
1337 | if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) { |
1338 | DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n" ; } } while (0); |
1339 | emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE"loop-vectorize", *F, |
1340 | L->getStartLoc(), Hints.emitRemark()); |
1341 | return false; |
1342 | } |
1343 | |
1344 | if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) { |
1345 | DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n" ; } } while (0); |
1346 | emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE"loop-vectorize", *F, |
1347 | L->getStartLoc(), Hints.emitRemark()); |
1348 | return false; |
1349 | } |
1350 | |
1351 | if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) { |
1352 | DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n" ; } } while (0); |
1353 | emitOptimizationRemarkAnalysis( |
1354 | F->getContext(), DEBUG_TYPE"loop-vectorize", *F, L->getStartLoc(), |
1355 | "loop not vectorized: vector width and interleave count are " |
1356 | "explicitly set to 1"); |
1357 | return false; |
1358 | } |
1359 | |
1360 | // Check the loop for a trip count threshold: |
1361 | // do not vectorize loops with a tiny trip count. |
1362 | const unsigned TC = SE->getSmallConstantTripCount(L); |
1363 | if (TC > 0u && TC < TinyTripCountVectorThreshold) { |
1364 | DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is not worth vectorizing."; } } while (0 ) |
1365 | << "This loop is not worth vectorizing.")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is not worth vectorizing."; } } while (0 ); |
1366 | if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) |
1367 | DEBUG(dbgs() << " But vectorizing was explicitly forced.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << " But vectorizing was explicitly forced.\n" ; } } while (0); |
1368 | else { |
1369 | DEBUG(dbgs() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "\n"; } } while (0); |
1370 | emitOptimizationRemarkAnalysis( |
1371 | F->getContext(), DEBUG_TYPE"loop-vectorize", *F, L->getStartLoc(), |
1372 | "vectorization is not beneficial and is not explicitly forced"); |
1373 | return false; |
1374 | } |
1375 | } |
1376 | |
1377 | // Check if it is legal to vectorize the loop. |
1378 | LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F); |
1379 | if (!LVL.canVectorize()) { |
1380 | DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Not vectorizing: Cannot prove legality.\n" ; } } while (0); |
1381 | emitMissedWarning(F, L, Hints); |
1382 | return false; |
1383 | } |
1384 | |
1385 | // Use the cost model. |
1386 | LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, AT, F, |
1387 | &Hints); |
1388 | |
1389 | // Check the function attributes to find out if this function should be |
1390 | // optimized for size. |
1391 | bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled && |
1392 | F->hasFnAttribute(Attribute::OptimizeForSize); |
1393 | |
1394 | // Compute the weighted frequency of this loop being executed and see if it |
1395 | // is less than 20% of the function entry baseline frequency. Note that we |
1396 | // always have a canonical loop here because we think we *can* vectorize. |
1397 | // FIXME: This is hidden behind a flag due to pervasive problems with |
1398 | // exactly what block frequency models. |
1399 | if (LoopVectorizeWithBlockFrequency) { |
1400 | BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader()); |
1401 | if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && |
1402 | LoopEntryFreq < ColdEntryFreq) |
1403 | OptForSize = true; |
1404 | } |
1405 | |
1406 | // Check the function attributes to see if implicit floats are allowed. |
1407 | // FIXME: This check doesn't seem possibly correct -- what if the loop is |
1408 | // an integer loop and the vector instructions selected are purely integer |
1409 | // vector instructions? |
1410 | if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { |
1411 | DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Can't vectorize when the NoImplicitFloat" "attribute is used.\n"; } } while (0) |
1412 | "attribute is used.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Can't vectorize when the NoImplicitFloat" "attribute is used.\n"; } } while (0); |
1413 | emitOptimizationRemarkAnalysis( |
1414 | F->getContext(), DEBUG_TYPE"loop-vectorize", *F, L->getStartLoc(), |
1415 | "loop not vectorized due to NoImplicitFloat attribute"); |
1416 | emitMissedWarning(F, L, Hints); |
1417 | return false; |
1418 | } |
1419 | |
1420 | // Select the optimal vectorization factor. |
1421 | const LoopVectorizationCostModel::VectorizationFactor VF = |
1422 | CM.selectVectorizationFactor(OptForSize); |
1423 | |
1424 | // Select the unroll factor. |
1425 | const unsigned UF = |
1426 | CM.selectUnrollFactor(OptForSize, VF.Width, VF.Cost); |
1427 | |
1428 | DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " << DebugLocStr << '\n'; } } while (0) |
1429 | << DebugLocStr << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " << DebugLocStr << '\n'; } } while (0); |
1430 | DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Unroll Factor is " << UF << '\n'; } } while (0); |
1431 | |
1432 | if (VF.Width == 1) { |
1433 | DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Vectorization is possible but not beneficial\n" ; } } while (0); |
1434 | |
1435 | if (UF == 1) { |
1436 | emitOptimizationRemarkAnalysis( |
1437 | F->getContext(), DEBUG_TYPE"loop-vectorize", *F, L->getStartLoc(), |
1438 | "not beneficial to vectorize and user disabled interleaving"); |
1439 | return false; |
1440 | } |
1441 | DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Trying to at least unroll the loops.\n" ; } } while (0); |
1442 | |
1443 | // Report the unrolling decision. |
1444 | emitOptimizationRemark(F->getContext(), DEBUG_TYPE"loop-vectorize", *F, L->getStartLoc(), |
1445 | Twine("unrolled with interleaving factor " + |
1446 | Twine(UF) + |
1447 | " (vectorization not beneficial)")); |
1448 | |
1449 | // We decided not to vectorize, but we may want to unroll. |
1450 | |
1451 | InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF); |
1452 | Unroller.vectorize(&LVL); |
1453 | } else { |
1454 | // If we decided that it is *legal* to vectorize the loop then do it. |
1455 | InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF); |
1456 | LB.vectorize(&LVL); |
1457 | ++LoopsVectorized; |
1458 | |
1459 | // Report the vectorization decision. |
1460 | emitOptimizationRemark( |
1461 | F->getContext(), DEBUG_TYPE"loop-vectorize", *F, L->getStartLoc(), |
1462 | Twine("vectorized loop (vectorization factor: ") + Twine(VF.Width) + |
1463 | ", unrolling interleave factor: " + Twine(UF) + ")"); |
1464 | } |
1465 | |
1466 | // Mark the loop as already vectorized to avoid vectorizing again. |
1467 | Hints.setAlreadyVectorized(); |
1468 | |
1469 | DEBUG(verifyFunction(*L->getHeader()->getParent()))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { verifyFunction(*L->getHeader()->getParent ()); } } while (0); |
1470 | return true; |
1471 | } |
1472 | |
1473 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
1474 | AU.addRequired<AssumptionTracker>(); |
1475 | AU.addRequiredID(LoopSimplifyID); |
1476 | AU.addRequiredID(LCSSAID); |
1477 | AU.addRequired<BlockFrequencyInfo>(); |
1478 | AU.addRequired<DominatorTreeWrapperPass>(); |
1479 | AU.addRequired<LoopInfo>(); |
1480 | AU.addRequired<ScalarEvolution>(); |
1481 | AU.addRequired<TargetTransformInfo>(); |
1482 | AU.addRequired<AliasAnalysis>(); |
1483 | AU.addPreserved<LoopInfo>(); |
1484 | AU.addPreserved<DominatorTreeWrapperPass>(); |
1485 | AU.addPreserved<AliasAnalysis>(); |
1486 | } |
1487 | |
1488 | }; |
1489 | |
1490 | } // end anonymous namespace |
1491 | |
1492 | //===----------------------------------------------------------------------===// |
1493 | // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and |
1494 | // LoopVectorizationCostModel. |
1495 | //===----------------------------------------------------------------------===// |
1496 | |
1497 | static Value *stripIntegerCast(Value *V) { |
1498 | if (CastInst *CI = dyn_cast<CastInst>(V)) |
1499 | if (CI->getOperand(0)->getType()->isIntegerTy()) |
1500 | return CI->getOperand(0); |
1501 | return V; |
1502 | } |
1503 | |
1504 | ///\brief Replaces the symbolic stride in a pointer SCEV expression by one. |
1505 | /// |
1506 | /// If \p OrigPtr is not null, use it to look up the stride value instead of |
1507 | /// \p Ptr. |
1508 | static const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE, |
1509 | ValueToValueMap &PtrToStride, |
1510 | Value *Ptr, Value *OrigPtr = nullptr) { |
1511 | |
1512 | const SCEV *OrigSCEV = SE->getSCEV(Ptr); |
1513 | |
1514 | // If there is an entry in the map return the SCEV of the pointer with the |
1515 | // symbolic stride replaced by one. |
1516 | ValueToValueMap::iterator SI = PtrToStride.find(OrigPtr ? OrigPtr : Ptr); |
1517 | if (SI != PtrToStride.end()) { |
1518 | Value *StrideVal = SI->second; |
1519 | |
1520 | // Strip casts. |
1521 | StrideVal = stripIntegerCast(StrideVal); |
1522 | |
1523 | // Replace symbolic stride by one. |
1524 | Value *One = ConstantInt::get(StrideVal->getType(), 1); |
1525 | ValueToValueMap RewriteMap; |
1526 | RewriteMap[StrideVal] = One; |
1527 | |
1528 | const SCEV *ByOne = |
1529 | SCEVParameterRewriter::rewrite(OrigSCEV, *SE, RewriteMap, true); |
1530 | DEBUG(dbgs() << "LV: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOnedo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOne << "\n"; } } while (0) |
1531 | << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOne << "\n"; } } while (0); |
1532 | return ByOne; |
1533 | } |
1534 | |
1535 | // Otherwise, just return the SCEV of the original pointer. |
1536 | return SE->getSCEV(Ptr); |
1537 | } |
1538 | |
1539 | void LoopVectorizationLegality::RuntimePointerCheck::insert( |
1540 | ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, |
1541 | unsigned ASId, ValueToValueMap &Strides) { |
1542 | // Get the stride replaced scev. |
1543 | const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr); |
1544 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); |
1545 | assert(AR && "Invalid addrec expression")((AR && "Invalid addrec expression") ? static_cast< void> (0) : __assert_fail ("AR && \"Invalid addrec expression\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1545, __PRETTY_FUNCTION__)); |
1546 | const SCEV *Ex = SE->getBackedgeTakenCount(Lp); |
1547 | const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); |
1548 | Pointers.push_back(Ptr); |
1549 | Starts.push_back(AR->getStart()); |
1550 | Ends.push_back(ScEnd); |
1551 | IsWritePtr.push_back(WritePtr); |
1552 | DependencySetId.push_back(DepSetId); |
1553 | AliasSetId.push_back(ASId); |
1554 | } |
1555 | |
1556 | Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { |
1557 | // We need to place the broadcast of invariant variables outside the loop. |
1558 | Instruction *Instr = dyn_cast<Instruction>(V); |
1559 | bool NewInstr = |
1560 | (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(), |
1561 | Instr->getParent()) != LoopVectorBody.end()); |
1562 | bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr; |
1563 | |
1564 | // Place the code for broadcasting invariant variables in the new preheader. |
1565 | IRBuilder<>::InsertPointGuard Guard(Builder); |
1566 | if (Invariant) |
1567 | Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); |
1568 | |
1569 | // Broadcast the scalar into all locations in the vector. |
1570 | Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); |
1571 | |
1572 | return Shuf; |
1573 | } |
1574 | |
1575 | Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, |
1576 | bool Negate) { |
1577 | assert(Val->getType()->isVectorTy() && "Must be a vector")((Val->getType()->isVectorTy() && "Must be a vector" ) ? static_cast<void> (0) : __assert_fail ("Val->getType()->isVectorTy() && \"Must be a vector\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1577, __PRETTY_FUNCTION__)); |
1578 | assert(Val->getType()->getScalarType()->isIntegerTy() &&((Val->getType()->getScalarType()->isIntegerTy() && "Elem must be an integer") ? static_cast<void> (0) : __assert_fail ("Val->getType()->getScalarType()->isIntegerTy() && \"Elem must be an integer\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1579, __PRETTY_FUNCTION__)) |
1579 | "Elem must be an integer")((Val->getType()->getScalarType()->isIntegerTy() && "Elem must be an integer") ? static_cast<void> (0) : __assert_fail ("Val->getType()->getScalarType()->isIntegerTy() && \"Elem must be an integer\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1579, __PRETTY_FUNCTION__)); |
1580 | // Create the types. |
1581 | Type *ITy = Val->getType()->getScalarType(); |
1582 | VectorType *Ty = cast<VectorType>(Val->getType()); |
1583 | int VLen = Ty->getNumElements(); |
1584 | SmallVector<Constant*, 8> Indices; |
1585 | |
1586 | // Create a vector of consecutive numbers from zero to VF. |
1587 | for (int i = 0; i < VLen; ++i) { |
1588 | int64_t Idx = Negate ? (-i) : i; |
1589 | Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate)); |
1590 | } |
1591 | |
1592 | // Add the consecutive indices to the vector value. |
1593 | Constant *Cv = ConstantVector::get(Indices); |
1594 | assert(Cv->getType() == Val->getType() && "Invalid consecutive vec")((Cv->getType() == Val->getType() && "Invalid consecutive vec" ) ? static_cast<void> (0) : __assert_fail ("Cv->getType() == Val->getType() && \"Invalid consecutive vec\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1594, __PRETTY_FUNCTION__)); |
1595 | return Builder.CreateAdd(Val, Cv, "induction"); |
1596 | } |
1597 | |
1598 | /// \brief Find the operand of the GEP that should be checked for consecutive |
1599 | /// stores. This ignores trailing indices that have no effect on the final |
1600 | /// pointer. |
1601 | static unsigned getGEPInductionOperand(const DataLayout *DL, |
1602 | const GetElementPtrInst *Gep) { |
1603 | unsigned LastOperand = Gep->getNumOperands() - 1; |
1604 | unsigned GEPAllocSize = DL->getTypeAllocSize( |
1605 | cast<PointerType>(Gep->getType()->getScalarType())->getElementType()); |
1606 | |
1607 | // Walk backwards and try to peel off zeros. |
1608 | while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) { |
1609 | // Find the type we're currently indexing into. |
1610 | gep_type_iterator GEPTI = gep_type_begin(Gep); |
1611 | std::advance(GEPTI, LastOperand - 1); |
1612 | |
1613 | // If it's a type with the same allocation size as the result of the GEP we |
1614 | // can peel off the zero index. |
1615 | if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize) |
1616 | break; |
1617 | --LastOperand; |
1618 | } |
1619 | |
1620 | return LastOperand; |
1621 | } |
1622 | |
1623 | int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { |
1624 | assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr")((Ptr->getType()->isPointerTy() && "Unexpected non-ptr" ) ? static_cast<void> (0) : __assert_fail ("Ptr->getType()->isPointerTy() && \"Unexpected non-ptr\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1624, __PRETTY_FUNCTION__)); |
1625 | // Make sure that the pointer does not point to structs. |
1626 | if (Ptr->getType()->getPointerElementType()->isAggregateType()) |
1627 | return 0; |
1628 | |
1629 | // If this value is a pointer induction variable we know it is consecutive. |
1630 | PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); |
1631 | if (Phi && Inductions.count(Phi)) { |
1632 | InductionInfo II = Inductions[Phi]; |
1633 | if (IK_PtrInduction == II.IK) |
1634 | return 1; |
1635 | else if (IK_ReversePtrInduction == II.IK) |
1636 | return -1; |
1637 | } |
1638 | |
1639 | GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); |
1640 | if (!Gep) |
1641 | return 0; |
1642 | |
1643 | unsigned NumOperands = Gep->getNumOperands(); |
1644 | Value *GpPtr = Gep->getPointerOperand(); |
1645 | // If this GEP value is a consecutive pointer induction variable and all of |
1646 | // the indices are constant then we know it is consecutive. We can |
1647 | Phi = dyn_cast<PHINode>(GpPtr); |
1648 | if (Phi && Inductions.count(Phi)) { |
1649 | |
1650 | // Make sure that the pointer does not point to structs. |
1651 | PointerType *GepPtrType = cast<PointerType>(GpPtr->getType()); |
1652 | if (GepPtrType->getElementType()->isAggregateType()) |
1653 | return 0; |
1654 | |
1655 | // Make sure that all of the index operands are loop invariant. |
1656 | for (unsigned i = 1; i < NumOperands; ++i) |
1657 | if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) |
1658 | return 0; |
1659 | |
1660 | InductionInfo II = Inductions[Phi]; |
1661 | if (IK_PtrInduction == II.IK) |
1662 | return 1; |
1663 | else if (IK_ReversePtrInduction == II.IK) |
1664 | return -1; |
1665 | } |
1666 | |
1667 | unsigned InductionOperand = getGEPInductionOperand(DL, Gep); |
1668 | |
1669 | // Check that all of the gep indices are uniform except for our induction |
1670 | // operand. |
1671 | for (unsigned i = 0; i != NumOperands; ++i) |
1672 | if (i != InductionOperand && |
1673 | !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) |
1674 | return 0; |
1675 | |
1676 | // We can emit wide load/stores only if the last non-zero index is the |
1677 | // induction variable. |
1678 | const SCEV *Last = nullptr; |
1679 | if (!Strides.count(Gep)) |
1680 | Last = SE->getSCEV(Gep->getOperand(InductionOperand)); |
1681 | else { |
1682 | // Because of the multiplication by a stride we can have a s/zext cast. |
1683 | // We are going to replace this stride by 1 so the cast is safe to ignore. |
1684 | // |
1685 | // %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] |
1686 | // %0 = trunc i64 %indvars.iv to i32 |
1687 | // %mul = mul i32 %0, %Stride1 |
1688 | // %idxprom = zext i32 %mul to i64 << Safe cast. |
1689 | // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom |
1690 | // |
1691 | Last = replaceSymbolicStrideSCEV(SE, Strides, |
1692 | Gep->getOperand(InductionOperand), Gep); |
1693 | if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last)) |
1694 | Last = |
1695 | (C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend) |
1696 | ? C->getOperand() |
1697 | : Last; |
1698 | } |
1699 | if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { |
1700 | const SCEV *Step = AR->getStepRecurrence(*SE); |
1701 | |
1702 | // The memory is consecutive because the last index is consecutive |
1703 | // and all other indices are loop invariant. |
1704 | if (Step->isOne()) |
1705 | return 1; |
1706 | if (Step->isAllOnesValue()) |
1707 | return -1; |
1708 | } |
1709 | |
1710 | return 0; |
1711 | } |
1712 | |
1713 | bool LoopVectorizationLegality::isUniform(Value *V) { |
1714 | return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); |
1715 | } |
1716 | |
1717 | InnerLoopVectorizer::VectorParts& |
1718 | InnerLoopVectorizer::getVectorValue(Value *V) { |
1719 | assert(V != Induction && "The new induction variable should not be used.")((V != Induction && "The new induction variable should not be used." ) ? static_cast<void> (0) : __assert_fail ("V != Induction && \"The new induction variable should not be used.\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1719, __PRETTY_FUNCTION__)); |
1720 | assert(!V->getType()->isVectorTy() && "Can't widen a vector")((!V->getType()->isVectorTy() && "Can't widen a vector" ) ? static_cast<void> (0) : __assert_fail ("!V->getType()->isVectorTy() && \"Can't widen a vector\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1720, __PRETTY_FUNCTION__)); |
1721 | |
1722 | // If we have a stride that is replaced by one, do it here. |
1723 | if (Legal->hasStride(V)) |
1724 | V = ConstantInt::get(V->getType(), 1); |
1725 | |
1726 | // If we have this scalar in the map, return it. |
1727 | if (WidenMap.has(V)) |
1728 | return WidenMap.get(V); |
1729 | |
1730 | // If this scalar is unknown, assume that it is a constant or that it is |
1731 | // loop invariant. Broadcast V and save the value for future uses. |
1732 | Value *B = getBroadcastInstrs(V); |
1733 | return WidenMap.splat(V, B); |
1734 | } |
1735 | |
1736 | Value *InnerLoopVectorizer::reverseVector(Value *Vec) { |
1737 | assert(Vec->getType()->isVectorTy() && "Invalid type")((Vec->getType()->isVectorTy() && "Invalid type" ) ? static_cast<void> (0) : __assert_fail ("Vec->getType()->isVectorTy() && \"Invalid type\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1737, __PRETTY_FUNCTION__)); |
1738 | SmallVector<Constant*, 8> ShuffleMask; |
1739 | for (unsigned i = 0; i < VF; ++i) |
1740 | ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); |
1741 | |
1742 | return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), |
1743 | ConstantVector::get(ShuffleMask), |
1744 | "reverse"); |
1745 | } |
1746 | |
1747 | void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { |
1748 | // Attempt to issue a wide load. |
1749 | LoadInst *LI = dyn_cast<LoadInst>(Instr); |
1750 | StoreInst *SI = dyn_cast<StoreInst>(Instr); |
1751 | |
1752 | assert((LI || SI) && "Invalid Load/Store instruction")(((LI || SI) && "Invalid Load/Store instruction") ? static_cast <void> (0) : __assert_fail ("(LI || SI) && \"Invalid Load/Store instruction\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1752, __PRETTY_FUNCTION__)); |
1753 | |
1754 | Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); |
1755 | Type *DataTy = VectorType::get(ScalarDataTy, VF); |
1756 | Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); |
1757 | unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); |
1758 | // An alignment of 0 means target abi alignment. We need to use the scalar's |
1759 | // target abi alignment in such a case. |
1760 | if (!Alignment) |
1761 | Alignment = DL->getABITypeAlignment(ScalarDataTy); |
1762 | unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); |
1763 | unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy); |
1764 | unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF; |
1765 | |
1766 | if (SI && Legal->blockNeedsPredication(SI->getParent())) |
1767 | return scalarizeInstruction(Instr, true); |
1768 | |
1769 | if (ScalarAllocatedSize != VectorElementSize) |
1770 | return scalarizeInstruction(Instr); |
1771 | |
1772 | // If the pointer is loop invariant or if it is non-consecutive, |
1773 | // scalarize the load. |
1774 | int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); |
1775 | bool Reverse = ConsecutiveStride < 0; |
1776 | bool UniformLoad = LI && Legal->isUniform(Ptr); |
1777 | if (!ConsecutiveStride || UniformLoad) |
1778 | return scalarizeInstruction(Instr); |
1779 | |
1780 | Constant *Zero = Builder.getInt32(0); |
1781 | VectorParts &Entry = WidenMap.get(Instr); |
1782 | |
1783 | // Handle consecutive loads/stores. |
1784 | GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); |
1785 | if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { |
1786 | setDebugLocFromInst(Builder, Gep); |
1787 | Value *PtrOperand = Gep->getPointerOperand(); |
1788 | Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; |
1789 | FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); |
1790 | |
1791 | // Create the new GEP with the new induction variable. |
1792 | GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); |
1793 | Gep2->setOperand(0, FirstBasePtr); |
1794 | Gep2->setName("gep.indvar.base"); |
1795 | Ptr = Builder.Insert(Gep2); |
1796 | } else if (Gep) { |
1797 | setDebugLocFromInst(Builder, Gep); |
1798 | assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),((SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand ()), OrigLoop) && "Base ptr must be invariant") ? static_cast <void> (0) : __assert_fail ("SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()), OrigLoop) && \"Base ptr must be invariant\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1799, __PRETTY_FUNCTION__)) |
1799 | OrigLoop) && "Base ptr must be invariant")((SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand ()), OrigLoop) && "Base ptr must be invariant") ? static_cast <void> (0) : __assert_fail ("SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()), OrigLoop) && \"Base ptr must be invariant\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1799, __PRETTY_FUNCTION__)); |
1800 | |
1801 | // The last index does not have to be the induction. It can be |
1802 | // consecutive and be a function of the index. For example A[I+1]; |
1803 | unsigned NumOperands = Gep->getNumOperands(); |
1804 | unsigned InductionOperand = getGEPInductionOperand(DL, Gep); |
1805 | // Create the new GEP with the new induction variable. |
1806 | GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); |
1807 | |
1808 | for (unsigned i = 0; i < NumOperands; ++i) { |
1809 | Value *GepOperand = Gep->getOperand(i); |
1810 | Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand); |
1811 | |
1812 | // Update last index or loop invariant instruction anchored in loop. |
1813 | if (i == InductionOperand || |
1814 | (GepOperandInst && OrigLoop->contains(GepOperandInst))) { |
1815 | assert((i == InductionOperand ||(((i == InductionOperand || SE->isLoopInvariant(SE->getSCEV (GepOperandInst), OrigLoop)) && "Must be last index or loop invariant" ) ? static_cast<void> (0) : __assert_fail ("(i == InductionOperand || SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) && \"Must be last index or loop invariant\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1817, __PRETTY_FUNCTION__)) |
1816 | SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&(((i == InductionOperand || SE->isLoopInvariant(SE->getSCEV (GepOperandInst), OrigLoop)) && "Must be last index or loop invariant" ) ? static_cast<void> (0) : __assert_fail ("(i == InductionOperand || SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) && \"Must be last index or loop invariant\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1817, __PRETTY_FUNCTION__)) |
1817 | "Must be last index or loop invariant")(((i == InductionOperand || SE->isLoopInvariant(SE->getSCEV (GepOperandInst), OrigLoop)) && "Must be last index or loop invariant" ) ? static_cast<void> (0) : __assert_fail ("(i == InductionOperand || SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) && \"Must be last index or loop invariant\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1817, __PRETTY_FUNCTION__)); |
1818 | |
1819 | VectorParts &GEPParts = getVectorValue(GepOperand); |
1820 | Value *Index = GEPParts[0]; |
1821 | Index = Builder.CreateExtractElement(Index, Zero); |
1822 | Gep2->setOperand(i, Index); |
1823 | Gep2->setName("gep.indvar.idx"); |
1824 | } |
1825 | } |
1826 | Ptr = Builder.Insert(Gep2); |
1827 | } else { |
1828 | // Use the induction element ptr. |
1829 | assert(isa<PHINode>(Ptr) && "Invalid induction ptr")((isa<PHINode>(Ptr) && "Invalid induction ptr") ? static_cast<void> (0) : __assert_fail ("isa<PHINode>(Ptr) && \"Invalid induction ptr\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1829, __PRETTY_FUNCTION__)); |
1830 | setDebugLocFromInst(Builder, Ptr); |
1831 | VectorParts &PtrVal = getVectorValue(Ptr); |
1832 | Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); |
1833 | } |
1834 | |
1835 | // Handle Stores: |
1836 | if (SI) { |
1837 | assert(!Legal->isUniform(SI->getPointerOperand()) &&((!Legal->isUniform(SI->getPointerOperand()) && "We do not allow storing to uniform addresses") ? static_cast <void> (0) : __assert_fail ("!Legal->isUniform(SI->getPointerOperand()) && \"We do not allow storing to uniform addresses\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1838, __PRETTY_FUNCTION__)) |
1838 | "We do not allow storing to uniform addresses")((!Legal->isUniform(SI->getPointerOperand()) && "We do not allow storing to uniform addresses") ? static_cast <void> (0) : __assert_fail ("!Legal->isUniform(SI->getPointerOperand()) && \"We do not allow storing to uniform addresses\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1838, __PRETTY_FUNCTION__)); |
1839 | setDebugLocFromInst(Builder, SI); |
1840 | // We don't want to update the value in the map as it might be used in |
1841 | // another expression. So don't use a reference type for "StoredVal". |
1842 | VectorParts StoredVal = getVectorValue(SI->getValueOperand()); |
1843 | |
1844 | for (unsigned Part = 0; Part < UF; ++Part) { |
1845 | // Calculate the pointer for the specific unroll-part. |
1846 | Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); |
1847 | |
1848 | if (Reverse) { |
1849 | // If we store to reverse consecutive memory locations then we need |
1850 | // to reverse the order of elements in the stored value. |
1851 | StoredVal[Part] = reverseVector(StoredVal[Part]); |
1852 | // If the address is consecutive but reversed, then the |
1853 | // wide store needs to start at the last vector element. |
1854 | PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); |
1855 | PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); |
1856 | } |
1857 | |
1858 | Value *VecPtr = Builder.CreateBitCast(PartPtr, |
1859 | DataTy->getPointerTo(AddressSpace)); |
1860 | StoreInst *NewSI = |
1861 | Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment); |
1862 | propagateMetadata(NewSI, SI); |
1863 | } |
1864 | return; |
1865 | } |
1866 | |
1867 | // Handle loads. |
1868 | assert(LI && "Must have a load instruction")((LI && "Must have a load instruction") ? static_cast <void> (0) : __assert_fail ("LI && \"Must have a load instruction\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1868, __PRETTY_FUNCTION__)); |
1869 | setDebugLocFromInst(Builder, LI); |
1870 | for (unsigned Part = 0; Part < UF; ++Part) { |
1871 | // Calculate the pointer for the specific unroll-part. |
1872 | Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); |
1873 | |
1874 | if (Reverse) { |
1875 | // If the address is consecutive but reversed, then the |
1876 | // wide load needs to start at the last vector element. |
1877 | PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); |
1878 | PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); |
1879 | } |
1880 | |
1881 | Value *VecPtr = Builder.CreateBitCast(PartPtr, |
1882 | DataTy->getPointerTo(AddressSpace)); |
1883 | LoadInst *NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load"); |
1884 | propagateMetadata(NewLI, LI); |
1885 | Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI; |
1886 | } |
1887 | } |
1888 | |
1889 | void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) { |
1890 | assert(!Instr->getType()->isAggregateType() && "Can't handle vectors")((!Instr->getType()->isAggregateType() && "Can't handle vectors" ) ? static_cast<void> (0) : __assert_fail ("!Instr->getType()->isAggregateType() && \"Can't handle vectors\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1890, __PRETTY_FUNCTION__)); |
1891 | // Holds vector parameters or scalars, in case of uniform vals. |
1892 | SmallVector<VectorParts, 4> Params; |
1893 | |
1894 | setDebugLocFromInst(Builder, Instr); |
1895 | |
1896 | // Find all of the vectorized parameters. |
1897 | for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { |
1898 | Value *SrcOp = Instr->getOperand(op); |
1899 | |
1900 | // If we are accessing the old induction variable, use the new one. |
1901 | if (SrcOp == OldInduction) { |
1902 | Params.push_back(getVectorValue(SrcOp)); |
1903 | continue; |
1904 | } |
1905 | |
1906 | // Try using previously calculated values. |
1907 | Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); |
1908 | |
1909 | // If the src is an instruction that appeared earlier in the basic block |
1910 | // then it should already be vectorized. |
1911 | if (SrcInst && OrigLoop->contains(SrcInst)) { |
1912 | assert(WidenMap.has(SrcInst) && "Source operand is unavailable")((WidenMap.has(SrcInst) && "Source operand is unavailable" ) ? static_cast<void> (0) : __assert_fail ("WidenMap.has(SrcInst) && \"Source operand is unavailable\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1912, __PRETTY_FUNCTION__)); |
1913 | // The parameter is a vector value from earlier. |
1914 | Params.push_back(WidenMap.get(SrcInst)); |
1915 | } else { |
1916 | // The parameter is a scalar from outside the loop. Maybe even a constant. |
1917 | VectorParts Scalars; |
1918 | Scalars.append(UF, SrcOp); |
1919 | Params.push_back(Scalars); |
1920 | } |
1921 | } |
1922 | |
1923 | assert(Params.size() == Instr->getNumOperands() &&((Params.size() == Instr->getNumOperands() && "Invalid number of operands" ) ? static_cast<void> (0) : __assert_fail ("Params.size() == Instr->getNumOperands() && \"Invalid number of operands\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1924, __PRETTY_FUNCTION__)) |
1924 | "Invalid number of operands")((Params.size() == Instr->getNumOperands() && "Invalid number of operands" ) ? static_cast<void> (0) : __assert_fail ("Params.size() == Instr->getNumOperands() && \"Invalid number of operands\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1924, __PRETTY_FUNCTION__)); |
1925 | |
1926 | // Does this instruction return a value ? |
1927 | bool IsVoidRetTy = Instr->getType()->isVoidTy(); |
1928 | |
1929 | Value *UndefVec = IsVoidRetTy ? nullptr : |
1930 | UndefValue::get(VectorType::get(Instr->getType(), VF)); |
1931 | // Create a new entry in the WidenMap and initialize it to Undef or Null. |
1932 | VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); |
1933 | |
1934 | Instruction *InsertPt = Builder.GetInsertPoint(); |
1935 | BasicBlock *IfBlock = Builder.GetInsertBlock(); |
1936 | BasicBlock *CondBlock = nullptr; |
1937 | |
1938 | VectorParts Cond; |
1939 | Loop *VectorLp = nullptr; |
1940 | if (IfPredicateStore) { |
1941 | assert(Instr->getParent()->getSinglePredecessor() &&((Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks") ? static_cast<void > (0) : __assert_fail ("Instr->getParent()->getSinglePredecessor() && \"Only support single predecessor blocks\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1942, __PRETTY_FUNCTION__)) |
1942 | "Only support single predecessor blocks")((Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks") ? static_cast<void > (0) : __assert_fail ("Instr->getParent()->getSinglePredecessor() && \"Only support single predecessor blocks\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1942, __PRETTY_FUNCTION__)); |
1943 | Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), |
1944 | Instr->getParent()); |
1945 | VectorLp = LI->getLoopFor(IfBlock); |
1946 | assert(VectorLp && "Must have a loop for this block")((VectorLp && "Must have a loop for this block") ? static_cast <void> (0) : __assert_fail ("VectorLp && \"Must have a loop for this block\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 1946, __PRETTY_FUNCTION__)); |
1947 | } |
1948 | |
1949 | // For each vector unroll 'part': |
1950 | for (unsigned Part = 0; Part < UF; ++Part) { |
1951 | // For each scalar that we create: |
1952 | for (unsigned Width = 0; Width < VF; ++Width) { |
1953 | |
1954 | // Start if-block. |
1955 | Value *Cmp = nullptr; |
1956 | if (IfPredicateStore) { |
1957 | Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width)); |
1958 | Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); |
1959 | CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); |
1960 | LoopVectorBody.push_back(CondBlock); |
1961 | VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase()); |
1962 | // Update Builder with newly created basic block. |
1963 | Builder.SetInsertPoint(InsertPt); |
1964 | } |
1965 | |
1966 | Instruction *Cloned = Instr->clone(); |
1967 | if (!IsVoidRetTy) |
1968 | Cloned->setName(Instr->getName() + ".cloned"); |
1969 | // Replace the operands of the cloned instructions with extracted scalars. |
1970 | for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { |
1971 | Value *Op = Params[op][Part]; |
1972 | // Param is a vector. Need to extract the right lane. |
1973 | if (Op->getType()->isVectorTy()) |
1974 | Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width)); |
1975 | Cloned->setOperand(op, Op); |
1976 | } |
1977 | |
1978 | // Place the cloned scalar in the new loop. |
1979 | Builder.Insert(Cloned); |
1980 | |
1981 | // If the original scalar returns a value we need to place it in a vector |
1982 | // so that future users will be able to use it. |
1983 | if (!IsVoidRetTy) |
1984 | VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, |
1985 | Builder.getInt32(Width)); |
1986 | // End if-block. |
1987 | if (IfPredicateStore) { |
1988 | BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); |
1989 | LoopVectorBody.push_back(NewIfBlock); |
1990 | VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase()); |
1991 | Builder.SetInsertPoint(InsertPt); |
1992 | Instruction *OldBr = IfBlock->getTerminator(); |
1993 | BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); |
1994 | OldBr->eraseFromParent(); |
1995 | IfBlock = NewIfBlock; |
1996 | } |
1997 | } |
1998 | } |
1999 | } |
2000 | |
2001 | static Instruction *getFirstInst(Instruction *FirstInst, Value *V, |
2002 | Instruction *Loc) { |
2003 | if (FirstInst) |
2004 | return FirstInst; |
2005 | if (Instruction *I = dyn_cast<Instruction>(V)) |
2006 | return I->getParent() == Loc->getParent() ? I : nullptr; |
2007 | return nullptr; |
2008 | } |
2009 | |
2010 | std::pair<Instruction *, Instruction *> |
2011 | InnerLoopVectorizer::addStrideCheck(Instruction *Loc) { |
2012 | Instruction *tnullptr = nullptr; |
2013 | if (!Legal->mustCheckStrides()) |
2014 | return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr); |
2015 | |
2016 | IRBuilder<> ChkBuilder(Loc); |
2017 | |
2018 | // Emit checks. |
2019 | Value *Check = nullptr; |
2020 | Instruction *FirstInst = nullptr; |
2021 | for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(), |
2022 | SE = Legal->strides_end(); |
2023 | SI != SE; ++SI) { |
2024 | Value *Ptr = stripIntegerCast(*SI); |
2025 | Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1), |
2026 | "stride.chk"); |
2027 | // Store the first instruction we create. |
2028 | FirstInst = getFirstInst(FirstInst, C, Loc); |
2029 | if (Check) |
2030 | Check = ChkBuilder.CreateOr(Check, C); |
2031 | else |
2032 | Check = C; |
2033 | } |
2034 | |
2035 | // We have to do this trickery because the IRBuilder might fold the check to a |
2036 | // constant expression in which case there is no Instruction anchored in a |
2037 | // the block. |
2038 | LLVMContext &Ctx = Loc->getContext(); |
2039 | Instruction *TheCheck = |
2040 | BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx)); |
2041 | ChkBuilder.Insert(TheCheck, "stride.not.one"); |
2042 | FirstInst = getFirstInst(FirstInst, TheCheck, Loc); |
2043 | |
2044 | return std::make_pair(FirstInst, TheCheck); |
2045 | } |
2046 | |
2047 | std::pair<Instruction *, Instruction *> |
2048 | InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) { |
2049 | LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = |
2050 | Legal->getRuntimePointerCheck(); |
2051 | |
2052 | Instruction *tnullptr = nullptr; |
2053 | if (!PtrRtCheck->Need) |
2054 | return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr); |
2055 | |
2056 | unsigned NumPointers = PtrRtCheck->Pointers.size(); |
2057 | SmallVector<TrackingVH<Value> , 2> Starts; |
2058 | SmallVector<TrackingVH<Value> , 2> Ends; |
2059 | |
2060 | LLVMContext &Ctx = Loc->getContext(); |
2061 | SCEVExpander Exp(*SE, "induction"); |
2062 | Instruction *FirstInst = nullptr; |
2063 | |
2064 | for (unsigned i = 0; i < NumPointers; ++i) { |
2065 | Value *Ptr = PtrRtCheck->Pointers[i]; |
2066 | const SCEV *Sc = SE->getSCEV(Ptr); |
2067 | |
2068 | if (SE->isLoopInvariant(Sc, OrigLoop)) { |
2069 | DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Adding RT check for a loop invariant ptr:" << *Ptr <<"\n"; } } while (0) |
2070 | *Ptr <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Adding RT check for a loop invariant ptr:" << *Ptr <<"\n"; } } while (0); |
2071 | Starts.push_back(Ptr); |
2072 | Ends.push_back(Ptr); |
2073 | } else { |
2074 | DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n'; } } while (0); |
2075 | unsigned AS = Ptr->getType()->getPointerAddressSpace(); |
2076 | |
2077 | // Use this type for pointer arithmetic. |
2078 | Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS); |
2079 | |
2080 | Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc); |
2081 | Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); |
2082 | Starts.push_back(Start); |
2083 | Ends.push_back(End); |
2084 | } |
2085 | } |
2086 | |
2087 | IRBuilder<> ChkBuilder(Loc); |
2088 | // Our instructions might fold to a constant. |
2089 | Value *MemoryRuntimeCheck = nullptr; |
2090 | for (unsigned i = 0; i < NumPointers; ++i) { |
2091 | for (unsigned j = i+1; j < NumPointers; ++j) { |
2092 | // No need to check if two readonly pointers intersect. |
2093 | if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j]) |
2094 | continue; |
2095 | |
2096 | // Only need to check pointers between two different dependency sets. |
2097 | if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j]) |
2098 | continue; |
2099 | // Only need to check pointers in the same alias set. |
2100 | if (PtrRtCheck->AliasSetId[i] != PtrRtCheck->AliasSetId[j]) |
2101 | continue; |
2102 | |
2103 | unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace(); |
2104 | unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace(); |
2105 | |
2106 | assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) &&(((AS0 == Ends[j]->getType()->getPointerAddressSpace()) && (AS1 == Ends[i]->getType()->getPointerAddressSpace ()) && "Trying to bounds check pointers with different address spaces" ) ? static_cast<void> (0) : __assert_fail ("(AS0 == Ends[j]->getType()->getPointerAddressSpace()) && (AS1 == Ends[i]->getType()->getPointerAddressSpace()) && \"Trying to bounds check pointers with different address spaces\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2108, __PRETTY_FUNCTION__)) |
2107 | (AS1 == Ends[i]->getType()->getPointerAddressSpace()) &&(((AS0 == Ends[j]->getType()->getPointerAddressSpace()) && (AS1 == Ends[i]->getType()->getPointerAddressSpace ()) && "Trying to bounds check pointers with different address spaces" ) ? static_cast<void> (0) : __assert_fail ("(AS0 == Ends[j]->getType()->getPointerAddressSpace()) && (AS1 == Ends[i]->getType()->getPointerAddressSpace()) && \"Trying to bounds check pointers with different address spaces\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2108, __PRETTY_FUNCTION__)) |
2108 | "Trying to bounds check pointers with different address spaces")(((AS0 == Ends[j]->getType()->getPointerAddressSpace()) && (AS1 == Ends[i]->getType()->getPointerAddressSpace ()) && "Trying to bounds check pointers with different address spaces" ) ? static_cast<void> (0) : __assert_fail ("(AS0 == Ends[j]->getType()->getPointerAddressSpace()) && (AS1 == Ends[i]->getType()->getPointerAddressSpace()) && \"Trying to bounds check pointers with different address spaces\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2108, __PRETTY_FUNCTION__)); |
2109 | |
2110 | Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0); |
2111 | Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1); |
2112 | |
2113 | Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc"); |
2114 | Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc"); |
2115 | Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc"); |
2116 | Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc"); |
2117 | |
2118 | Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); |
2119 | FirstInst = getFirstInst(FirstInst, Cmp0, Loc); |
2120 | Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); |
2121 | FirstInst = getFirstInst(FirstInst, Cmp1, Loc); |
2122 | Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); |
2123 | FirstInst = getFirstInst(FirstInst, IsConflict, Loc); |
2124 | if (MemoryRuntimeCheck) { |
2125 | IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, |
2126 | "conflict.rdx"); |
2127 | FirstInst = getFirstInst(FirstInst, IsConflict, Loc); |
2128 | } |
2129 | MemoryRuntimeCheck = IsConflict; |
2130 | } |
2131 | } |
2132 | |
2133 | // We have to do this trickery because the IRBuilder might fold the check to a |
2134 | // constant expression in which case there is no Instruction anchored in a |
2135 | // the block. |
2136 | Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck, |
2137 | ConstantInt::getTrue(Ctx)); |
2138 | ChkBuilder.Insert(Check, "memcheck.conflict"); |
2139 | FirstInst = getFirstInst(FirstInst, Check, Loc); |
2140 | return std::make_pair(FirstInst, Check); |
2141 | } |
2142 | |
2143 | void InnerLoopVectorizer::createEmptyLoop() { |
2144 | /* |
2145 | In this function we generate a new loop. The new loop will contain |
2146 | the vectorized instructions while the old loop will continue to run the |
2147 | scalar remainder. |
2148 | |
2149 | [ ] <-- Back-edge taken count overflow check. |
2150 | / | |
2151 | / v |
2152 | | [ ] <-- vector loop bypass (may consist of multiple blocks). |
2153 | | / | |
2154 | | / v |
2155 | || [ ] <-- vector pre header. |
2156 | || | |
2157 | || v |
2158 | || [ ] \ |
2159 | || [ ]_| <-- vector loop. |
2160 | || | |
2161 | | \ v |
2162 | | >[ ] <--- middle-block. |
2163 | | / | |
2164 | | / v |
2165 | -|- >[ ] <--- new preheader. |
2166 | | | |
2167 | | v |
2168 | | [ ] \ |
2169 | | [ ]_| <-- old scalar loop to handle remainder. |
2170 | \ | |
2171 | \ v |
2172 | >[ ] <-- exit block. |
2173 | ... |
2174 | */ |
2175 | |
2176 | BasicBlock *OldBasicBlock = OrigLoop->getHeader(); |
2177 | BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); |
2178 | BasicBlock *ExitBlock = OrigLoop->getExitBlock(); |
2179 | assert(BypassBlock && "Invalid loop structure")((BypassBlock && "Invalid loop structure") ? static_cast <void> (0) : __assert_fail ("BypassBlock && \"Invalid loop structure\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2179, __PRETTY_FUNCTION__)); |
2180 | assert(ExitBlock && "Must have an exit block")((ExitBlock && "Must have an exit block") ? static_cast <void> (0) : __assert_fail ("ExitBlock && \"Must have an exit block\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2180, __PRETTY_FUNCTION__)); |
2181 | |
2182 | // Some loops have a single integer induction variable, while other loops |
2183 | // don't. One example is c++ iterators that often have multiple pointer |
2184 | // induction variables. In the code below we also support a case where we |
2185 | // don't have a single induction variable. |
2186 | OldInduction = Legal->getInduction(); |
2187 | Type *IdxTy = Legal->getWidestInductionType(); |
2188 | |
2189 | // Find the loop boundaries. |
2190 | const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop); |
2191 | assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count")((ExitCount != SE->getCouldNotCompute() && "Invalid loop count" ) ? static_cast<void> (0) : __assert_fail ("ExitCount != SE->getCouldNotCompute() && \"Invalid loop count\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2191, __PRETTY_FUNCTION__)); |
2192 | |
2193 | // The exit count might have the type of i64 while the phi is i32. This can |
2194 | // happen if we have an induction variable that is sign extended before the |
2195 | // compare. The only way that we get a backedge taken count is that the |
2196 | // induction variable was signed and as such will not overflow. In such a case |
2197 | // truncation is legal. |
2198 | if (ExitCount->getType()->getPrimitiveSizeInBits() > |
2199 | IdxTy->getPrimitiveSizeInBits()) |
2200 | ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy); |
2201 | |
2202 | const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); |
2203 | // Get the total trip count from the count by adding 1. |
2204 | ExitCount = SE->getAddExpr(BackedgeTakeCount, |
2205 | SE->getConstant(BackedgeTakeCount->getType(), 1)); |
2206 | |
2207 | // Expand the trip count and place the new instructions in the preheader. |
2208 | // Notice that the pre-header does not change, only the loop body. |
2209 | SCEVExpander Exp(*SE, "induction"); |
2210 | |
2211 | // We need to test whether the backedge-taken count is uint##_max. Adding one |
2212 | // to it will cause overflow and an incorrect loop trip count in the vector |
2213 | // body. In case of overflow we want to directly jump to the scalar remainder |
2214 | // loop. |
2215 | Value *BackedgeCount = |
2216 | Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(), |
2217 | BypassBlock->getTerminator()); |
2218 | if (BackedgeCount->getType()->isPointerTy()) |
2219 | BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy, |
2220 | "backedge.ptrcnt.to.int", |
2221 | BypassBlock->getTerminator()); |
2222 | Instruction *CheckBCOverflow = |
2223 | CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount, |
2224 | Constant::getAllOnesValue(BackedgeCount->getType()), |
2225 | "backedge.overflow", BypassBlock->getTerminator()); |
2226 | |
2227 | // The loop index does not have to start at Zero. Find the original start |
2228 | // value from the induction PHI node. If we don't have an induction variable |
2229 | // then we know that it starts at zero. |
2230 | Builder.SetInsertPoint(BypassBlock->getTerminator()); |
2231 | Value *StartIdx = ExtendedIdx = OldInduction ? |
2232 | Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock), |
2233 | IdxTy): |
2234 | ConstantInt::get(IdxTy, 0); |
2235 | |
2236 | // We need an instruction to anchor the overflow check on. StartIdx needs to |
2237 | // be defined before the overflow check branch. Because the scalar preheader |
2238 | // is going to merge the start index and so the overflow branch block needs to |
2239 | // contain a definition of the start index. |
2240 | Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd( |
2241 | StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor", |
2242 | BypassBlock->getTerminator()); |
2243 | |
2244 | // Count holds the overall loop count (N). |
2245 | Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), |
2246 | BypassBlock->getTerminator()); |
2247 | |
2248 | LoopBypassBlocks.push_back(BypassBlock); |
2249 | |
2250 | // Split the single block loop into the two loop structure described above. |
2251 | BasicBlock *VectorPH = |
2252 | BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); |
2253 | BasicBlock *VecBody = |
2254 | VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); |
2255 | BasicBlock *MiddleBlock = |
2256 | VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); |
2257 | BasicBlock *ScalarPH = |
2258 | MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); |
2259 | |
2260 | // Create and register the new vector loop. |
2261 | Loop* Lp = new Loop(); |
2262 | Loop *ParentLoop = OrigLoop->getParentLoop(); |
2263 | |
2264 | // Insert the new loop into the loop nest and register the new basic blocks |
2265 | // before calling any utilities such as SCEV that require valid LoopInfo. |
2266 | if (ParentLoop) { |
2267 | ParentLoop->addChildLoop(Lp); |
2268 | ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); |
2269 | ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); |
2270 | ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); |
2271 | } else { |
2272 | LI->addTopLevelLoop(Lp); |
2273 | } |
2274 | Lp->addBasicBlockToLoop(VecBody, LI->getBase()); |
2275 | |
2276 | // Use this IR builder to create the loop instructions (Phi, Br, Cmp) |
2277 | // inside the loop. |
2278 | Builder.SetInsertPoint(VecBody->getFirstNonPHI()); |
2279 | |
2280 | // Generate the induction variable. |
2281 | setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); |
2282 | Induction = Builder.CreatePHI(IdxTy, 2, "index"); |
2283 | // The loop step is equal to the vectorization factor (num of SIMD elements) |
2284 | // times the unroll factor (num of SIMD instructions). |
2285 | Constant *Step = ConstantInt::get(IdxTy, VF * UF); |
2286 | |
2287 | // This is the IR builder that we use to add all of the logic for bypassing |
2288 | // the new vector loop. |
2289 | IRBuilder<> BypassBuilder(BypassBlock->getTerminator()); |
2290 | setDebugLocFromInst(BypassBuilder, |
2291 | getDebugLocFromInstOrOperands(OldInduction)); |
2292 | |
2293 | // We may need to extend the index in case there is a type mismatch. |
2294 | // We know that the count starts at zero and does not overflow. |
2295 | if (Count->getType() != IdxTy) { |
2296 | // The exit count can be of pointer type. Convert it to the correct |
2297 | // integer type. |
2298 | if (ExitCount->getType()->isPointerTy()) |
2299 | Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int"); |
2300 | else |
2301 | Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast"); |
2302 | } |
2303 | |
2304 | // Add the start index to the loop count to get the new end index. |
2305 | Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx"); |
2306 | |
2307 | // Now we need to generate the expression for N - (N % VF), which is |
2308 | // the part that the vectorized body will execute. |
2309 | Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf"); |
2310 | Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec"); |
2311 | Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx, |
2312 | "end.idx.rnd.down"); |
2313 | |
2314 | // Now, compare the new count to zero. If it is zero skip the vector loop and |
2315 | // jump to the scalar loop. |
2316 | Value *Cmp = |
2317 | BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero"); |
2318 | |
2319 | BasicBlock *LastBypassBlock = BypassBlock; |
2320 | |
2321 | // Generate code to check that the loop's trip count that we computed by adding |
2322 | // one to the backedge-taken count will not overflow. |
2323 | { |
2324 | auto PastOverflowCheck = |
2325 | std::next(BasicBlock::iterator(OverflowCheckAnchor)); |
2326 | BasicBlock *CheckBlock = |
2327 | LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked"); |
2328 | if (ParentLoop) |
2329 | ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); |
2330 | LoopBypassBlocks.push_back(CheckBlock); |
2331 | Instruction *OldTerm = LastBypassBlock->getTerminator(); |
2332 | BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm); |
2333 | OldTerm->eraseFromParent(); |
2334 | LastBypassBlock = CheckBlock; |
2335 | } |
2336 | |
2337 | // Generate the code to check that the strides we assumed to be one are really |
2338 | // one. We want the new basic block to start at the first instruction in a |
2339 | // sequence of instructions that form a check. |
2340 | Instruction *StrideCheck; |
2341 | Instruction *FirstCheckInst; |
2342 | std::tie(FirstCheckInst, StrideCheck) = |
2343 | addStrideCheck(LastBypassBlock->getTerminator()); |
2344 | if (StrideCheck) { |
2345 | // Create a new block containing the stride check. |
2346 | BasicBlock *CheckBlock = |
2347 | LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck"); |
2348 | if (ParentLoop) |
2349 | ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); |
2350 | LoopBypassBlocks.push_back(CheckBlock); |
2351 | |
2352 | // Replace the branch into the memory check block with a conditional branch |
2353 | // for the "few elements case". |
2354 | Instruction *OldTerm = LastBypassBlock->getTerminator(); |
2355 | BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm); |
2356 | OldTerm->eraseFromParent(); |
2357 | |
2358 | Cmp = StrideCheck; |
2359 | LastBypassBlock = CheckBlock; |
2360 | } |
2361 | |
2362 | // Generate the code that checks in runtime if arrays overlap. We put the |
2363 | // checks into a separate block to make the more common case of few elements |
2364 | // faster. |
2365 | Instruction *MemRuntimeCheck; |
2366 | std::tie(FirstCheckInst, MemRuntimeCheck) = |
2367 | addRuntimeCheck(LastBypassBlock->getTerminator()); |
2368 | if (MemRuntimeCheck) { |
2369 | // Create a new block containing the memory check. |
2370 | BasicBlock *CheckBlock = |
2371 | LastBypassBlock->splitBasicBlock(MemRuntimeCheck, "vector.memcheck"); |
2372 | if (ParentLoop) |
2373 | ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); |
2374 | LoopBypassBlocks.push_back(CheckBlock); |
2375 | |
2376 | // Replace the branch into the memory check block with a conditional branch |
2377 | // for the "few elements case". |
2378 | Instruction *OldTerm = LastBypassBlock->getTerminator(); |
2379 | BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm); |
2380 | OldTerm->eraseFromParent(); |
2381 | |
2382 | Cmp = MemRuntimeCheck; |
2383 | LastBypassBlock = CheckBlock; |
2384 | } |
2385 | |
2386 | LastBypassBlock->getTerminator()->eraseFromParent(); |
2387 | BranchInst::Create(MiddleBlock, VectorPH, Cmp, |
2388 | LastBypassBlock); |
2389 | |
2390 | // We are going to resume the execution of the scalar loop. |
2391 | // Go over all of the induction variables that we found and fix the |
2392 | // PHIs that are left in the scalar version of the loop. |
2393 | // The starting values of PHI nodes depend on the counter of the last |
2394 | // iteration in the vectorized loop. |
2395 | // If we come from a bypass edge then we need to start from the original |
2396 | // start value. |
2397 | |
2398 | // This variable saves the new starting index for the scalar loop. |
2399 | PHINode *ResumeIndex = nullptr; |
2400 | LoopVectorizationLegality::InductionList::iterator I, E; |
2401 | LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); |
2402 | // Set builder to point to last bypass block. |
2403 | BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator()); |
2404 | for (I = List->begin(), E = List->end(); I != E; ++I) { |
2405 | PHINode *OrigPhi = I->first; |
2406 | LoopVectorizationLegality::InductionInfo II = I->second; |
2407 | |
2408 | Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType(); |
2409 | PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val", |
2410 | MiddleBlock->getTerminator()); |
2411 | // We might have extended the type of the induction variable but we need a |
2412 | // truncated version for the scalar loop. |
2413 | PHINode *TruncResumeVal = (OrigPhi == OldInduction) ? |
2414 | PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val", |
2415 | MiddleBlock->getTerminator()) : nullptr; |
2416 | |
2417 | // Create phi nodes to merge from the backedge-taken check block. |
2418 | PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val", |
2419 | ScalarPH->getTerminator()); |
2420 | BCResumeVal->addIncoming(ResumeVal, MiddleBlock); |
2421 | |
2422 | PHINode *BCTruncResumeVal = nullptr; |
2423 | if (OrigPhi == OldInduction) { |
2424 | BCTruncResumeVal = |
2425 | PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val", |
2426 | ScalarPH->getTerminator()); |
2427 | BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock); |
2428 | } |
2429 | |
2430 | Value *EndValue = nullptr; |
2431 | switch (II.IK) { |
2432 | case LoopVectorizationLegality::IK_NoInduction: |
2433 | llvm_unreachable("Unknown induction")::llvm::llvm_unreachable_internal("Unknown induction", "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2433); |
2434 | case LoopVectorizationLegality::IK_IntInduction: { |
2435 | // Handle the integer induction counter. |
2436 | assert(OrigPhi->getType()->isIntegerTy() && "Invalid type")((OrigPhi->getType()->isIntegerTy() && "Invalid type" ) ? static_cast<void> (0) : __assert_fail ("OrigPhi->getType()->isIntegerTy() && \"Invalid type\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2436, __PRETTY_FUNCTION__)); |
2437 | |
2438 | // We have the canonical induction variable. |
2439 | if (OrigPhi == OldInduction) { |
2440 | // Create a truncated version of the resume value for the scalar loop, |
2441 | // we might have promoted the type to a larger width. |
2442 | EndValue = |
2443 | BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType()); |
2444 | // The new PHI merges the original incoming value, in case of a bypass, |
2445 | // or the value at the end of the vectorized loop. |
2446 | for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) |
2447 | TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); |
2448 | TruncResumeVal->addIncoming(EndValue, VecBody); |
2449 | |
2450 | BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); |
2451 | |
2452 | // We know what the end value is. |
2453 | EndValue = IdxEndRoundDown; |
2454 | // We also know which PHI node holds it. |
2455 | ResumeIndex = ResumeVal; |
2456 | break; |
2457 | } |
2458 | |
2459 | // Not the canonical induction variable - add the vector loop count to the |
2460 | // start value. |
2461 | Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, |
2462 | II.StartValue->getType(), |
2463 | "cast.crd"); |
2464 | EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end"); |
2465 | break; |
2466 | } |
2467 | case LoopVectorizationLegality::IK_ReverseIntInduction: { |
2468 | // Convert the CountRoundDown variable to the PHI size. |
2469 | Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, |
2470 | II.StartValue->getType(), |
2471 | "cast.crd"); |
2472 | // Handle reverse integer induction counter. |
2473 | EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end"); |
2474 | break; |
2475 | } |
2476 | case LoopVectorizationLegality::IK_PtrInduction: { |
2477 | // For pointer induction variables, calculate the offset using |
2478 | // the end index. |
2479 | EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown, |
2480 | "ptr.ind.end"); |
2481 | break; |
2482 | } |
2483 | case LoopVectorizationLegality::IK_ReversePtrInduction: { |
2484 | // The value at the end of the loop for the reverse pointer is calculated |
2485 | // by creating a GEP with a negative index starting from the start value. |
2486 | Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0); |
2487 | Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown, |
2488 | "rev.ind.end"); |
2489 | EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx, |
2490 | "rev.ptr.ind.end"); |
2491 | break; |
2492 | } |
2493 | }// end of case |
2494 | |
2495 | // The new PHI merges the original incoming value, in case of a bypass, |
2496 | // or the value at the end of the vectorized loop. |
2497 | for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) { |
2498 | if (OrigPhi == OldInduction) |
2499 | ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]); |
2500 | else |
2501 | ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); |
2502 | } |
2503 | ResumeVal->addIncoming(EndValue, VecBody); |
2504 | |
2505 | // Fix the scalar body counter (PHI node). |
2506 | unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); |
2507 | |
2508 | // The old induction's phi node in the scalar body needs the truncated |
2509 | // value. |
2510 | if (OrigPhi == OldInduction) { |
2511 | BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]); |
2512 | OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal); |
2513 | } else { |
2514 | BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); |
2515 | OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); |
2516 | } |
2517 | } |
2518 | |
2519 | // If we are generating a new induction variable then we also need to |
2520 | // generate the code that calculates the exit value. This value is not |
2521 | // simply the end of the counter because we may skip the vectorized body |
2522 | // in case of a runtime check. |
2523 | if (!OldInduction){ |
2524 | assert(!ResumeIndex && "Unexpected resume value found")((!ResumeIndex && "Unexpected resume value found") ? static_cast <void> (0) : __assert_fail ("!ResumeIndex && \"Unexpected resume value found\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2524, __PRETTY_FUNCTION__)); |
2525 | ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", |
2526 | MiddleBlock->getTerminator()); |
2527 | for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) |
2528 | ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]); |
2529 | ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); |
2530 | } |
2531 | |
2532 | // Make sure that we found the index where scalar loop needs to continue. |
2533 | assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&((ResumeIndex && ResumeIndex->getType()->isIntegerTy () && "Invalid resume Index") ? static_cast<void> (0) : __assert_fail ("ResumeIndex && ResumeIndex->getType()->isIntegerTy() && \"Invalid resume Index\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2534, __PRETTY_FUNCTION__)) |
2534 | "Invalid resume Index")((ResumeIndex && ResumeIndex->getType()->isIntegerTy () && "Invalid resume Index") ? static_cast<void> (0) : __assert_fail ("ResumeIndex && ResumeIndex->getType()->isIntegerTy() && \"Invalid resume Index\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2534, __PRETTY_FUNCTION__)); |
2535 | |
2536 | // Add a check in the middle block to see if we have completed |
2537 | // all of the iterations in the first vector loop. |
2538 | // If (N - N%VF) == N, then we *don't* need to run the remainder. |
2539 | Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, |
2540 | ResumeIndex, "cmp.n", |
2541 | MiddleBlock->getTerminator()); |
2542 | |
2543 | BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); |
2544 | // Remove the old terminator. |
2545 | MiddleBlock->getTerminator()->eraseFromParent(); |
2546 | |
2547 | // Create i+1 and fill the PHINode. |
2548 | Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); |
2549 | Induction->addIncoming(StartIdx, VectorPH); |
2550 | Induction->addIncoming(NextIdx, VecBody); |
2551 | // Create the compare. |
2552 | Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); |
2553 | Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); |
2554 | |
2555 | // Now we have two terminators. Remove the old one from the block. |
2556 | VecBody->getTerminator()->eraseFromParent(); |
2557 | |
2558 | // Get ready to start creating new instructions into the vectorized body. |
2559 | Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); |
2560 | |
2561 | // Save the state. |
2562 | LoopVectorPreHeader = VectorPH; |
2563 | LoopScalarPreHeader = ScalarPH; |
2564 | LoopMiddleBlock = MiddleBlock; |
2565 | LoopExitBlock = ExitBlock; |
2566 | LoopVectorBody.push_back(VecBody); |
2567 | LoopScalarBody = OldBasicBlock; |
2568 | |
2569 | LoopVectorizeHints Hints(Lp, true); |
2570 | Hints.setAlreadyVectorized(); |
2571 | } |
2572 | |
2573 | /// This function returns the identity element (or neutral element) for |
2574 | /// the operation K. |
2575 | Constant* |
2576 | LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) { |
2577 | switch (K) { |
2578 | case RK_IntegerXor: |
2579 | case RK_IntegerAdd: |
2580 | case RK_IntegerOr: |
2581 | // Adding, Xoring, Oring zero to a number does not change it. |
2582 | return ConstantInt::get(Tp, 0); |
2583 | case RK_IntegerMult: |
2584 | // Multiplying a number by 1 does not change it. |
2585 | return ConstantInt::get(Tp, 1); |
2586 | case RK_IntegerAnd: |
2587 | // AND-ing a number with an all-1 value does not change it. |
2588 | return ConstantInt::get(Tp, -1, true); |
2589 | case RK_FloatMult: |
2590 | // Multiplying a number by 1 does not change it. |
2591 | return ConstantFP::get(Tp, 1.0L); |
2592 | case RK_FloatAdd: |
2593 | // Adding zero to a number does not change it. |
2594 | return ConstantFP::get(Tp, 0.0L); |
2595 | default: |
2596 | llvm_unreachable("Unknown reduction kind")::llvm::llvm_unreachable_internal("Unknown reduction kind", "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2596); |
2597 | } |
2598 | } |
2599 | |
2600 | /// This function translates the reduction kind to an LLVM binary operator. |
2601 | static unsigned |
2602 | getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { |
2603 | switch (Kind) { |
2604 | case LoopVectorizationLegality::RK_IntegerAdd: |
2605 | return Instruction::Add; |
2606 | case LoopVectorizationLegality::RK_IntegerMult: |
2607 | return Instruction::Mul; |
2608 | case LoopVectorizationLegality::RK_IntegerOr: |
2609 | return Instruction::Or; |
2610 | case LoopVectorizationLegality::RK_IntegerAnd: |
2611 | return Instruction::And; |
2612 | case LoopVectorizationLegality::RK_IntegerXor: |
2613 | return Instruction::Xor; |
2614 | case LoopVectorizationLegality::RK_FloatMult: |
2615 | return Instruction::FMul; |
2616 | case LoopVectorizationLegality::RK_FloatAdd: |
2617 | return Instruction::FAdd; |
2618 | case LoopVectorizationLegality::RK_IntegerMinMax: |
2619 | return Instruction::ICmp; |
2620 | case LoopVectorizationLegality::RK_FloatMinMax: |
2621 | return Instruction::FCmp; |
2622 | default: |
2623 | llvm_unreachable("Unknown reduction operation")::llvm::llvm_unreachable_internal("Unknown reduction operation" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2623); |
2624 | } |
2625 | } |
2626 | |
2627 | Value *createMinMaxOp(IRBuilder<> &Builder, |
2628 | LoopVectorizationLegality::MinMaxReductionKind RK, |
2629 | Value *Left, |
2630 | Value *Right) { |
2631 | CmpInst::Predicate P = CmpInst::ICMP_NE; |
2632 | switch (RK) { |
2633 | default: |
2634 | llvm_unreachable("Unknown min/max reduction kind")::llvm::llvm_unreachable_internal("Unknown min/max reduction kind" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2634); |
2635 | case LoopVectorizationLegality::MRK_UIntMin: |
2636 | P = CmpInst::ICMP_ULT; |
2637 | break; |
2638 | case LoopVectorizationLegality::MRK_UIntMax: |
2639 | P = CmpInst::ICMP_UGT; |
2640 | break; |
2641 | case LoopVectorizationLegality::MRK_SIntMin: |
2642 | P = CmpInst::ICMP_SLT; |
2643 | break; |
2644 | case LoopVectorizationLegality::MRK_SIntMax: |
2645 | P = CmpInst::ICMP_SGT; |
2646 | break; |
2647 | case LoopVectorizationLegality::MRK_FloatMin: |
2648 | P = CmpInst::FCMP_OLT; |
2649 | break; |
2650 | case LoopVectorizationLegality::MRK_FloatMax: |
2651 | P = CmpInst::FCMP_OGT; |
2652 | break; |
2653 | } |
2654 | |
2655 | Value *Cmp; |
2656 | if (RK == LoopVectorizationLegality::MRK_FloatMin || |
2657 | RK == LoopVectorizationLegality::MRK_FloatMax) |
2658 | Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); |
2659 | else |
2660 | Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp"); |
2661 | |
2662 | Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select"); |
2663 | return Select; |
2664 | } |
2665 | |
2666 | namespace { |
2667 | struct CSEDenseMapInfo { |
2668 | static bool canHandle(Instruction *I) { |
2669 | return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || |
2670 | isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); |
2671 | } |
2672 | static inline Instruction *getEmptyKey() { |
2673 | return DenseMapInfo<Instruction *>::getEmptyKey(); |
2674 | } |
2675 | static inline Instruction *getTombstoneKey() { |
2676 | return DenseMapInfo<Instruction *>::getTombstoneKey(); |
2677 | } |
2678 | static unsigned getHashValue(Instruction *I) { |
2679 | assert(canHandle(I) && "Unknown instruction!")((canHandle(I) && "Unknown instruction!") ? static_cast <void> (0) : __assert_fail ("canHandle(I) && \"Unknown instruction!\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2679, __PRETTY_FUNCTION__)); |
2680 | return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), |
2681 | I->value_op_end())); |
2682 | } |
2683 | static bool isEqual(Instruction *LHS, Instruction *RHS) { |
2684 | if (LHS == getEmptyKey() || RHS == getEmptyKey() || |
2685 | LHS == getTombstoneKey() || RHS == getTombstoneKey()) |
2686 | return LHS == RHS; |
2687 | return LHS->isIdenticalTo(RHS); |
2688 | } |
2689 | }; |
2690 | } |
2691 | |
/// \brief Check whether this block is a predicated block.
/// Due to if predication of stores we might create a sequence of "if(pred) a[i]
/// = ...; " blocks. We start with one vectorized basic block. For every
/// conditional block we split this vectorized block. Therefore, every second
/// block will be a predicated one.
///
/// \param BlockNum Zero-based position of the block in the vector body list.
/// \returns true for odd positions, false for even ones.
static bool isPredicatedBlock(unsigned BlockNum) {
  return (BlockNum % 2) != 0;
}
2700 | |
2701 | ///\brief Perform cse of induction variable instructions. |
2702 | static void cse(SmallVector<BasicBlock *, 4> &BBs) { |
2703 | // Perform simple cse. |
2704 | SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; |
2705 | for (unsigned i = 0, e = BBs.size(); i != e; ++i) { |
2706 | BasicBlock *BB = BBs[i]; |
2707 | for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { |
2708 | Instruction *In = I++; |
2709 | |
2710 | if (!CSEDenseMapInfo::canHandle(In)) |
2711 | continue; |
2712 | |
2713 | // Check if we can replace this instruction with any of the |
2714 | // visited instructions. |
2715 | if (Instruction *V = CSEMap.lookup(In)) { |
2716 | In->replaceAllUsesWith(V); |
2717 | In->eraseFromParent(); |
2718 | continue; |
2719 | } |
2720 | // Ignore instructions in conditional blocks. We create "if (pred) a[i] = |
2721 | // ...;" blocks for predicated stores. Every second block is a predicated |
2722 | // block. |
2723 | if (isPredicatedBlock(i)) |
2724 | continue; |
2725 | |
2726 | CSEMap[In] = In; |
2727 | } |
2728 | } |
2729 | } |
2730 | |
2731 | /// \brief Adds a 'fast' flag to floating point operations. |
2732 | static Value *addFastMathFlag(Value *V) { |
2733 | if (isa<FPMathOperator>(V)){ |
2734 | FastMathFlags Flags; |
2735 | Flags.setUnsafeAlgebra(); |
2736 | cast<Instruction>(V)->setFastMathFlags(Flags); |
2737 | } |
2738 | return V; |
2739 | } |
2740 | |
2741 | void InnerLoopVectorizer::vectorizeLoop() { |
2742 | //===------------------------------------------------===// |
2743 | // |
2744 | // Notice: any optimization or new instruction that go |
2745 | // into the code below should be also be implemented in |
2746 | // the cost-model. |
2747 | // |
2748 | //===------------------------------------------------===// |
2749 | Constant *Zero = Builder.getInt32(0); |
2750 | |
2751 | // In order to support reduction variables we need to be able to vectorize |
2752 | // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two |
2753 | // stages. First, we create a new vector PHI node with no incoming edges. |
2754 | // We use this value when we vectorize all of the instructions that use the |
2755 | // PHI. Next, after all of the instructions in the block are complete we |
2756 | // add the new incoming edges to the PHI. At this point all of the |
2757 | // instructions in the basic block are vectorized, so we can use them to |
2758 | // construct the PHI. |
2759 | PhiVector RdxPHIsToFix; |
2760 | |
2761 | // Scan the loop in a topological order to ensure that defs are vectorized |
2762 | // before users. |
2763 | LoopBlocksDFS DFS(OrigLoop); |
2764 | DFS.perform(LI); |
2765 | |
2766 | // Vectorize all of the blocks in the original loop. |
2767 | for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), |
2768 | be = DFS.endRPO(); bb != be; ++bb) |
2769 | vectorizeBlockInLoop(*bb, &RdxPHIsToFix); |
2770 | |
2771 | // At this point every instruction in the original loop is widened to |
2772 | // a vector form. We are almost done. Now, we need to fix the PHI nodes |
2773 | // that we vectorized. The PHI nodes are currently empty because we did |
2774 | // not want to introduce cycles. Notice that the remaining PHI nodes |
2775 | // that we need to fix are reduction variables. |
2776 | |
2777 | // Create the 'reduced' values for each of the induction vars. |
2778 | // The reduced values are the vector values that we scalarize and combine |
2779 | // after the loop is finished. |
2780 | for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end(); |
2781 | it != e; ++it) { |
2782 | PHINode *RdxPhi = *it; |
2783 | assert(RdxPhi && "Unable to recover vectorized PHI")((RdxPhi && "Unable to recover vectorized PHI") ? static_cast <void> (0) : __assert_fail ("RdxPhi && \"Unable to recover vectorized PHI\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2783, __PRETTY_FUNCTION__)); |
2784 | |
2785 | // Find the reduction variable descriptor. |
2786 | assert(Legal->getReductionVars()->count(RdxPhi) &&((Legal->getReductionVars()->count(RdxPhi) && "Unable to find the reduction variable" ) ? static_cast<void> (0) : __assert_fail ("Legal->getReductionVars()->count(RdxPhi) && \"Unable to find the reduction variable\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2787, __PRETTY_FUNCTION__)) |
2787 | "Unable to find the reduction variable")((Legal->getReductionVars()->count(RdxPhi) && "Unable to find the reduction variable" ) ? static_cast<void> (0) : __assert_fail ("Legal->getReductionVars()->count(RdxPhi) && \"Unable to find the reduction variable\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2787, __PRETTY_FUNCTION__)); |
2788 | LoopVectorizationLegality::ReductionDescriptor RdxDesc = |
2789 | (*Legal->getReductionVars())[RdxPhi]; |
2790 | |
2791 | setDebugLocFromInst(Builder, RdxDesc.StartValue); |
2792 | |
2793 | // We need to generate a reduction vector from the incoming scalar. |
2794 | // To do so, we need to generate the 'identity' vector and override |
2795 | // one of the elements with the incoming scalar reduction. We need |
2796 | // to do it in the vector-loop preheader. |
2797 | Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); |
2798 | |
2799 | // This is the vector-clone of the value that leaves the loop. |
2800 | VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr); |
2801 | Type *VecTy = VectorExit[0]->getType(); |
2802 | |
2803 | // Find the reduction identity variable. Zero for addition, or, xor, |
2804 | // one for multiplication, -1 for And. |
2805 | Value *Identity; |
2806 | Value *VectorStart; |
2807 | if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax || |
2808 | RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) { |
2809 | // MinMax reduction have the start value as their identify. |
2810 | if (VF == 1) { |
2811 | VectorStart = Identity = RdxDesc.StartValue; |
2812 | } else { |
2813 | VectorStart = Identity = Builder.CreateVectorSplat(VF, |
2814 | RdxDesc.StartValue, |
2815 | "minmax.ident"); |
2816 | } |
2817 | } else { |
2818 | // Handle other reduction kinds: |
2819 | Constant *Iden = |
2820 | LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind, |
2821 | VecTy->getScalarType()); |
2822 | if (VF == 1) { |
2823 | Identity = Iden; |
2824 | // This vector is the Identity vector where the first element is the |
2825 | // incoming scalar reduction. |
2826 | VectorStart = RdxDesc.StartValue; |
2827 | } else { |
2828 | Identity = ConstantVector::getSplat(VF, Iden); |
2829 | |
2830 | // This vector is the Identity vector where the first element is the |
2831 | // incoming scalar reduction. |
2832 | VectorStart = Builder.CreateInsertElement(Identity, |
2833 | RdxDesc.StartValue, Zero); |
2834 | } |
2835 | } |
2836 | |
2837 | // Fix the vector-loop phi. |
2838 | // We created the induction variable so we know that the |
2839 | // preheader is the first entry. |
2840 | BasicBlock *VecPreheader = Induction->getIncomingBlock(0); |
2841 | |
2842 | // Reductions do not have to start at zero. They can start with |
2843 | // any loop invariant values. |
2844 | VectorParts &VecRdxPhi = WidenMap.get(RdxPhi); |
2845 | BasicBlock *Latch = OrigLoop->getLoopLatch(); |
2846 | Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch); |
2847 | VectorParts &Val = getVectorValue(LoopVal); |
2848 | for (unsigned part = 0; part < UF; ++part) { |
2849 | // Make sure to add the reduction stat value only to the |
2850 | // first unroll part. |
2851 | Value *StartVal = (part == 0) ? VectorStart : Identity; |
2852 | cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader); |
2853 | cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], |
2854 | LoopVectorBody.back()); |
2855 | } |
2856 | |
2857 | // Before each round, move the insertion point right between |
2858 | // the PHIs and the values we are going to write. |
2859 | // This allows us to write both PHINodes and the extractelement |
2860 | // instructions. |
2861 | Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); |
2862 | |
2863 | VectorParts RdxParts; |
2864 | setDebugLocFromInst(Builder, RdxDesc.LoopExitInstr); |
2865 | for (unsigned part = 0; part < UF; ++part) { |
2866 | // This PHINode contains the vectorized reduction variable, or |
2867 | // the initial value vector, if we bypass the vector loop. |
2868 | VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr); |
2869 | PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); |
2870 | Value *StartVal = (part == 0) ? VectorStart : Identity; |
2871 | for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) |
2872 | NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]); |
2873 | NewPhi->addIncoming(RdxExitVal[part], |
2874 | LoopVectorBody.back()); |
2875 | RdxParts.push_back(NewPhi); |
2876 | } |
2877 | |
2878 | // Reduce all of the unrolled parts into a single vector. |
2879 | Value *ReducedPartRdx = RdxParts[0]; |
2880 | unsigned Op = getReductionBinOp(RdxDesc.Kind); |
2881 | setDebugLocFromInst(Builder, ReducedPartRdx); |
2882 | for (unsigned part = 1; part < UF; ++part) { |
2883 | if (Op != Instruction::ICmp && Op != Instruction::FCmp) |
2884 | // Floating point operations had to be 'fast' to enable the reduction. |
2885 | ReducedPartRdx = addFastMathFlag( |
2886 | Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part], |
2887 | ReducedPartRdx, "bin.rdx")); |
2888 | else |
2889 | ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind, |
2890 | ReducedPartRdx, RdxParts[part]); |
2891 | } |
2892 | |
2893 | if (VF > 1) { |
2894 | // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles |
2895 | // and vector ops, reducing the set of values being computed by half each |
2896 | // round. |
2897 | assert(isPowerOf2_32(VF) &&((isPowerOf2_32(VF) && "Reduction emission only supported for pow2 vectors!" ) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(VF) && \"Reduction emission only supported for pow2 vectors!\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2898, __PRETTY_FUNCTION__)) |
2898 | "Reduction emission only supported for pow2 vectors!")((isPowerOf2_32(VF) && "Reduction emission only supported for pow2 vectors!" ) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(VF) && \"Reduction emission only supported for pow2 vectors!\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2898, __PRETTY_FUNCTION__)); |
2899 | Value *TmpVec = ReducedPartRdx; |
2900 | SmallVector<Constant*, 32> ShuffleMask(VF, nullptr); |
2901 | for (unsigned i = VF; i != 1; i >>= 1) { |
2902 | // Move the upper half of the vector to the lower half. |
2903 | for (unsigned j = 0; j != i/2; ++j) |
2904 | ShuffleMask[j] = Builder.getInt32(i/2 + j); |
2905 | |
2906 | // Fill the rest of the mask with undef. |
2907 | std::fill(&ShuffleMask[i/2], ShuffleMask.end(), |
2908 | UndefValue::get(Builder.getInt32Ty())); |
2909 | |
2910 | Value *Shuf = |
2911 | Builder.CreateShuffleVector(TmpVec, |
2912 | UndefValue::get(TmpVec->getType()), |
2913 | ConstantVector::get(ShuffleMask), |
2914 | "rdx.shuf"); |
2915 | |
2916 | if (Op != Instruction::ICmp && Op != Instruction::FCmp) |
2917 | // Floating point operations had to be 'fast' to enable the reduction. |
2918 | TmpVec = addFastMathFlag(Builder.CreateBinOp( |
2919 | (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); |
2920 | else |
2921 | TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf); |
2922 | } |
2923 | |
2924 | // The result is in the first element of the vector. |
2925 | ReducedPartRdx = Builder.CreateExtractElement(TmpVec, |
2926 | Builder.getInt32(0)); |
2927 | } |
2928 | |
2929 | // Create a phi node that merges control-flow from the backedge-taken check |
2930 | // block and the middle block. |
2931 | PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx", |
2932 | LoopScalarPreHeader->getTerminator()); |
2933 | BCBlockPhi->addIncoming(RdxDesc.StartValue, LoopBypassBlocks[0]); |
2934 | BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); |
2935 | |
2936 | // Now, we need to fix the users of the reduction variable |
2937 | // inside and outside of the scalar remainder loop. |
2938 | // We know that the loop is in LCSSA form. We need to update the |
2939 | // PHI nodes in the exit blocks. |
2940 | for (BasicBlock::iterator LEI = LoopExitBlock->begin(), |
2941 | LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { |
2942 | PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); |
2943 | if (!LCSSAPhi) break; |
2944 | |
2945 | // All PHINodes need to have a single entry edge, or two if |
2946 | // we already fixed them. |
2947 | assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI")((LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI" ) ? static_cast<void> (0) : __assert_fail ("LCSSAPhi->getNumIncomingValues() < 3 && \"Invalid LCSSA PHI\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2947, __PRETTY_FUNCTION__)); |
2948 | |
2949 | // We found our reduction value exit-PHI. Update it with the |
2950 | // incoming bypass edge. |
2951 | if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { |
2952 | // Add an edge coming from the bypass. |
2953 | LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); |
2954 | break; |
2955 | } |
2956 | }// end of the LCSSA phi scan. |
2957 | |
2958 | // Fix the scalar loop reduction variable with the incoming reduction sum |
2959 | // from the vector body and from the backedge value. |
2960 | int IncomingEdgeBlockIdx = |
2961 | (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch()); |
2962 | assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index")((IncomingEdgeBlockIdx >= 0 && "Invalid block index" ) ? static_cast<void> (0) : __assert_fail ("IncomingEdgeBlockIdx >= 0 && \"Invalid block index\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2962, __PRETTY_FUNCTION__)); |
2963 | // Pick the other block. |
2964 | int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); |
2965 | (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); |
2966 | (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); |
2967 | }// end of for each redux variable. |
2968 | |
2969 | fixLCSSAPHIs(); |
2970 | |
2971 | // Remove redundant induction instructions. |
2972 | cse(LoopVectorBody); |
2973 | } |
2974 | |
2975 | void InnerLoopVectorizer::fixLCSSAPHIs() { |
2976 | for (BasicBlock::iterator LEI = LoopExitBlock->begin(), |
2977 | LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { |
2978 | PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); |
2979 | if (!LCSSAPhi) break; |
2980 | if (LCSSAPhi->getNumIncomingValues() == 1) |
2981 | LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), |
2982 | LoopMiddleBlock); |
2983 | } |
2984 | } |
2985 | |
2986 | InnerLoopVectorizer::VectorParts |
2987 | InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { |
2988 | assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&((std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end( Dst) && "Invalid edge") ? static_cast<void> (0) : __assert_fail ("std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && \"Invalid edge\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2989, __PRETTY_FUNCTION__)) |
2989 | "Invalid edge")((std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end( Dst) && "Invalid edge") ? static_cast<void> (0) : __assert_fail ("std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && \"Invalid edge\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 2989, __PRETTY_FUNCTION__)); |
2990 | |
2991 | // Look for cached value. |
2992 | std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst); |
2993 | EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge); |
2994 | if (ECEntryIt != MaskCache.end()) |
2995 | return ECEntryIt->second; |
2996 | |
2997 | VectorParts SrcMask = createBlockInMask(Src); |
2998 | |
2999 | // The terminator has to be a branch inst! |
3000 | BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); |
3001 | assert(BI && "Unexpected terminator found")((BI && "Unexpected terminator found") ? static_cast< void> (0) : __assert_fail ("BI && \"Unexpected terminator found\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3001, __PRETTY_FUNCTION__)); |
3002 | |
3003 | if (BI->isConditional()) { |
3004 | VectorParts EdgeMask = getVectorValue(BI->getCondition()); |
3005 | |
3006 | if (BI->getSuccessor(0) != Dst) |
3007 | for (unsigned part = 0; part < UF; ++part) |
3008 | EdgeMask[part] = Builder.CreateNot(EdgeMask[part]); |
3009 | |
3010 | for (unsigned part = 0; part < UF; ++part) |
3011 | EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]); |
3012 | |
3013 | MaskCache[Edge] = EdgeMask; |
3014 | return EdgeMask; |
3015 | } |
3016 | |
3017 | MaskCache[Edge] = SrcMask; |
3018 | return SrcMask; |
3019 | } |
3020 | |
3021 | InnerLoopVectorizer::VectorParts |
3022 | InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { |
3023 | assert(OrigLoop->contains(BB) && "Block is not a part of a loop")((OrigLoop->contains(BB) && "Block is not a part of a loop" ) ? static_cast<void> (0) : __assert_fail ("OrigLoop->contains(BB) && \"Block is not a part of a loop\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3023, __PRETTY_FUNCTION__)); |
3024 | |
3025 | // Loop incoming mask is all-one. |
3026 | if (OrigLoop->getHeader() == BB) { |
3027 | Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1); |
3028 | return getVectorValue(C); |
3029 | } |
3030 | |
3031 | // This is the block mask. We OR all incoming edges, and with zero. |
3032 | Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0); |
3033 | VectorParts BlockMask = getVectorValue(Zero); |
3034 | |
3035 | // For each pred: |
3036 | for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) { |
3037 | VectorParts EM = createEdgeMask(*it, BB); |
3038 | for (unsigned part = 0; part < UF; ++part) |
3039 | BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]); |
3040 | } |
3041 | |
3042 | return BlockMask; |
3043 | } |
3044 | |
/// \brief Produce the widened value(s) for the PHI node \p PN in \p Entry.
///
/// Three cases are handled, in this order:
///  1. \p PN is a reduction variable according to Legal->getReductionVars():
///     a two-operand PHI named "vec.phi" is created for each part at the
///     first insertion point of the last block in LoopVectorBody, and the
///     original PHI is appended to \p PV. The existing comment calls this
///     "phase one"; the remaining work is not done in this function.
///  2. \p PN does not live in the header of OrigLoop: it is lowered to a
///     chain of selects, one per incoming value, keyed on the edge masks
///     returned by createEdgeMask.
///  3. Otherwise \p PN must be a known induction variable (asserted) and is
///     rebuilt from Induction / ExtendedIdx according to its InductionKind.
///
/// \param PN    the PHI to widen; it is converted with cast<PHINode>.
/// \param Entry result slots; indices [0, UF) are written.
/// \param UF    number of parts to generate.
/// \param VF    number of lanes per part; when VF == 1 the scalar type is
///              kept for reductions and scalar GEPs are used for pointers.
/// \param PV    receives reduction PHIs; dereferenced only in case 1.
3045 | void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, |
3046 | InnerLoopVectorizer::VectorParts &Entry, |
3047 | unsigned UF, unsigned VF, PhiVector *PV) { |
3048 | PHINode* P = cast<PHINode>(PN); |
3049 | // Handle reduction variables: |
3050 | if (Legal->getReductionVars()->count(P)) { |
3051 | for (unsigned part = 0; part < UF; ++part) { |
3052 | // This is phase one of vectorizing PHIs. |
3053 | Type *VecTy = (VF == 1) ? PN->getType() : |
3054 | VectorType::get(PN->getType(), VF); |
3055 | Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", |
3056 | LoopVectorBody.back()-> getFirstInsertionPt()); |
3057 | } |
3058 | PV->push_back(P); |
3059 | return; |
3060 | } |
3061 | |
3062 | setDebugLocFromInst(Builder, P); |
3063 | // Check for PHI nodes that are lowered to vector selects. |
3064 | if (P->getParent() != OrigLoop->getHeader()) { |
3065 | // We know that all PHIs in non-header blocks are converted into |
3066 | // selects, so we don't have to worry about the insertion order and we |
3067 | // can just use the builder. |
3068 | // At this point we generate the predication tree. There may be |
3069 | // duplications since this is a simple recursive scan, but future |
3070 | // optimizations will clean it up. |
3071 | |
3072 | unsigned NumIncoming = P->getNumIncomingValues(); |
3073 | |
3074 | // Generate a sequence of selects of the form: |
3075 | // SELECT(Mask3, In3, |
3076 | // SELECT(Mask2, In2, |
3077 | // ( ...))) |
3078 | for (unsigned In = 0; In < NumIncoming; In++) { |
3079 | VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), |
3080 | P->getParent()); |
3081 | VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); |
3082 | |
3083 | for (unsigned part = 0; part < UF; ++part) { |
3084 | // We might have single edge PHIs (blocks) - use an identity |
3085 | // 'select' for the first PHI operand. |
3086 | if (In == 0) |
3087 | Entry[part] = Builder.CreateSelect(Cond[part], In0[part], |
3088 | In0[part]); |
3089 | else |
3090 | // Select between the current value and the previous incoming edge |
3091 | // based on the incoming mask. |
3092 | Entry[part] = Builder.CreateSelect(Cond[part], In0[part], |
3093 | Entry[part], "predphi"); |
3094 | } |
3095 | } |
3096 | return; |
3097 | } |
3098 | |
3099 | // This PHINode must be an induction variable. |
3100 | // Make sure that we know about it. |
3101 | assert(Legal->getInductionVars()->count(P) &&((Legal->getInductionVars()->count(P) && "Not an induction variable" ) ? static_cast<void> (0) : __assert_fail ("Legal->getInductionVars()->count(P) && \"Not an induction variable\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3102, __PRETTY_FUNCTION__)) |
3102 | "Not an induction variable")((Legal->getInductionVars()->count(P) && "Not an induction variable" ) ? static_cast<void> (0) : __assert_fail ("Legal->getInductionVars()->count(P) && \"Not an induction variable\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3102, __PRETTY_FUNCTION__)); |
3103 | |
3104 | LoopVectorizationLegality::InductionInfo II = |
3105 | Legal->getInductionVars()->lookup(P); |
3106 | |
// Dispatch on the induction kind recorded for this PHI; every case returns.
3107 | switch (II.IK) { |
3108 | case LoopVectorizationLegality::IK_NoInduction: |
3109 | llvm_unreachable("Unknown induction")::llvm::llvm_unreachable_internal("Unknown induction", "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3109); |
3110 | case LoopVectorizationLegality::IK_IntInduction: { |
3111 | assert(P->getType() == II.StartValue->getType() && "Types must match")((P->getType() == II.StartValue->getType() && "Types must match" ) ? static_cast<void> (0) : __assert_fail ("P->getType() == II.StartValue->getType() && \"Types must match\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3111, __PRETTY_FUNCTION__)); |
3112 | Type *PhiTy = P->getType(); |
3113 | Value *Broadcasted; |
3114 | if (P == OldInduction) { |
3115 | // Handle the canonical induction variable. We might have had to |
3116 | // extend the type. |
3117 | Broadcasted = Builder.CreateTrunc(Induction, PhiTy); |
3118 | } else { |
3119 | // Handle other induction variables that are now based on the |
3120 | // canonical one. |
3121 | Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, |
3122 | "normalized.idx"); |
3123 | NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); |
3124 | Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx, |
3125 | "offset.idx"); |
3126 | } |
3127 | Broadcasted = getBroadcastInstrs(Broadcasted); |
3128 | // After broadcasting the induction variable we need to make the vector |
3129 | // consecutive by adding 0, 1, 2, etc. |
3130 | for (unsigned part = 0; part < UF; ++part) |
3131 | Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); |
3132 | return; |
3133 | } |
// The three remaining kinds share the code below. The reverse-integer kind
// returns early; whatever is left must have a pointer type (asserted).
3134 | case LoopVectorizationLegality::IK_ReverseIntInduction: |
3135 | case LoopVectorizationLegality::IK_PtrInduction: |
3136 | case LoopVectorizationLegality::IK_ReversePtrInduction: |
3137 | // Handle reverse integer and pointer inductions. |
3138 | Value *StartIdx = ExtendedIdx; |
3139 | // This is the normalized GEP that starts counting at zero. |
3140 | Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, |
3141 | "normalized.idx"); |
3142 | |
3143 | // Handle the reverse integer induction variable case. |
3144 | if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) { |
3145 | IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType()); |
3146 | Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, |
3147 | "resize.norm.idx"); |
3148 | Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, |
3149 | "reverse.idx"); |
3150 | |
3151 | // This is a new value so do not hoist it out. |
3152 | Value *Broadcasted = getBroadcastInstrs(ReverseInd); |
3153 | // After broadcasting the induction variable we need to make the |
3154 | // vector consecutive by adding ... -3, -2, -1, 0. |
// NOTE(review): -(int)VF is an int but 'part' is unsigned, so the product is
// evaluated in unsigned arithmetic before it is passed on - confirm this
// matches the parameter type getConsecutiveVector expects.
3155 | for (unsigned part = 0; part < UF; ++part) |
3156 | Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part, |
3157 | true); |
3158 | return; |
3159 | } |
3160 | |
3161 | // Handle the pointer induction variable case. |
3162 | assert(P->getType()->isPointerTy() && "Unexpected type.")((P->getType()->isPointerTy() && "Unexpected type." ) ? static_cast<void> (0) : __assert_fail ("P->getType()->isPointerTy() && \"Unexpected type.\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3162, __PRETTY_FUNCTION__)); |
3163 | |
3164 | // Is this a reverse induction ptr or a consecutive induction ptr. |
3165 | bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction == |
3166 | II.IK); |
3167 | |
3168 | // This is the vector of results. Notice that we don't generate |
3169 | // vector geps because scalar geps result in better code. |
3170 | for (unsigned part = 0; part < UF; ++part) { |
// Scalar case: a single GEP per part is stored directly, no vector is built.
3171 | if (VF == 1) { |
3172 | int EltIndex = (part) * (Reverse ? -1 : 1); |
3173 | Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); |
3174 | Value *GlobalIdx; |
3175 | if (Reverse) |
3176 | GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); |
3177 | else |
3178 | GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); |
3179 | |
3180 | Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, |
3181 | "next.gep"); |
3182 | Entry[part] = SclrGep; |
3183 | continue; |
3184 | } |
3185 | |
// Vector case: start from an undef vector and insert one scalar GEP per lane.
3186 | Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); |
3187 | for (unsigned int i = 0; i < VF; ++i) { |
// NOTE(review): i, part and VF are unsigned, so the multiplication by -1 is
// done in unsigned arithmetic and then converted to int.
3188 | int EltIndex = (i + part * VF) * (Reverse ? -1 : 1); |
3189 | Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); |
3190 | Value *GlobalIdx; |
3191 | if (!Reverse) |
3192 | GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); |
3193 | else |
3194 | GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); |
3195 | |
3196 | Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, |
3197 | "next.gep"); |
3198 | VecVal = Builder.CreateInsertElement(VecVal, SclrGep, |
3199 | Builder.getInt32(i), |
3200 | "insert.gep"); |
3201 | } |
3202 | Entry[part] = VecVal; |
3203 | } |
3204 | return; |
3205 | } |
3206 | } |
3207 | |
/// \brief Emit the widened form of every instruction in \p BB.
///
/// Each instruction's result slots are obtained from WidenMap and the
/// instruction is dispatched on its opcode:
///  - branches are skipped;
///  - PHIs are handed to widenPHIInstruction;
///  - binary operators, selects, compares and casts are re-created once per
///    part (UF times) on the operands returned by getVectorValue;
///  - loads and stores are handed to vectorizeMemoryInstruction;
///  - calls are handled by intrinsic ID (debug intrinsics are ignored);
///  - every other opcode is handed to scalarizeInstruction.
///
/// \param BB the block whose instructions are visited in order.
/// \param PV forwarded unchanged to widenPHIInstruction.
3208 | void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { |
3209 | // For each instruction in the old loop. |
3210 | for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { |
3211 | VectorParts &Entry = WidenMap.get(it); |
3212 | switch (it->getOpcode()) { |
3213 | case Instruction::Br: |
3214 | // Nothing to do for BR, since we already took care of the |
3215 | // loop control flow instructions. |
3216 | continue; |
3217 | case Instruction::PHI:{ |
3218 | // Vectorize PHINodes. |
3219 | widenPHIInstruction(it, Entry, UF, VF, PV); |
3220 | continue; |
3221 | }// End of PHI. |
3222 | |
3223 | case Instruction::Add: |
3224 | case Instruction::FAdd: |
3225 | case Instruction::Sub: |
3226 | case Instruction::FSub: |
3227 | case Instruction::Mul: |
3228 | case Instruction::FMul: |
3229 | case Instruction::UDiv: |
3230 | case Instruction::SDiv: |
3231 | case Instruction::FDiv: |
3232 | case Instruction::URem: |
3233 | case Instruction::SRem: |
3234 | case Instruction::FRem: |
3235 | case Instruction::Shl: |
3236 | case Instruction::LShr: |
3237 | case Instruction::AShr: |
3238 | case Instruction::And: |
3239 | case Instruction::Or: |
3240 | case Instruction::Xor: { |
3241 | // Just widen binops. |
// NOTE(review): the dyn_cast<> result is dereferenced below without a null
// check; cast<> would state the assumption that this is a BinaryOperator.
3242 | BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it); |
3243 | setDebugLocFromInst(Builder, BinOp); |
3244 | VectorParts &A = getVectorValue(it->getOperand(0)); |
3245 | VectorParts &B = getVectorValue(it->getOperand(1)); |
3246 | |
3247 | // Use this vector value for all users of the original instruction. |
3248 | for (unsigned Part = 0; Part < UF; ++Part) { |
3249 | Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); |
3250 | |
// The builder's result is not necessarily a BinaryOperator; IR flags are
// copied from the original only when it is.
3251 | if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V)) |
3252 | VecOp->copyIRFlags(BinOp); |
3253 | |
3254 | Entry[Part] = V; |
3255 | } |
3256 | |
3257 | propagateMetadata(Entry, it); |
3258 | break; |
3259 | } |
3260 | case Instruction::Select: { |
3261 | // Widen selects. |
3262 | // If the selector is loop invariant we can create a select |
3263 | // instruction with a scalar condition. Otherwise, use vector-select. |
3264 | bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)), |
3265 | OrigLoop); |
3266 | setDebugLocFromInst(Builder, it); |
3267 | |
3268 | // The condition can be loop invariant but still defined inside the |
3269 | // loop. This means that we can't just use the original 'cond' value. |
3270 | // We have to take the 'vectorized' value and pick the first lane. |
3271 | // Instcombine will make this a no-op. |
3272 | VectorParts &Cond = getVectorValue(it->getOperand(0)); |
3273 | VectorParts &Op0 = getVectorValue(it->getOperand(1)); |
3274 | VectorParts &Op1 = getVectorValue(it->getOperand(2)); |
3275 | |
// ScalarCond is lane 0 of part 0 of the condition (or the condition itself
// when VF == 1). It is computed unconditionally, but used only if
// InvariantCond is true.
3276 | Value *ScalarCond = (VF == 1) ? Cond[0] : |
3277 | Builder.CreateExtractElement(Cond[0], Builder.getInt32(0)); |
3278 | |
3279 | for (unsigned Part = 0; Part < UF; ++Part) { |
3280 | Entry[Part] = Builder.CreateSelect( |
3281 | InvariantCond ? ScalarCond : Cond[Part], |
3282 | Op0[Part], |
3283 | Op1[Part]); |
3284 | } |
3285 | |
3286 | propagateMetadata(Entry, it); |
3287 | break; |
3288 | } |
3289 | |
3290 | case Instruction::ICmp: |
3291 | case Instruction::FCmp: { |
3292 | // Widen compares. Generate vector compares. |
3293 | bool FCmp = (it->getOpcode() == Instruction::FCmp); |
// NOTE(review): dyn_cast<> result is dereferenced below without a null check.
3294 | CmpInst *Cmp = dyn_cast<CmpInst>(it); |
3295 | setDebugLocFromInst(Builder, it); |
3296 | VectorParts &A = getVectorValue(it->getOperand(0)); |
3297 | VectorParts &B = getVectorValue(it->getOperand(1)); |
3298 | for (unsigned Part = 0; Part < UF; ++Part) { |
3299 | Value *C = nullptr; |
3300 | if (FCmp) |
3301 | C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); |
3302 | else |
3303 | C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); |
3304 | Entry[Part] = C; |
3305 | } |
3306 | |
3307 | propagateMetadata(Entry, it); |
3308 | break; |
3309 | } |
3310 | |
3311 | case Instruction::Store: |
3312 | case Instruction::Load: |
3313 | vectorizeMemoryInstruction(it); |
3314 | break; |
3315 | case Instruction::ZExt: |
3316 | case Instruction::SExt: |
3317 | case Instruction::FPToUI: |
3318 | case Instruction::FPToSI: |
3319 | case Instruction::FPExt: |
3320 | case Instruction::PtrToInt: |
3321 | case Instruction::IntToPtr: |
3322 | case Instruction::SIToFP: |
3323 | case Instruction::UIToFP: |
3324 | case Instruction::Trunc: |
3325 | case Instruction::FPTrunc: |
3326 | case Instruction::BitCast: { |
// NOTE(review): dyn_cast<> result is dereferenced below without a null check.
3327 | CastInst *CI = dyn_cast<CastInst>(it); |
3328 | setDebugLocFromInst(Builder, it); |
3329 | /// Optimize the special case where the source is the induction |
3330 | /// variable. Notice that we can only optimize the 'trunc' case |
3331 | /// because: a. FP conversions lose precision, b. sext/zext may wrap, |
3332 | /// c. other casts depend on pointer size. |
3333 | if (CI->getOperand(0) == OldInduction && |
3334 | it->getOpcode() == Instruction::Trunc) { |
// Cast the scalar Induction once, broadcast it, then offset each part by
// VF * Part instead of casting a whole vector.
3335 | Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, |
3336 | CI->getType()); |
3337 | Value *Broadcasted = getBroadcastInstrs(ScalarCast); |
3338 | for (unsigned Part = 0; Part < UF; ++Part) |
3339 | Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false); |
3340 | propagateMetadata(Entry, it); |
3341 | break; |
3342 | } |
3343 | /// Vectorize casts. |
3344 | Type *DestTy = (VF == 1) ? CI->getType() : |
3345 | VectorType::get(CI->getType(), VF); |
3346 | |
3347 | VectorParts &A = getVectorValue(it->getOperand(0)); |
3348 | for (unsigned Part = 0; Part < UF; ++Part) |
3349 | Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); |
3350 | propagateMetadata(Entry, it); |
3351 | break; |
3352 | } |
3353 | |
3354 | case Instruction::Call: { |
3355 | // Ignore dbg intrinsics. |
3356 | if (isa<DbgInfoIntrinsic>(it)) |
3357 | break; |
3358 | setDebugLocFromInst(Builder, it); |
3359 | |
3360 | Module *M = BB->getParent()->getParent(); |
3361 | CallInst *CI = cast<CallInst>(it); |
3362 | Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); |
3363 | assert(ID && "Not an intrinsic call!")((ID && "Not an intrinsic call!") ? static_cast<void > (0) : __assert_fail ("ID && \"Not an intrinsic call!\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3363, __PRETTY_FUNCTION__)); |
3364 | switch (ID) { |
// These three intrinsics are not widened; they go through
// scalarizeInstruction instead.
3365 | case Intrinsic::assume: |
3366 | case Intrinsic::lifetime_end: |
3367 | case Intrinsic::lifetime_start: |
3368 | scalarizeInstruction(it); |
3369 | break; |
3370 | default: |
// When hasVectorInstrinsicScalarOpd(ID, 1) is true, argument 1 is passed
// through as the original scalar operand rather than its vector value.
3371 | bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1); |
3372 | for (unsigned Part = 0; Part < UF; ++Part) { |
3373 | SmallVector<Value *, 4> Args; |
3374 | for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { |
3375 | if (HasScalarOpd && i == 1) { |
3376 | Args.push_back(CI->getArgOperand(i)); |
3377 | continue; |
3378 | } |
3379 | VectorParts &Arg = getVectorValue(CI->getArgOperand(i)); |
3380 | Args.push_back(Arg[Part]); |
3381 | } |
// Tys holds the single type handed to Intrinsic::getDeclaration: the call's
// own type, or a VF-wide vector of its scalar type when VF > 1.
3382 | Type *Tys[] = {CI->getType()}; |
3383 | if (VF > 1) |
3384 | Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF); |
3385 | |
3386 | Function *F = Intrinsic::getDeclaration(M, ID, Tys); |
3387 | Entry[Part] = Builder.CreateCall(F, Args); |
3388 | } |
3389 | |
3390 | propagateMetadata(Entry, it); |
3391 | break; |
3392 | } |
// This break leaves the opcode switch; the breaks above only leave
// switch (ID), so without it control would fall into 'default'.
3393 | break; |
3394 | } |
3395 | |
3396 | default: |
3397 | // All other instructions are unsupported. Scalarize them. |
3398 | scalarizeInstruction(it); |
3399 | break; |
3400 | }// end of switch. |
3401 | }// end of for_each instr. |
3402 | } |
3403 | |
/// \brief Update ScalarEvolution and the dominator tree after the new blocks
/// have been created.
///
/// SE is told to forget OrigLoop. The bypass blocks, the vector preheader,
/// every block in LoopVectorBody, the middle block and the scalar preheader
/// are added to DT, and the immediate dominators of LoopScalarBody and
/// LoopExitBlock are changed. The resulting tree is verified only inside the
/// DEBUG(...) macro.
///
/// NOTE(review): LoopBypassBlocks[0] and LoopBypassBlocks[1] are indexed
/// directly, so this presumably requires at least two bypass blocks - confirm
/// against the code that fills LoopBypassBlocks.
3404 | void InnerLoopVectorizer::updateAnalysis() { |
3405 | // Forget the original basic block. |
3406 | SE->forgetLoop(OrigLoop); |
3407 | |
3408 | // Update the dominator tree information. |
3409 | assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&((DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock ) && "Entry does not dominate exit.") ? static_cast< void> (0) : __assert_fail ("DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && \"Entry does not dominate exit.\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3410, __PRETTY_FUNCTION__)) |
3410 | "Entry does not dominate exit.")((DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock ) && "Entry does not dominate exit.") ? static_cast< void> (0) : __assert_fail ("DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && \"Entry does not dominate exit.\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3410, __PRETTY_FUNCTION__)); |
3411 | |
// The first bypass block is not added here; the assert above already queries
// DT about it. Each later bypass block is added under its predecessor in the
// list, and the vector preheader under the last bypass block.
3412 | for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) |
3413 | DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]); |
3414 | DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back()); |
3415 | |
3416 | // Due to if predication of stores we might create a sequence of "if(pred) |
3417 | // a[i] = ...; " blocks. |
3418 | for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) { |
3419 | if (i == 0) |
3420 | DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader); |
3421 | else if (isPredicatedBlock(i)) { |
3422 | DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]); |
3423 | } else { |
// NOTE(review): for i == 1 the index i-2 would wrap around; presumably
// isPredicatedBlock(1) always holds when a second body block exists -
// confirm.
3424 | DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]); |
3425 | } |
3426 | } |
3427 | |
3428 | DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]); |
3429 | DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); |
3430 | DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); |
3431 | DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); |
3432 | |
3433 | DEBUG(DT->verifyDomTree())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { DT->verifyDomTree(); } } while (0); |
3434 | } |
3435 | |
3436 | /// \brief Check whether it is safe to if-convert the phi nodes of a block. |
3437 | /// |
3438 | /// Phi nodes with constant expressions that can trap are not safe to if |
3439 | /// convert. |
///
/// The instructions of \p BB are scanned from the beginning and the scan
/// stops at the first one that is not a PHINode (presumably relying on PHIs
/// being grouped at the top of the block).
///
/// \param BB the block whose PHIs are inspected.
/// \returns false if some incoming value of a scanned PHI is a Constant for
/// which canTrap() is true; true otherwise.
3440 | static bool canIfConvertPHINodes(BasicBlock *BB) { |
3441 | for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { |
3442 | PHINode *Phi = dyn_cast<PHINode>(I); |
// First non-PHI instruction: nothing scanned so far was unsafe.
3443 | if (!Phi) |
3444 | return true; |
3445 | for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p) |
3446 | if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p))) |
3447 | if (C->canTrap()) |
3448 | return false; |
3449 | } |
3450 | return true; |
3451 | } |
3452 | |
/// \brief Decide whether the multi-block loop TheLoop can be if-converted.
///
/// Works in two passes over the blocks of TheLoop:
///  1. the pointer operands of every load and store in blocks that do not
///     need predication are collected;
///  2. every block must end in a BranchInst; a block that needs predication
///     must pass blockCanBePredicated with the collected pointers, and any
///     other non-header block must pass canIfConvertPHINodes.
///
/// \returns true if all checks pass. Returns false, after reporting the
/// reason through emitAnalysis, if EnableIfConversion is off or a check
/// fails.
3453 | bool LoopVectorizationLegality::canVectorizeWithIfConvert() { |
3454 | if (!EnableIfConversion) { |
3455 | emitAnalysis(Report() << "if-conversion is disabled"); |
3456 | return false; |
3457 | } |
3458 | |
3459 | assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable")((TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable" ) ? static_cast<void> (0) : __assert_fail ("TheLoop->getNumBlocks() > 1 && \"Single block loops are vectorizable\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 3459, __PRETTY_FUNCTION__)); |
3460 | |
3461 | // A list of pointers that we can safely read and write to. |
// NOTE(review): "SafePointes" looks like a misspelling of "SafePointers".
3462 | SmallPtrSet<Value *, 8> SafePointes; |
3463 | |
3464 | // Collect safe addresses. |
3465 | for (Loop::block_iterator BI = TheLoop->block_begin(), |
3466 | BE = TheLoop->block_end(); BI != BE; ++BI) { |
3467 | BasicBlock *BB = *BI; |
3468 | |
// Only blocks that do not need predication contribute pointers to the set.
3469 | if (blockNeedsPredication(BB)) |
3470 | continue; |
3471 | |
3472 | for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { |
3473 | if (LoadInst *LI = dyn_cast<LoadInst>(I)) |
3474 | SafePointes.insert(LI->getPointerOperand()); |
3475 | else if (StoreInst *SI = dyn_cast<StoreInst>(I)) |
3476 | SafePointes.insert(SI->getPointerOperand()); |
3477 | } |
3478 | } |
3479 | |
3480 | // Collect the blocks that need predication. |
3481 | BasicBlock *Header = TheLoop->getHeader(); |
3482 | for (Loop::block_iterator BI = TheLoop->block_begin(), |
3483 | BE = TheLoop->block_end(); BI != BE; ++BI) { |
3484 | BasicBlock *BB = *BI; |
3485 | |
3486 | // We don't support switch statements inside loops. |
// The test rejects every terminator that is not a BranchInst, although the
// message only mentions switch statements.
3487 | if (!isa<BranchInst>(BB->getTerminator())) { |
3488 | emitAnalysis(Report(BB->getTerminator()) |
3489 | << "loop contains a switch statement"); |
3490 | return false; |
3491 | } |
3492 | |
3493 | // We must be able to predicate all blocks that need to be predicated. |
3494 | if (blockNeedsPredication(BB)) { |
3495 | if (!blockCanBePredicated(BB, SafePointes)) { |
3496 | emitAnalysis(Report(BB->getTerminator()) |
3497 | << "control flow cannot be substituted for a select"); |
3498 | return false; |
3499 | } |
// Blocks that need no predication still have their PHIs checked, except for
// the loop header.
3500 | } else if (BB != Header && !canIfConvertPHINodes(BB)) { |
3501 | emitAnalysis(Report(BB->getTerminator()) |
3502 | << "control flow cannot be substituted for a select"); |
3503 | return false; |
3504 | } |
3505 | } |
3506 | |
3507 | // We can if-convert this loop. |
3508 | return true; |
3509 | } |
3510 | |
/// \brief Top-level legality check for vectorizing TheLoop.
///
/// The checks run in this order and the first failure returns false:
///  1. the loop has a preheader;
///  2. the loop has no sub-loops;
///  3. the loop has exactly one backedge;
///  4. the loop has a single exiting block;
///  5. a loop with more than one block passes canVectorizeWithIfConvert;
///  6. ScalarEvolution can compute the backedge-taken count;
///  7. canVectorizeInstrs succeeds;
///  8. canVectorizeMemory succeeds.
/// Failures in steps 1-4 and 6 are reported here through emitAnalysis. On
/// success collectLoopUniforms is called before returning.
///
/// \returns true if every check passes.
3511 | bool LoopVectorizationLegality::canVectorize() { |
3512 | // We must have a loop in canonical form. Loops with indirectbr in them cannot |
3513 | // be canonicalized. |
3514 | if (!TheLoop->getLoopPreheader()) { |
3515 | emitAnalysis( |
3516 | Report() << "loop control flow is not understood by vectorizer"); |
3517 | return false; |
3518 | } |
3519 | |
3520 | // We can only vectorize innermost loops. |
3521 | if (TheLoop->getSubLoopsVector().size()) { |
3522 | emitAnalysis(Report() << "loop is not the innermost loop"); |
3523 | return false; |
3524 | } |
3525 | |
3526 | // We must have a single backedge. |
3527 | if (TheLoop->getNumBackEdges() != 1) { |
3528 | emitAnalysis( |
3529 | Report() << "loop control flow is not understood by vectorizer"); |
3530 | return false; |
3531 | } |
3532 | |
3533 | // We must have a single exiting block. |
3534 | if (!TheLoop->getExitingBlock()) { |
3535 | emitAnalysis( |
3536 | Report() << "loop control flow is not understood by vectorizer"); |
3537 | return false; |
3538 | } |
3539 | |
3540 | // Log the name of the loop header (only when "loop-vectorize" debug output is on). |
3541 | DEBUG(dbgs() << "LV: Found a loop: " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName() << '\n'; } } while (0) |
3542 | TheLoop->getHeader()->getName() << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName() << '\n'; } } while (0); |
3543 | |
3544 | // Check if we can if-convert non-single-bb loops. |
3545 | unsigned NumBlocks = TheLoop->getNumBlocks(); |
3546 | if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { |
3547 | DEBUG(dbgs() << "LV: Can't if-convert the loop.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Can't if-convert the loop.\n" ; } } while (0); |
3548 | return false; |
3549 | } |
3550 | |
3551 | // ScalarEvolution needs to be able to find the exit count. |
3552 | const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); |
3553 | if (ExitCount == SE->getCouldNotCompute()) { |
3554 | emitAnalysis(Report() << "could not determine number of loop iterations"); |
3555 | DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: SCEV could not compute the loop exit count.\n" ; } } while (0); |
3556 | return false; |
3557 | } |
3558 | |
3559 | // Check if we can vectorize the instructions and CFG in this loop. |
3560 | if (!canVectorizeInstrs()) { |
3561 | DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Can't vectorize the instructions or CFG\n" ; } } while (0); |
3562 | return false; |
3563 | } |
3564 | |
3565 | // Go over each instruction and look at memory deps. |
3566 | if (!canVectorizeMemory()) { |
3567 | DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Can't vectorize due to memory conflicts\n" ; } } while (0); |
3568 | return false; |
3569 | } |
3570 | |
3571 | // Collect all of the variables that remain uniform after vectorization. |
3572 | collectLoopUniforms(); |
3573 | |
3574 | DEBUG(dbgs() << "LV: We can vectorize this loop" <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can vectorize this loop" << (PtrRtCheck.Need ? " (with a runtime bound check)" : "") <<"!\n"; } } while (0) |
3575 | (PtrRtCheck.Need ? " (with a runtime bound check)" : "")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can vectorize this loop" << (PtrRtCheck.Need ? " (with a runtime bound check)" : "") <<"!\n"; } } while (0) |
3576 | <<"!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can vectorize this loop" << (PtrRtCheck.Need ? " (with a runtime bound check)" : "") <<"!\n"; } } while (0); |
3577 | |
3578 | // Okay! We can vectorize. At this point we don't have any other mem analysis |
3579 | // which may limit our maximum vectorization factor, so just return true with |
3580 | // no restrictions. |
3581 | return true; |
3582 | } |
3583 | |
/// \brief Map \p Ty to the integer type used when comparing induction types.
///
/// \param DL data layout queried for the pointer-sized integer type.
/// \param Ty the type to convert.
/// \returns DL.getIntPtrType(Ty) for pointer types, the 32-bit integer type
/// when the scalar size of \p Ty is below 32 bits, and \p Ty itself otherwise.
3584 | static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { |
3585 | if (Ty->isPointerTy()) |
3586 | return DL.getIntPtrType(Ty); |
3587 | |
3588 | // It is possible that char's or short's overflow when we ask for the loop's |
3589 | // trip count, work around this by changing the type size. |
3590 | if (Ty->getScalarSizeInBits() < 32) |
3591 | return Type::getInt32Ty(Ty->getContext()); |
3592 | |
3593 | return Ty; |
3594 | } |
3595 | |
/// \brief Return the wider of two types after both have been passed through
/// convertPointerToIntegerType.
///
/// \param DL data layout forwarded to convertPointerToIntegerType.
/// \param Ty0 first candidate.
/// \param Ty1 second candidate.
/// \returns the converted \p Ty0 if its scalar size is strictly greater than
/// that of the converted \p Ty1; otherwise (including ties) the converted
/// \p Ty1.
3596 | static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) { |
3597 | Ty0 = convertPointerToIntegerType(DL, Ty0); |
3598 | Ty1 = convertPointerToIntegerType(DL, Ty1); |
3599 | if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) |
3600 | return Ty0; |
3601 | return Ty1; |
3602 | } |
3603 | |
3604 | /// \brief Check that the instruction has outside loop users and is not an |
3605 | /// identified reduction variable. |
///
/// \param TheLoop    the loop that users are tested against.
/// \param Inst       the instruction whose users are inspected; each user is
///                   converted with cast<Instruction>.
/// \param Reductions values that are allowed to have users outside the loop.
/// \returns true if \p Inst is not in \p Reductions and at least one of its
/// users is not contained in \p TheLoop; false otherwise.
3606 | static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, |
3607 | SmallPtrSetImpl<Value *> &Reductions) { |
3608 | // Reduction instructions are allowed to have exit users. All other |
3609 | // instructions must not have external users. |
3610 | if (!Reductions.count(Inst)) |
3611 | // Check that all of the users of the instruction are inside the loop. |
3612 | for (User *U : Inst->users()) { |
3613 | Instruction *UI = cast<Instruction>(U); |
3614 | // This user may be a reduction exit value. |
3615 | if (!TheLoop->contains(UI)) { |
3616 | DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an outside user for : " << *UI << '\n'; } } while (0); |
3617 | return true; |
3618 | } |
3619 | } |
3620 | return false; |
3621 | } |
3622 | |
3623 | bool LoopVectorizationLegality::canVectorizeInstrs() { |
3624 | BasicBlock *PreHeader = TheLoop->getLoopPreheader(); |
3625 | BasicBlock *Header = TheLoop->getHeader(); |
3626 | |
3627 | // Look for the attribute signaling the absence of NaNs. |
3628 | Function &F = *Header->getParent(); |
3629 | if (F.hasFnAttribute("no-nans-fp-math")) |
3630 | HasFunNoNaNAttr = F.getAttributes().getAttribute( |
3631 | AttributeSet::FunctionIndex, |
3632 | "no-nans-fp-math").getValueAsString() == "true"; |
3633 | |
3634 | // For each block in the loop. |
3635 | for (Loop::block_iterator bb = TheLoop->block_begin(), |
3636 | be = TheLoop->block_end(); bb != be; ++bb) { |
3637 | |
3638 | // Scan the instructions in the block and look for hazards. |
3639 | for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; |
3640 | ++it) { |
3641 | |
3642 | if (PHINode *Phi = dyn_cast<PHINode>(it)) { |
3643 | Type *PhiTy = Phi->getType(); |
3644 | // Check that this PHI type is allowed. |
3645 | if (!PhiTy->isIntegerTy() && |
3646 | !PhiTy->isFloatingPointTy() && |
3647 | !PhiTy->isPointerTy()) { |
3648 | emitAnalysis(Report(it) |
3649 | << "loop control flow is not understood by vectorizer"); |
3650 | DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an non-int non-pointer PHI.\n" ; } } while (0); |
3651 | return false; |
3652 | } |
3653 | |
3654 | // If this PHINode is not in the header block, then we know that we |
3655 | // can convert it to select during if-conversion. No need to check if |
3656 | // the PHIs in this block are induction or reduction variables. |
3657 | if (*bb != Header) { |
3658 | // Check that this instruction has no outside users or is an |
3659 | // identified reduction value with an outside user. |
3660 | if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) |
3661 | continue; |
3662 | emitAnalysis(Report(it) << "value could not be identified as " |
3663 | "an induction or reduction variable"); |
3664 | return false; |
3665 | } |
3666 | |
3667 | // We only allow if-converted PHIs with more than two incoming values. |
3668 | if (Phi->getNumIncomingValues() != 2) { |
3669 | emitAnalysis(Report(it) |
3670 | << "control flow not understood by vectorizer"); |
3671 | DEBUG(dbgs() << "LV: Found an invalid PHI.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an invalid PHI.\n" ; } } while (0); |
3672 | return false; |
3673 | } |
3674 | |
3675 | // This is the value coming from the preheader. |
3676 | Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); |
3677 | // Check if this is an induction variable. |
3678 | InductionKind IK = isInductionVariable(Phi); |
3679 | |
3680 | if (IK_NoInduction != IK) { |
3681 | // Get the widest type. |
3682 | if (!WidestIndTy) |
3683 | WidestIndTy = convertPointerToIntegerType(*DL, PhiTy); |
3684 | else |
3685 | WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy); |
3686 | |
3687 | // Int inductions are special because we only allow one IV. |
3688 | if (IK == IK_IntInduction) { |
3689 | // Use the phi node with the widest type as induction. Use the last |
3690 | // one if there are multiple (no good reason for doing this other |
3691 | // than it is expedient). |
3692 | if (!Induction || PhiTy == WidestIndTy) |
3693 | Induction = Phi; |
3694 | } |
3695 | |
3696 | DEBUG(dbgs() << "LV: Found an induction variable.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an induction variable.\n" ; } } while (0); |
3697 | Inductions[Phi] = InductionInfo(StartValue, IK); |
3698 | |
3699 | // Until we explicitly handle the case of an induction variable with |
3700 | // an outside loop user we have to give up vectorizing this loop. |
3701 | if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { |
3702 | emitAnalysis(Report(it) << "use of induction value outside of the " |
3703 | "loop is not handled by vectorizer"); |
3704 | return false; |
3705 | } |
3706 | |
3707 | continue; |
3708 | } |
3709 | |
3710 | if (AddReductionVar(Phi, RK_IntegerAdd)) { |
3711 | DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an ADD reduction PHI." << *Phi <<"\n"; } } while (0); |
3712 | continue; |
3713 | } |
3714 | if (AddReductionVar(Phi, RK_IntegerMult)) { |
3715 | DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a MUL reduction PHI." << *Phi <<"\n"; } } while (0); |
3716 | continue; |
3717 | } |
3718 | if (AddReductionVar(Phi, RK_IntegerOr)) { |
3719 | DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an OR reduction PHI." << *Phi <<"\n"; } } while (0); |
3720 | continue; |
3721 | } |
3722 | if (AddReductionVar(Phi, RK_IntegerAnd)) { |
3723 | DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an AND reduction PHI." << *Phi <<"\n"; } } while (0); |
3724 | continue; |
3725 | } |
3726 | if (AddReductionVar(Phi, RK_IntegerXor)) { |
3727 | DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a XOR reduction PHI." << *Phi <<"\n"; } } while (0); |
3728 | continue; |
3729 | } |
3730 | if (AddReductionVar(Phi, RK_IntegerMinMax)) { |
3731 | DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a MINMAX reduction PHI." << *Phi <<"\n"; } } while (0); |
3732 | continue; |
3733 | } |
3734 | if (AddReductionVar(Phi, RK_FloatMult)) { |
3735 | DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an FMult reduction PHI." << *Phi <<"\n"; } } while (0); |
3736 | continue; |
3737 | } |
3738 | if (AddReductionVar(Phi, RK_FloatAdd)) { |
3739 | DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an FAdd reduction PHI." << *Phi <<"\n"; } } while (0); |
3740 | continue; |
3741 | } |
3742 | if (AddReductionVar(Phi, RK_FloatMinMax)) { |
3743 | DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< *Phi <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an float MINMAX reduction PHI." << *Phi << "\n"; } } while (0) |
3744 | "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an float MINMAX reduction PHI." << *Phi << "\n"; } } while (0); |
3745 | continue; |
3746 | } |
3747 | |
3748 | emitAnalysis(Report(it) << "value that could not be identified as " |
3749 | "reduction is used outside the loop"); |
3750 | DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an unidentified PHI." << *Phi <<"\n"; } } while (0); |
3751 | return false; |
3752 | }// end of PHI handling |
3753 | |
3754 | // We still don't handle functions. However, we can ignore dbg intrinsic |
3755 | // calls and we do handle certain intrinsic and libm functions. |
3756 | CallInst *CI = dyn_cast<CallInst>(it); |
3757 | if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) { |
3758 | emitAnalysis(Report(it) << "call instruction cannot be vectorized"); |
3759 | DEBUG(dbgs() << "LV: Found a call site.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a call site.\n" ; } } while (0); |
3760 | return false; |
3761 | } |
3762 | |
3763 | // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the |
3764 | // second argument is the same (i.e. loop invariant) |
3765 | if (CI && |
3766 | hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) { |
3767 | if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) { |
3768 | emitAnalysis(Report(it) |
3769 | << "intrinsic instruction cannot be vectorized"); |
3770 | DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"; } } while (0); |
3771 | return false; |
3772 | } |
3773 | } |
3774 | |
3775 | // Check that the instruction return type is vectorizable. |
3776 | // Also, we can't vectorize extractelement instructions. |
3777 | if ((!VectorType::isValidElementType(it->getType()) && |
3778 | !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) { |
3779 | emitAnalysis(Report(it) |
3780 | << "instruction return type cannot be vectorized"); |
3781 | DEBUG(dbgs() << "LV: Found unvectorizable type.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found unvectorizable type.\n" ; } } while (0); |
3782 | return false; |
3783 | } |
3784 | |
3785 | // Check that the stored type is vectorizable. |
3786 | if (StoreInst *ST = dyn_cast<StoreInst>(it)) { |
3787 | Type *T = ST->getValueOperand()->getType(); |
3788 | if (!VectorType::isValidElementType(T)) { |
3789 | emitAnalysis(Report(ST) << "store instruction cannot be vectorized"); |
3790 | return false; |
3791 | } |
3792 | if (EnableMemAccessVersioning) |
3793 | collectStridedAcccess(ST); |
3794 | } |
3795 | |
3796 | if (EnableMemAccessVersioning) |
3797 | if (LoadInst *LI = dyn_cast<LoadInst>(it)) |
3798 | collectStridedAcccess(LI); |
3799 | |
3800 | // Reduction instructions are allowed to have exit users. |
3801 | // All other instructions must not have external users. |
3802 | if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { |
3803 | emitAnalysis(Report(it) << "value cannot be used outside the loop"); |
3804 | return false; |
3805 | } |
3806 | |
3807 | } // next instr. |
3808 | |
3809 | } |
3810 | |
3811 | if (!Induction) { |
3812 | DEBUG(dbgs() << "LV: Did not find one integer induction var.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Did not find one integer induction var.\n" ; } } while (0); |
3813 | if (Inductions.empty()) { |
3814 | emitAnalysis(Report() |
3815 | << "loop induction variable could not be identified"); |
3816 | return false; |
3817 | } |
3818 | } |
3819 | |
3820 | return true; |
3821 | } |
3822 | |
3823 | ///\brief Remove GEPs whose indices but the last one are loop invariant and |
3824 | /// return the induction operand of the gep pointer. |
3825 | static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, |
3826 | const DataLayout *DL, Loop *Lp) { |
3827 | GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); |
3828 | if (!GEP) |
3829 | return Ptr; |
3830 | |
3831 | unsigned InductionOperand = getGEPInductionOperand(DL, GEP); |
3832 | |
3833 | // Check that all of the gep indices are uniform except for our induction |
3834 | // operand. |
3835 | for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i) |
3836 | if (i != InductionOperand && |
3837 | !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp)) |
3838 | return Ptr; |
3839 | return GEP->getOperand(InductionOperand); |
3840 | } |
3841 | |
3842 | ///\brief Look for a cast use of the passed value. |
3843 | static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) { |
3844 | Value *UniqueCast = nullptr; |
3845 | for (User *U : Ptr->users()) { |
3846 | CastInst *CI = dyn_cast<CastInst>(U); |
3847 | if (CI && CI->getType() == Ty) { |
3848 | if (!UniqueCast) |
3849 | UniqueCast = CI; |
3850 | else |
3851 | return nullptr; |
3852 | } |
3853 | } |
3854 | return UniqueCast; |
3855 | } |
3856 | |
3857 | ///\brief Get the stride of a pointer access in a loop. |
3858 | /// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a |
3859 | /// pointer to the Value, or null otherwise. |
3860 | static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, |
3861 | const DataLayout *DL, Loop *Lp) { |
3862 | const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); |
3863 | if (!PtrTy || PtrTy->isAggregateType()) |
3864 | return nullptr; |
3865 | |
3866 | // Try to remove a gep instruction to make the pointer (actually index at this |
3867 | // point) easier analyzable. If OrigPtr is equal to Ptr we are analzying the |
3868 | // pointer, otherwise, we are analyzing the index. |
3869 | Value *OrigPtr = Ptr; |
3870 | |
3871 | // The size of the pointer access. |
3872 | int64_t PtrAccessSize = 1; |
3873 | |
3874 | Ptr = stripGetElementPtr(Ptr, SE, DL, Lp); |
3875 | const SCEV *V = SE->getSCEV(Ptr); |
3876 | |
3877 | if (Ptr != OrigPtr) |
3878 | // Strip off casts. |
3879 | while (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) |
3880 | V = C->getOperand(); |
3881 | |
3882 | const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V); |
3883 | if (!S) |
3884 | return nullptr; |
3885 | |
3886 | V = S->getStepRecurrence(*SE); |
3887 | if (!V) |
3888 | return nullptr; |
3889 | |
3890 | // Strip off the size of access multiplication if we are still analyzing the |
3891 | // pointer. |
3892 | if (OrigPtr == Ptr) { |
3893 | DL->getTypeAllocSize(PtrTy->getElementType()); |
3894 | if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) { |
3895 | if (M->getOperand(0)->getSCEVType() != scConstant) |
3896 | return nullptr; |
3897 | |
3898 | const APInt &APStepVal = |
3899 | cast<SCEVConstant>(M->getOperand(0))->getValue()->getValue(); |
3900 | |
3901 | // Huge step value - give up. |
3902 | if (APStepVal.getBitWidth() > 64) |
3903 | return nullptr; |
3904 | |
3905 | int64_t StepVal = APStepVal.getSExtValue(); |
3906 | if (PtrAccessSize != StepVal) |
3907 | return nullptr; |
3908 | V = M->getOperand(1); |
3909 | } |
3910 | } |
3911 | |
3912 | // Strip off casts. |
3913 | Type *StripedOffRecurrenceCast = nullptr; |
3914 | if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) { |
3915 | StripedOffRecurrenceCast = C->getType(); |
3916 | V = C->getOperand(); |
3917 | } |
3918 | |
3919 | // Look for the loop invariant symbolic value. |
3920 | const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V); |
3921 | if (!U) |
3922 | return nullptr; |
3923 | |
3924 | Value *Stride = U->getValue(); |
3925 | if (!Lp->isLoopInvariant(Stride)) |
3926 | return nullptr; |
3927 | |
3928 | // If we have stripped off the recurrence cast we have to make sure that we |
3929 | // return the value that is used in this loop so that we can replace it later. |
3930 | if (StripedOffRecurrenceCast) |
3931 | Stride = getUniqueCastUse(Stride, Lp, StripedOffRecurrenceCast); |
3932 | |
3933 | return Stride; |
3934 | } |
3935 | |
3936 | void LoopVectorizationLegality::collectStridedAcccess(Value *MemAccess) { |
3937 | Value *Ptr = nullptr; |
3938 | if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess)) |
3939 | Ptr = LI->getPointerOperand(); |
3940 | else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess)) |
3941 | Ptr = SI->getPointerOperand(); |
3942 | else |
3943 | return; |
3944 | |
3945 | Value *Stride = getStrideFromPointer(Ptr, SE, DL, TheLoop); |
3946 | if (!Stride) |
3947 | return; |
3948 | |
3949 | DEBUG(dbgs() << "LV: Found a strided access that we can version")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a strided access that we can version" ; } } while (0); |
3950 | DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n"; } } while (0); |
3951 | Strides[Ptr] = Stride; |
3952 | StrideSet.insert(Stride); |
3953 | } |
3954 | |
3955 | void LoopVectorizationLegality::collectLoopUniforms() { |
3956 | // We now know that the loop is vectorizable! |
3957 | // Collect variables that will remain uniform after vectorization. |
3958 | std::vector<Value*> Worklist; |
3959 | BasicBlock *Latch = TheLoop->getLoopLatch(); |
3960 | |
3961 | // Start with the conditional branch and walk up the block. |
3962 | Worklist.push_back(Latch->getTerminator()->getOperand(0)); |
3963 | |
3964 | // Also add all consecutive pointer values; these values will be uniform |
3965 | // after vectorization (and subsequent cleanup) and, until revectorization is |
3966 | // supported, all dependencies must also be uniform. |
3967 | for (Loop::block_iterator B = TheLoop->block_begin(), |
3968 | BE = TheLoop->block_end(); B != BE; ++B) |
3969 | for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end(); |
3970 | I != IE; ++I) |
3971 | if (I->getType()->isPointerTy() && isConsecutivePtr(I)) |
3972 | Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); |
3973 | |
3974 | while (Worklist.size()) { |
3975 | Instruction *I = dyn_cast<Instruction>(Worklist.back()); |
3976 | Worklist.pop_back(); |
3977 | |
3978 | // Look at instructions inside this loop. |
3979 | // Stop when reaching PHI nodes. |
3980 | // TODO: we need to follow values all over the loop, not only in this block. |
3981 | if (!I || !TheLoop->contains(I) || isa<PHINode>(I)) |
3982 | continue; |
3983 | |
3984 | // This is a known uniform. |
3985 | Uniforms.insert(I); |
3986 | |
3987 | // Insert all operands. |
3988 | Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); |
3989 | } |
3990 | } |
3991 | |
3992 | namespace { |
3993 | /// \brief Analyses memory accesses in a loop. |
3994 | /// |
3995 | /// Checks whether run time pointer checks are needed and builds sets for data |
3996 | /// dependence checking. |
3997 | class AccessAnalysis { |
3998 | public: |
3999 | /// \brief Read or write access location. |
4000 | typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; |
4001 | typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet; |
4002 | |
4003 | /// \brief Set of potential dependent memory accesses. |
4004 | typedef EquivalenceClasses<MemAccessInfo> DepCandidates; |
4005 | |
4006 | AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) : |
4007 | DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {} |
4008 | |
4009 | /// \brief Register a load and whether it is only read from. |
4010 | void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) { |
4011 | Value *Ptr = const_cast<Value*>(Loc.Ptr); |
4012 | AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags); |
4013 | Accesses.insert(MemAccessInfo(Ptr, false)); |
4014 | if (IsReadOnly) |
4015 | ReadOnlyPtr.insert(Ptr); |
4016 | } |
4017 | |
4018 | /// \brief Register a store. |
4019 | void addStore(AliasAnalysis::Location &Loc) { |
4020 | Value *Ptr = const_cast<Value*>(Loc.Ptr); |
4021 | AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags); |
4022 | Accesses.insert(MemAccessInfo(Ptr, true)); |
4023 | } |
4024 | |
4025 | /// \brief Check whether we can check the pointers at runtime for |
4026 | /// non-intersection. |
4027 | bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck, |
4028 | unsigned &NumComparisons, ScalarEvolution *SE, |
4029 | Loop *TheLoop, ValueToValueMap &Strides, |
4030 | bool ShouldCheckStride = false); |
4031 | |
4032 | /// \brief Goes over all memory accesses, checks whether a RT check is needed |
4033 | /// and builds sets of dependent accesses. |
4034 | void buildDependenceSets() { |
4035 | processMemAccesses(); |
4036 | } |
4037 | |
4038 | bool isRTCheckNeeded() { return IsRTCheckNeeded; } |
4039 | |
4040 | bool isDependencyCheckNeeded() { return !CheckDeps.empty(); } |
4041 | void resetDepChecks() { CheckDeps.clear(); } |
4042 | |
4043 | MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; } |
4044 | |
4045 | private: |
4046 | typedef SetVector<MemAccessInfo> PtrAccessSet; |
4047 | |
4048 | /// \brief Go over all memory access and check whether runtime pointer checks |
4049 | /// are needed /// and build sets of dependency check candidates. |
4050 | void processMemAccesses(); |
4051 | |
4052 | /// Set of all accesses. |
4053 | PtrAccessSet Accesses; |
4054 | |
4055 | /// Set of accesses that need a further dependence check. |
4056 | MemAccessInfoSet CheckDeps; |
4057 | |
4058 | /// Set of pointers that are read only. |
4059 | SmallPtrSet<Value*, 16> ReadOnlyPtr; |
4060 | |
4061 | const DataLayout *DL; |
4062 | |
4063 | /// An alias set tracker to partition the access set by underlying object and |
4064 | //intrinsic property (such as TBAA metadata). |
4065 | AliasSetTracker AST; |
4066 | |
4067 | /// Sets of potentially dependent accesses - members of one set share an |
4068 | /// underlying pointer. The set "CheckDeps" identfies which sets really need a |
4069 | /// dependence check. |
4070 | DepCandidates &DepCands; |
4071 | |
4072 | bool IsRTCheckNeeded; |
4073 | }; |
4074 | |
4075 | } // end anonymous namespace |
4076 | |
4077 | /// \brief Check whether a pointer can participate in a runtime bounds check. |
4078 | static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides, |
4079 | Value *Ptr) { |
4080 | const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); |
4081 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); |
4082 | if (!AR) |
4083 | return false; |
4084 | |
4085 | return AR->isAffine(); |
4086 | } |
4087 | |
4088 | /// \brief Check the stride of the pointer and ensure that it does not wrap in |
4089 | /// the address space. |
4090 | static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr, |
4091 | const Loop *Lp, ValueToValueMap &StridesMap); |
4092 | |
4093 | bool AccessAnalysis::canCheckPtrAtRT( |
4094 | LoopVectorizationLegality::RuntimePointerCheck &RtCheck, |
4095 | unsigned &NumComparisons, ScalarEvolution *SE, Loop *TheLoop, |
4096 | ValueToValueMap &StridesMap, bool ShouldCheckStride) { |
4097 | // Find pointers with computable bounds. We are going to use this information |
4098 | // to place a runtime bound check. |
4099 | bool CanDoRT = true; |
4100 | |
4101 | bool IsDepCheckNeeded = isDependencyCheckNeeded(); |
4102 | NumComparisons = 0; |
4103 | |
4104 | // We assign a consecutive id to access from different alias sets. |
4105 | // Accesses between different groups doesn't need to be checked. |
4106 | unsigned ASId = 1; |
4107 | for (auto &AS : AST) { |
4108 | unsigned NumReadPtrChecks = 0; |
4109 | unsigned NumWritePtrChecks = 0; |
4110 | |
4111 | // We assign consecutive id to access from different dependence sets. |
4112 | // Accesses within the same set don't need a runtime check. |
4113 | unsigned RunningDepId = 1; |
4114 | DenseMap<Value *, unsigned> DepSetId; |
4115 | |
4116 | for (auto A : AS) { |
4117 | Value *Ptr = A.getValue(); |
4118 | bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true)); |
4119 | MemAccessInfo Access(Ptr, IsWrite); |
4120 | |
4121 | if (IsWrite) |
4122 | ++NumWritePtrChecks; |
4123 | else |
4124 | ++NumReadPtrChecks; |
4125 | |
4126 | if (hasComputableBounds(SE, StridesMap, Ptr) && |
4127 | // When we run after a failing dependency check we have to make sure we |
4128 | // don't have wrapping pointers. |
4129 | (!ShouldCheckStride || |
4130 | isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) { |
4131 | // The id of the dependence set. |
4132 | unsigned DepId; |
4133 | |
4134 | if (IsDepCheckNeeded) { |
4135 | Value *Leader = DepCands.getLeaderValue(Access).getPointer(); |
4136 | unsigned &LeaderId = DepSetId[Leader]; |
4137 | if (!LeaderId) |
4138 | LeaderId = RunningDepId++; |
4139 | DepId = LeaderId; |
4140 | } else |
4141 | // Each access has its own dependence set. |
4142 | DepId = RunningDepId++; |
4143 | |
4144 | RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap); |
4145 | |
4146 | DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n'; } } while (0); |
4147 | } else { |
4148 | CanDoRT = false; |
4149 | } |
4150 | } |
4151 | |
4152 | if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2) |
4153 | NumComparisons += 0; // Only one dependence set. |
4154 | else { |
4155 | NumComparisons += (NumWritePtrChecks * (NumReadPtrChecks + |
4156 | NumWritePtrChecks - 1)); |
4157 | } |
4158 | |
4159 | ++ASId; |
4160 | } |
4161 | |
4162 | // If the pointers that we would use for the bounds comparison have different |
4163 | // address spaces, assume the values aren't directly comparable, so we can't |
4164 | // use them for the runtime check. We also have to assume they could |
4165 | // overlap. In the future there should be metadata for whether address spaces |
4166 | // are disjoint. |
4167 | unsigned NumPointers = RtCheck.Pointers.size(); |
4168 | for (unsigned i = 0; i < NumPointers; ++i) { |
4169 | for (unsigned j = i + 1; j < NumPointers; ++j) { |
4170 | // Only need to check pointers between two different dependency sets. |
4171 | if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j]) |
4172 | continue; |
4173 | // Only need to check pointers in the same alias set. |
4174 | if (RtCheck.AliasSetId[i] != RtCheck.AliasSetId[j]) |
4175 | continue; |
4176 | |
4177 | Value *PtrI = RtCheck.Pointers[i]; |
4178 | Value *PtrJ = RtCheck.Pointers[j]; |
4179 | |
4180 | unsigned ASi = PtrI->getType()->getPointerAddressSpace(); |
4181 | unsigned ASj = PtrJ->getType()->getPointerAddressSpace(); |
4182 | if (ASi != ASj) { |
4183 | DEBUG(dbgs() << "LV: Runtime check would require comparison between"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Runtime check would require comparison between" " different address spaces\n"; } } while (0) |
4184 | " different address spaces\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Runtime check would require comparison between" " different address spaces\n"; } } while (0); |
4185 | return false; |
4186 | } |
4187 | } |
4188 | } |
4189 | |
4190 | return CanDoRT; |
4191 | } |
4192 | |
4193 | void AccessAnalysis::processMemAccesses() { |
4194 | // We process the set twice: first we process read-write pointers, last we |
4195 | // process read-only pointers. This allows us to skip dependence tests for |
4196 | // read-only pointers. |
4197 | |
4198 | DEBUG(dbgs() << "LV: Processing memory accesses...\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Processing memory accesses...\n" ; } } while (0); |
4199 | DEBUG(dbgs() << " AST: "; AST.dump())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << " AST: "; AST.dump(); } } while (0); |
4200 | DEBUG(dbgs() << "LV: Accesses:\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Accesses:\n"; } } while (0); |
4201 | DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0) |
4202 | for (auto A : Accesses)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0) |
4203 | dbgs() << "\t" << *A.getPointer() << " (" <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0) |
4204 | (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ?do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0) |
4205 | "read-only" : "read")) << ")\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0) |
4206 | })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { { for (auto A : Accesses) dbgs() << "\t" << *A.getPointer() << " (" << (A.getInt () ? "write" : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" : "read")) << ")\n"; }; } } while (0); |
4207 | |
4208 | // The AliasSetTracker has nicely partitioned our pointers by metadata |
4209 | // compatibility and potential for underlying-object overlap. As a result, we |
4210 | // only need to check for potential pointer dependencies within each alias |
4211 | // set. |
4212 | for (auto &AS : AST) { |
4213 | // Note that both the alias-set tracker and the alias sets themselves used |
4214 | // linked lists internally and so the iteration order here is deterministic |
4215 | // (matching the original instruction order within each set). |
4216 | |
4217 | bool SetHasWrite = false; |
4218 | |
4219 | // Map of pointers to last access encountered. |
4220 | typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap; |
4221 | UnderlyingObjToAccessMap ObjToLastAccess; |
4222 | |
4223 | // Set of access to check after all writes have been processed. |
4224 | PtrAccessSet DeferredAccesses; |
4225 | |
4226 | // Iterate over each alias set twice, once to process read/write pointers, |
4227 | // and then to process read-only pointers. |
4228 | for (int SetIteration = 0; SetIteration < 2; ++SetIteration) { |
4229 | bool UseDeferred = SetIteration > 0; |
4230 | PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses; |
4231 | |
4232 | for (auto A : AS) { |
4233 | Value *Ptr = A.getValue(); |
4234 | bool IsWrite = S.count(MemAccessInfo(Ptr, true)); |
4235 | |
4236 | // If we're using the deferred access set, then it contains only reads. |
4237 | bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite; |
4238 | if (UseDeferred && !IsReadOnlyPtr) |
4239 | continue; |
4240 | // Otherwise, the pointer must be in the PtrAccessSet, either as a read |
4241 | // or a write. |
4242 | assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||((((IsReadOnlyPtr && UseDeferred) || IsWrite || S.count (MemAccessInfo(Ptr, false))) && "Alias-set pointer not in the access set?" ) ? static_cast<void> (0) : __assert_fail ("((IsReadOnlyPtr && UseDeferred) || IsWrite || S.count(MemAccessInfo(Ptr, false))) && \"Alias-set pointer not in the access set?\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 4244, __PRETTY_FUNCTION__)) |
4243 | S.count(MemAccessInfo(Ptr, false))) &&((((IsReadOnlyPtr && UseDeferred) || IsWrite || S.count (MemAccessInfo(Ptr, false))) && "Alias-set pointer not in the access set?" ) ? static_cast<void> (0) : __assert_fail ("((IsReadOnlyPtr && UseDeferred) || IsWrite || S.count(MemAccessInfo(Ptr, false))) && \"Alias-set pointer not in the access set?\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 4244, __PRETTY_FUNCTION__)) |
4244 | "Alias-set pointer not in the access set?")((((IsReadOnlyPtr && UseDeferred) || IsWrite || S.count (MemAccessInfo(Ptr, false))) && "Alias-set pointer not in the access set?" ) ? static_cast<void> (0) : __assert_fail ("((IsReadOnlyPtr && UseDeferred) || IsWrite || S.count(MemAccessInfo(Ptr, false))) && \"Alias-set pointer not in the access set?\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 4244, __PRETTY_FUNCTION__)); |
4245 | |
4246 | MemAccessInfo Access(Ptr, IsWrite); |
4247 | DepCands.insert(Access); |
4248 | |
4249 | // Memorize read-only pointers for later processing and skip them in the |
4250 | // first round (they need to be checked after we have seen all write |
4251 | // pointers). Note: we also mark pointer that are not consecutive as |
4252 | // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need |
4253 | // the second check for "!IsWrite". |
4254 | if (!UseDeferred && IsReadOnlyPtr) { |
4255 | DeferredAccesses.insert(Access); |
4256 | continue; |
4257 | } |
4258 | |
4259 | // If this is a write - check other reads and writes for conflicts. If |
4260 | // this is a read only check other writes for conflicts (but only if |
4261 | // there is no other write to the ptr - this is an optimization to |
4262 | // catch "a[i] = a[i] + " without having to do a dependence check). |
4263 | if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) { |
4264 | CheckDeps.insert(Access); |
4265 | IsRTCheckNeeded = true; |
4266 | } |
4267 | |
4268 | if (IsWrite) |
4269 | SetHasWrite = true; |
4270 | |
4271 | // Create sets of pointers connected by a shared alias set and |
4272 | // underlying object. |
4273 | typedef SmallVector<Value *, 16> ValueVector; |
4274 | ValueVector TempObjects; |
4275 | GetUnderlyingObjects(Ptr, TempObjects, DL); |
4276 | for (Value *UnderlyingObj : TempObjects) { |
4277 | UnderlyingObjToAccessMap::iterator Prev = |
4278 | ObjToLastAccess.find(UnderlyingObj); |
4279 | if (Prev != ObjToLastAccess.end()) |
4280 | DepCands.unionSets(Access, Prev->second); |
4281 | |
4282 | ObjToLastAccess[UnderlyingObj] = Access; |
4283 | } |
4284 | } |
4285 | } |
4286 | } |
4287 | } |
4288 | |
4289 | namespace { |
4290 | /// \brief Checks memory dependences among accesses to the same underlying |
4291 | /// object to determine whether there vectorization is legal or not (and at |
4292 | /// which vectorization factor). |
4293 | /// |
4294 | /// This class works under the assumption that we already checked that memory |
4295 | /// locations with different underlying pointers are "must-not alias". |
4296 | /// We use the ScalarEvolution framework to symbolically evalutate access |
4297 | /// functions pairs. Since we currently don't restructure the loop we can rely |
4298 | /// on the program order of memory accesses to determine their safety. |
4299 | /// At the moment we will only deem accesses as safe for: |
4300 | /// * A negative constant distance assuming program order. |
4301 | /// |
4302 | /// Safe: tmp = a[i + 1]; OR a[i + 1] = x; |
4303 | /// a[i] = tmp; y = a[i]; |
4304 | /// |
4305 | /// The latter case is safe because later checks guarantuee that there can't |
4306 | /// be a cycle through a phi node (that is, we check that "x" and "y" is not |
4307 | /// the same variable: a header phi can only be an induction or a reduction, a |
4308 | /// reduction can't have a memory sink, an induction can't have a memory |
4309 | /// source). This is important and must not be violated (or we have to |
4310 | /// resort to checking for cycles through memory). |
4311 | /// |
4312 | /// * A positive constant distance assuming program order that is bigger |
4313 | /// than the biggest memory access. |
4314 | /// |
4315 | /// tmp = a[i] OR b[i] = x |
4316 | /// a[i+2] = tmp y = b[i+2]; |
4317 | /// |
4318 | /// Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively. |
4319 | /// |
4320 | /// * Zero distances and all accesses have the same size. |
4321 | /// |
4322 | class MemoryDepChecker { |
4323 | public: |
4324 | typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; |
4325 | typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet; |
4326 | |
4327 | MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L) |
4328 | : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0), |
4329 | ShouldRetryWithRuntimeCheck(false) {} |
4330 | |
4331 | /// \brief Register the location (instructions are given increasing numbers) |
4332 | /// of a write access. |
4333 | void addAccess(StoreInst *SI) { |
4334 | Value *Ptr = SI->getPointerOperand(); |
4335 | Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx); |
4336 | InstMap.push_back(SI); |
4337 | ++AccessIdx; |
4338 | } |
4339 | |
4340 | /// \brief Register the location (instructions are given increasing numbers) |
4341 | /// of a write access. |
4342 | void addAccess(LoadInst *LI) { |
4343 | Value *Ptr = LI->getPointerOperand(); |
4344 | Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx); |
4345 | InstMap.push_back(LI); |
4346 | ++AccessIdx; |
4347 | } |
4348 | |
4349 | /// \brief Check whether the dependencies between the accesses are safe. |
4350 | /// |
4351 | /// Only checks sets with elements in \p CheckDeps. |
4352 | bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, |
4353 | MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides); |
4354 | |
4355 | /// \brief The maximum number of bytes of a vector register we can vectorize |
4356 | /// the accesses safely with. |
4357 | unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } |
4358 | |
4359 | /// \brief In same cases when the dependency check fails we can still |
4360 | /// vectorize the loop with a dynamic array access check. |
4361 | bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; } |
4362 | |
4363 | private: |
4364 | ScalarEvolution *SE; |
4365 | const DataLayout *DL; |
4366 | const Loop *InnermostLoop; |
4367 | |
4368 | /// \brief Maps access locations (ptr, read/write) to program order. |
4369 | DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses; |
4370 | |
4371 | /// \brief Memory access instructions in program order. |
4372 | SmallVector<Instruction *, 16> InstMap; |
4373 | |
4374 | /// \brief The program order index to be used for the next instruction. |
4375 | unsigned AccessIdx; |
4376 | |
4377 | // We can access this many bytes in parallel safely. |
4378 | unsigned MaxSafeDepDistBytes; |
4379 | |
4380 | /// \brief If we see a non-constant dependence distance we can still try to |
4381 | /// vectorize this loop with runtime checks. |
4382 | bool ShouldRetryWithRuntimeCheck; |
4383 | |
4384 | /// \brief Check whether there is a plausible dependence between the two |
4385 | /// accesses. |
4386 | /// |
4387 | /// Access \p A must happen before \p B in program order. The two indices |
4388 | /// identify the index into the program order map. |
4389 | /// |
4390 | /// This function checks whether there is a plausible dependence (or the |
4391 | /// absence of such can't be proved) between the two accesses. If there is a |
4392 | /// plausible dependence but the dependence distance is bigger than one |
4393 | /// element access it records this distance in \p MaxSafeDepDistBytes (if this |
4394 | /// distance is smaller than any other distance encountered so far). |
4395 | /// Otherwise, this function returns true signaling a possible dependence. |
4396 | bool isDependent(const MemAccessInfo &A, unsigned AIdx, |
4397 | const MemAccessInfo &B, unsigned BIdx, |
4398 | ValueToValueMap &Strides); |
4399 | |
4400 | /// \brief Check whether the data dependence could prevent store-load |
4401 | /// forwarding. |
4402 | bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize); |
4403 | }; |
4404 | |
4405 | } // end anonymous namespace |
4406 | |
4407 | static bool isInBoundsGep(Value *Ptr) { |
4408 | if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) |
4409 | return GEP->isInBounds(); |
4410 | return false; |
4411 | } |
4412 | |
4413 | /// \brief Check whether the access through \p Ptr has a constant stride. |
4414 | static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr, |
4415 | const Loop *Lp, ValueToValueMap &StridesMap) { |
4416 | const Type *Ty = Ptr->getType(); |
4417 | assert(Ty->isPointerTy() && "Unexpected non-ptr")((Ty->isPointerTy() && "Unexpected non-ptr") ? static_cast <void> (0) : __assert_fail ("Ty->isPointerTy() && \"Unexpected non-ptr\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 4417, __PRETTY_FUNCTION__)); |
4418 | |
4419 | // Make sure that the pointer does not point to aggregate types. |
4420 | const PointerType *PtrTy = cast<PointerType>(Ty); |
4421 | if (PtrTy->getElementType()->isAggregateType()) { |
4422 | DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr << "\n"; } } while (0) |
4423 | "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr << "\n"; } } while (0); |
4424 | return 0; |
4425 | } |
4426 | |
4427 | const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Ptr); |
4428 | |
4429 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); |
4430 | if (!AR) { |
4431 | DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not an AddRecExpr pointer " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0) |
4432 | << *Ptr << " SCEV: " << *PtrScev << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not an AddRecExpr pointer " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0); |
4433 | return 0; |
4434 | } |
4435 | |
4436 | // The accesss function must stride over the innermost loop. |
4437 | if (Lp != AR->getLoop()) { |
4438 | DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not striding over innermost loop " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0) |
4439 | *Ptr << " SCEV: " << *PtrScev << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not striding over innermost loop " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0); |
4440 | } |
4441 | |
4442 | // The address calculation must not wrap. Otherwise, a dependence could be |
4443 | // inverted. |
4444 | // An inbounds getelementptr that is a AddRec with a unit stride |
4445 | // cannot wrap per definition. The unit stride requirement is checked later. |
4446 | // An getelementptr without an inbounds attribute and unit stride would have |
4447 | // to access the pointer value "0" which is undefined behavior in address |
4448 | // space 0, therefore we can also vectorize this case. |
4449 | bool IsInBoundsGEP = isInBoundsGep(Ptr); |
4450 | bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask); |
4451 | bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0; |
4452 | if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) { |
4453 | DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Pointer may wrap in the address space " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0) |
4454 | << *Ptr << " SCEV: " << *PtrScev << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Pointer may wrap in the address space " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0); |
4455 | return 0; |
4456 | } |
4457 | |
4458 | // Check the step is constant. |
4459 | const SCEV *Step = AR->getStepRecurrence(*SE); |
4460 | |
4461 | // Calculate the pointer stride and check if it is consecutive. |
4462 | const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); |
4463 | if (!C) { |
4464 | DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0) |
4465 | " SCEV: " << *PtrScev << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr << " SCEV: " << *PtrScev << "\n" ; } } while (0); |
4466 | return 0; |
4467 | } |
4468 | |
4469 | int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType()); |
4470 | const APInt &APStepVal = C->getValue()->getValue(); |
4471 | |
4472 | // Huge step value - give up. |
4473 | if (APStepVal.getBitWidth() > 64) |
4474 | return 0; |
4475 | |
4476 | int64_t StepVal = APStepVal.getSExtValue(); |
4477 | |
4478 | // Strided access. |
4479 | int64_t Stride = StepVal / Size; |
4480 | int64_t Rem = StepVal % Size; |
4481 | if (Rem) |
4482 | return 0; |
4483 | |
4484 | // If the SCEV could wrap but we have an inbounds gep with a unit stride we |
4485 | // know we can't "wrap around the address space". In case of address space |
4486 | // zero we know that this won't happen without triggering undefined behavior. |
4487 | if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) && |
4488 | Stride != 1 && Stride != -1) |
4489 | return 0; |
4490 | |
4491 | return Stride; |
4492 | } |
4493 | |
4494 | bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance, |
4495 | unsigned TypeByteSize) { |
4496 | // If loads occur at a distance that is not a multiple of a feasible vector |
4497 | // factor store-load forwarding does not take place. |
4498 | // Positive dependences might cause troubles because vectorizing them might |
4499 | // prevent store-load forwarding making vectorized code run a lot slower. |
4500 | // a[i] = a[i-3] ^ a[i-8]; |
4501 | // The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and |
4502 | // hence on your typical architecture store-load forwarding does not take |
4503 | // place. Vectorizing in such cases does not make sense. |
4504 | // Store-load forwarding distance. |
4505 | const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize; |
4506 | // Maximum vector factor. |
4507 | unsigned MaxVFWithoutSLForwardIssues = MaxVectorWidth*TypeByteSize; |
4508 | if(MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues) |
4509 | MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes; |
4510 | |
4511 | for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues; |
4512 | vf *= 2) { |
4513 | if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) { |
4514 | MaxVFWithoutSLForwardIssues = (vf >>=1); |
4515 | break; |
4516 | } |
4517 | } |
4518 | |
4519 | if (MaxVFWithoutSLForwardIssues< 2*TypeByteSize) { |
4520 | DEBUG(dbgs() << "LV: Distance " << Distance <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Distance " << Distance << " that could cause a store-load forwarding conflict\n" ; } } while (0) |
4521 | " that could cause a store-load forwarding conflict\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Distance " << Distance << " that could cause a store-load forwarding conflict\n" ; } } while (0); |
4522 | return true; |
4523 | } |
4524 | |
4525 | if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes && |
4526 | MaxVFWithoutSLForwardIssues != MaxVectorWidth*TypeByteSize) |
4527 | MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues; |
4528 | return false; |
4529 | } |
4530 | |
4531 | bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, |
4532 | const MemAccessInfo &B, unsigned BIdx, |
4533 | ValueToValueMap &Strides) { |
4534 | assert (AIdx < BIdx && "Must pass arguments in program order")((AIdx < BIdx && "Must pass arguments in program order" ) ? static_cast<void> (0) : __assert_fail ("AIdx < BIdx && \"Must pass arguments in program order\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 4534, __PRETTY_FUNCTION__)); |
4535 | |
4536 | Value *APtr = A.getPointer(); |
4537 | Value *BPtr = B.getPointer(); |
4538 | bool AIsWrite = A.getInt(); |
4539 | bool BIsWrite = B.getInt(); |
4540 | |
4541 | // Two reads are independent. |
4542 | if (!AIsWrite && !BIsWrite) |
4543 | return false; |
4544 | |
4545 | // We cannot check pointers in different address spaces. |
4546 | if (APtr->getType()->getPointerAddressSpace() != |
4547 | BPtr->getType()->getPointerAddressSpace()) |
4548 | return true; |
4549 | |
4550 | const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr); |
4551 | const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr); |
4552 | |
4553 | int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop, Strides); |
4554 | int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop, Strides); |
4555 | |
4556 | const SCEV *Src = AScev; |
4557 | const SCEV *Sink = BScev; |
4558 | |
4559 | // If the induction step is negative we have to invert source and sink of the |
4560 | // dependence. |
4561 | if (StrideAPtr < 0) { |
4562 | //Src = BScev; |
4563 | //Sink = AScev; |
4564 | std::swap(APtr, BPtr); |
4565 | std::swap(Src, Sink); |
4566 | std::swap(AIsWrite, BIsWrite); |
4567 | std::swap(AIdx, BIdx); |
4568 | std::swap(StrideAPtr, StrideBPtr); |
4569 | } |
4570 | |
4571 | const SCEV *Dist = SE->getMinusSCEV(Sink, Src); |
4572 | |
4573 | DEBUG(dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sinkdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink << "(Induction step: " << StrideAPtr << ")\n"; } } while (0) |
4574 | << "(Induction step: " << StrideAPtr << ")\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink << "(Induction step: " << StrideAPtr << ")\n"; } } while (0); |
4575 | DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to " << *InstMap[BIdx] << ": " << *Dist << "\n"; } } while (0) |
4576 | << *InstMap[BIdx] << ": " << *Dist << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to " << *InstMap[BIdx] << ": " << *Dist << "\n"; } } while (0); |
4577 | |
4578 | // Need consecutive accesses. We don't want to vectorize |
4579 | // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in |
4580 | // the address space. |
4581 | if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){ |
4582 | DEBUG(dbgs() << "Non-consecutive pointer access\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "Non-consecutive pointer access\n" ; } } while (0); |
4583 | return true; |
4584 | } |
4585 | |
4586 | const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist); |
4587 | if (!C) { |
4588 | DEBUG(dbgs() << "LV: Dependence because of non-constant distance\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Dependence because of non-constant distance\n" ; } } while (0); |
4589 | ShouldRetryWithRuntimeCheck = true; |
4590 | return true; |
4591 | } |
4592 | |
4593 | Type *ATy = APtr->getType()->getPointerElementType(); |
4594 | Type *BTy = BPtr->getType()->getPointerElementType(); |
4595 | unsigned TypeByteSize = DL->getTypeAllocSize(ATy); |
4596 | |
4597 | // Negative distances are not plausible dependencies. |
4598 | const APInt &Val = C->getValue()->getValue(); |
4599 | if (Val.isNegative()) { |
4600 | bool IsTrueDataDependence = (AIsWrite && !BIsWrite); |
4601 | if (IsTrueDataDependence && |
4602 | (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) || |
4603 | ATy != BTy)) |
4604 | return true; |
4605 | |
4606 | DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Dependence is negative: NoDep\n" ; } } while (0); |
4607 | return false; |
4608 | } |
4609 | |
4610 | // Write to the same location with the same size. |
4611 | // Could be improved to assert type sizes are the same (i32 == float, etc). |
4612 | if (Val == 0) { |
4613 | if (ATy == BTy) |
4614 | return false; |
4615 | DEBUG(dbgs() << "LV: Zero dependence difference but different types\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Zero dependence difference but different types\n" ; } } while (0); |
4616 | return true; |
4617 | } |
4618 | |
4619 | assert(Val.isStrictlyPositive() && "Expect a positive value")((Val.isStrictlyPositive() && "Expect a positive value" ) ? static_cast<void> (0) : __assert_fail ("Val.isStrictlyPositive() && \"Expect a positive value\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 4619, __PRETTY_FUNCTION__)); |
4620 | |
4621 | // Positive distance bigger than max vectorization factor. |
4622 | if (ATy != BTy) { |
4623 | DEBUG(dbgs() <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: ReadWrite-Write positive dependency with different types\n" ; } } while (0) |
4624 | "LV: ReadWrite-Write positive dependency with different types\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: ReadWrite-Write positive dependency with different types\n" ; } } while (0); |
4625 | return false; |
4626 | } |
4627 | |
4628 | unsigned Distance = (unsigned) Val.getZExtValue(); |
4629 | |
4630 | // Bail out early if passed-in parameters make vectorization not feasible. |
4631 | unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1; |
4632 | unsigned ForcedUnroll = VectorizationInterleave ? VectorizationInterleave : 1; |
4633 | |
4634 | // The distance must be bigger than the size needed for a vectorized version |
4635 | // of the operation and the size of the vectorized operation must not be |
4636 | // bigger than the currrent maximum size. |
4637 | if (Distance < 2*TypeByteSize || |
4638 | 2*TypeByteSize > MaxSafeDepDistBytes || |
4639 | Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { |
4640 | DEBUG(dbgs() << "LV: Failure because of Positive distance "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Failure because of Positive distance " << Val.getSExtValue() << '\n'; } } while (0) |
4641 | << Val.getSExtValue() << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Failure because of Positive distance " << Val.getSExtValue() << '\n'; } } while (0); |
4642 | return true; |
4643 | } |
4644 | |
4645 | MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? |
4646 | Distance : MaxSafeDepDistBytes; |
4647 | |
4648 | bool IsTrueDataDependence = (!AIsWrite && BIsWrite); |
4649 | if (IsTrueDataDependence && |
4650 | couldPreventStoreLoadForward(Distance, TypeByteSize)) |
4651 | return true; |
4652 | |
4653 | DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Positive distance " << Val.getSExtValue() << " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n'; } } while ( 0) |
4654 | " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Positive distance " << Val.getSExtValue() << " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n'; } } while ( 0); |
4655 | |
4656 | return false; |
4657 | } |
4658 | |
4659 | bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, |
4660 | MemAccessInfoSet &CheckDeps, |
4661 | ValueToValueMap &Strides) { |
4662 | |
4663 | MaxSafeDepDistBytes = -1U; |
4664 | while (!CheckDeps.empty()) { |
4665 | MemAccessInfo CurAccess = *CheckDeps.begin(); |
4666 | |
4667 | // Get the relevant memory access set. |
4668 | EquivalenceClasses<MemAccessInfo>::iterator I = |
4669 | AccessSets.findValue(AccessSets.getLeaderValue(CurAccess)); |
4670 | |
4671 | // Check accesses within this set. |
4672 | EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE; |
4673 | AI = AccessSets.member_begin(I), AE = AccessSets.member_end(); |
4674 | |
4675 | // Check every access pair. |
4676 | while (AI != AE) { |
4677 | CheckDeps.erase(*AI); |
4678 | EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI); |
4679 | while (OI != AE) { |
4680 | // Check every accessing instruction pair in program order. |
4681 | for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(), |
4682 | I1E = Accesses[*AI].end(); I1 != I1E; ++I1) |
4683 | for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(), |
4684 | I2E = Accesses[*OI].end(); I2 != I2E; ++I2) { |
4685 | if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2, Strides)) |
4686 | return false; |
4687 | if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1, Strides)) |
4688 | return false; |
4689 | } |
4690 | ++OI; |
4691 | } |
4692 | AI++; |
4693 | } |
4694 | } |
4695 | return true; |
4696 | } |
4697 | |
4698 | bool LoopVectorizationLegality::canVectorizeMemory() { |
4699 | |
4700 | typedef SmallVector<Value*, 16> ValueVector; |
4701 | typedef SmallPtrSet<Value*, 16> ValueSet; |
4702 | |
4703 | // Holds the Load and Store *instructions*. |
4704 | ValueVector Loads; |
4705 | ValueVector Stores; |
4706 | |
4707 | // Holds all the different accesses in the loop. |
4708 | unsigned NumReads = 0; |
4709 | unsigned NumReadWrites = 0; |
4710 | |
4711 | PtrRtCheck.Pointers.clear(); |
4712 | PtrRtCheck.Need = false; |
4713 | |
4714 | const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); |
4715 | MemoryDepChecker DepChecker(SE, DL, TheLoop); |
4716 | |
4717 | // For each block. |
4718 | for (Loop::block_iterator bb = TheLoop->block_begin(), |
4719 | be = TheLoop->block_end(); bb != be; ++bb) { |
4720 | |
4721 | // Scan the BB and collect legal loads and stores. |
4722 | for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; |
4723 | ++it) { |
4724 | |
4725 | // If this is a load, save it. If this instruction can read from memory |
4726 | // but is not a load, then we quit. Notice that we don't handle function |
4727 | // calls that read or write. |
4728 | if (it->mayReadFromMemory()) { |
4729 | // Many math library functions read the rounding mode. We will only |
4730 | // vectorize a loop if it contains known function calls that don't set |
4731 | // the flag. Therefore, it is safe to ignore this read from memory. |
4732 | CallInst *Call = dyn_cast<CallInst>(it); |
4733 | if (Call && getIntrinsicIDForCall(Call, TLI)) |
4734 | continue; |
4735 | |
4736 | LoadInst *Ld = dyn_cast<LoadInst>(it); |
4737 | if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) { |
4738 | emitAnalysis(Report(Ld) |
4739 | << "read with atomic ordering or volatile read"); |
4740 | DEBUG(dbgs() << "LV: Found a non-simple load.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a non-simple load.\n" ; } } while (0); |
4741 | return false; |
4742 | } |
4743 | NumLoads++; |
4744 | Loads.push_back(Ld); |
4745 | DepChecker.addAccess(Ld); |
4746 | continue; |
4747 | } |
4748 | |
4749 | // Save 'store' instructions. Abort if other instructions write to memory. |
4750 | if (it->mayWriteToMemory()) { |
4751 | StoreInst *St = dyn_cast<StoreInst>(it); |
4752 | if (!St) { |
4753 | emitAnalysis(Report(it) << "instruction cannot be vectorized"); |
4754 | return false; |
4755 | } |
4756 | if (!St->isSimple() && !IsAnnotatedParallel) { |
4757 | emitAnalysis(Report(St) |
4758 | << "write with atomic ordering or volatile write"); |
4759 | DEBUG(dbgs() << "LV: Found a non-simple store.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a non-simple store.\n" ; } } while (0); |
4760 | return false; |
4761 | } |
4762 | NumStores++; |
4763 | Stores.push_back(St); |
4764 | DepChecker.addAccess(St); |
4765 | } |
4766 | } // Next instr. |
4767 | } // Next block. |
4768 | |
4769 | // Now we have two lists that hold the loads and the stores. |
4770 | // Next, we find the pointers that they use. |
4771 | |
4772 | // Check if we see any stores. If there are no stores, then we don't |
4773 | // care if the pointers are *restrict*. |
4774 | if (!Stores.size()) { |
4775 | DEBUG(dbgs() << "LV: Found a read-only loop!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a read-only loop!\n" ; } } while (0); |
4776 | return true; |
4777 | } |
4778 | |
4779 | AccessAnalysis::DepCandidates DependentAccesses; |
4780 | AccessAnalysis Accesses(DL, AA, DependentAccesses); |
4781 | |
4782 | // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects |
4783 | // multiple times on the same object. If the ptr is accessed twice, once |
4784 | // for read and once for write, it will only appear once (on the write |
4785 | // list). This is okay, since we are going to check for conflicts between |
4786 | // writes and between reads and writes, but not between reads and reads. |
4787 | ValueSet Seen; |
4788 | |
4789 | ValueVector::iterator I, IE; |
4790 | for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { |
4791 | StoreInst *ST = cast<StoreInst>(*I); |
4792 | Value* Ptr = ST->getPointerOperand(); |
4793 | |
4794 | if (isUniform(Ptr)) { |
4795 | emitAnalysis( |
4796 | Report(ST) |
4797 | << "write to a loop invariant address could not be vectorized"); |
4798 | DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We don't allow storing to uniform addresses\n" ; } } while (0); |
4799 | return false; |
4800 | } |
4801 | |
4802 | // If we did *not* see this pointer before, insert it to the read-write |
4803 | // list. At this phase it is only a 'write' list. |
4804 | if (Seen.insert(Ptr).second) { |
4805 | ++NumReadWrites; |
4806 | |
4807 | AliasAnalysis::Location Loc = AA->getLocation(ST); |
4808 | // The TBAA metadata could have a control dependency on the predication |
4809 | // condition, so we cannot rely on it when determining whether or not we |
4810 | // need runtime pointer checks. |
4811 | if (blockNeedsPredication(ST->getParent())) |
4812 | Loc.AATags.TBAA = nullptr; |
4813 | |
4814 | Accesses.addStore(Loc); |
4815 | } |
4816 | } |
4817 | |
4818 | if (IsAnnotatedParallel) { |
4819 | DEBUG(dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: A loop annotated parallel, ignore memory dependency " << "checks.\n"; } } while (0) |
4820 | << "LV: A loop annotated parallel, ignore memory dependency "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: A loop annotated parallel, ignore memory dependency " << "checks.\n"; } } while (0) |
4821 | << "checks.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: A loop annotated parallel, ignore memory dependency " << "checks.\n"; } } while (0); |
4822 | return true; |
4823 | } |
4824 | |
4825 | for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { |
4826 | LoadInst *LD = cast<LoadInst>(*I); |
4827 | Value* Ptr = LD->getPointerOperand(); |
4828 | // If we did *not* see this pointer before, insert it to the |
4829 | // read list. If we *did* see it before, then it is already in |
4830 | // the read-write list. This allows us to vectorize expressions |
4831 | // such as A[i] += x; Because the address of A[i] is a read-write |
4832 | // pointer. This only works if the index of A[i] is consecutive. |
4833 | // If the address of i is unknown (for example A[B[i]]) then we may |
4834 | // read a few words, modify, and write a few words, and some of the |
4835 | // words may be written to the same address. |
4836 | bool IsReadOnlyPtr = false; |
4837 | if (Seen.insert(Ptr).second || |
4838 | !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) { |
4839 | ++NumReads; |
4840 | IsReadOnlyPtr = true; |
4841 | } |
4842 | |
4843 | AliasAnalysis::Location Loc = AA->getLocation(LD); |
4844 | // The TBAA metadata could have a control dependency on the predication |
4845 | // condition, so we cannot rely on it when determining whether or not we |
4846 | // need runtime pointer checks. |
4847 | if (blockNeedsPredication(LD->getParent())) |
4848 | Loc.AATags.TBAA = nullptr; |
4849 | |
4850 | Accesses.addLoad(Loc, IsReadOnlyPtr); |
4851 | } |
4852 | |
4853 | // If we write (or read-write) to a single destination and there are no |
4854 | // other reads in this loop then is it safe to vectorize. |
4855 | if (NumReadWrites == 1 && NumReads == 0) { |
4856 | DEBUG(dbgs() << "LV: Found a write-only loop!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found a write-only loop!\n" ; } } while (0); |
4857 | return true; |
4858 | } |
4859 | |
4860 | // Build dependence sets and check whether we need a runtime pointer bounds |
4861 | // check. |
4862 | Accesses.buildDependenceSets(); |
4863 | bool NeedRTCheck = Accesses.isRTCheckNeeded(); |
4864 | |
4865 | // Find pointers with computable bounds. We are going to use this information |
4866 | // to place a runtime bound check. |
4867 | unsigned NumComparisons = 0; |
4868 | bool CanDoRT = false; |
4869 | if (NeedRTCheck) |
4870 | CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop, |
4871 | Strides); |
4872 | |
4873 | DEBUG(dbgs() << "LV: We need to do " << NumComparisons <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We need to do " << NumComparisons << " pointer comparisons.\n"; } } while (0) |
4874 | " pointer comparisons.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We need to do " << NumComparisons << " pointer comparisons.\n"; } } while (0); |
4875 | |
4876 | // If we only have one set of dependences to check pointers among we don't |
4877 | // need a runtime check. |
4878 | if (NumComparisons == 0 && NeedRTCheck) |
4879 | NeedRTCheck = false; |
4880 | |
4881 | // Check that we did not collect too many pointers or found an unsizeable |
4882 | // pointer. |
4883 | if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { |
4884 | PtrRtCheck.reset(); |
4885 | CanDoRT = false; |
4886 | } |
4887 | |
4888 | if (CanDoRT) { |
4889 | DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can perform a memory runtime check if needed.\n" ; } } while (0); |
4890 | } |
4891 | |
4892 | if (NeedRTCheck && !CanDoRT) { |
4893 | emitAnalysis(Report() << "cannot identify array bounds"); |
4894 | DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can't vectorize because we can't find " << "the array bounds.\n"; } } while (0) |
4895 | "the array bounds.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We can't vectorize because we can't find " << "the array bounds.\n"; } } while (0); |
4896 | PtrRtCheck.reset(); |
4897 | return false; |
4898 | } |
4899 | |
4900 | PtrRtCheck.Need = NeedRTCheck; |
4901 | |
4902 | bool CanVecMem = true; |
4903 | if (Accesses.isDependencyCheckNeeded()) { |
4904 | DEBUG(dbgs() << "LV: Checking memory dependencies\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Checking memory dependencies\n" ; } } while (0); |
4905 | CanVecMem = DepChecker.areDepsSafe( |
4906 | DependentAccesses, Accesses.getDependenciesToCheck(), Strides); |
4907 | MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes(); |
4908 | |
4909 | if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) { |
4910 | DEBUG(dbgs() << "LV: Retrying with memory checks\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Retrying with memory checks\n" ; } } while (0); |
4911 | NeedRTCheck = true; |
4912 | |
4913 | // Clear the dependency checks. We assume they are not needed. |
4914 | Accesses.resetDepChecks(); |
4915 | |
4916 | PtrRtCheck.reset(); |
4917 | PtrRtCheck.Need = true; |
4918 | |
4919 | CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, |
4920 | TheLoop, Strides, true); |
4921 | // Check that we did not collect too many pointers or found an unsizeable |
4922 | // pointer. |
4923 | if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { |
4924 | if (!CanDoRT && NumComparisons > 0) |
4925 | emitAnalysis(Report() |
4926 | << "cannot check memory dependencies at runtime"); |
4927 | else |
4928 | emitAnalysis(Report() |
4929 | << NumComparisons << " exceeds limit of " |
4930 | << RuntimeMemoryCheckThreshold |
4931 | << " dependent memory operations checked at runtime"); |
4932 | DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Can't vectorize with memory checks\n" ; } } while (0); |
4933 | PtrRtCheck.reset(); |
4934 | return false; |
4935 | } |
4936 | |
4937 | CanVecMem = true; |
4938 | } |
4939 | } |
4940 | |
4941 | if (!CanVecMem) |
4942 | emitAnalysis(Report() << "unsafe dependent memory operations in loop"); |
4943 | |
4944 | DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") << " need a runtime memory check.\n"; } } while (0) |
4945 | " need a runtime memory check.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") << " need a runtime memory check.\n"; } } while (0); |
4946 | |
4947 | return CanVecMem; |
4948 | } |
4949 | |
4950 | static bool hasMultipleUsesOf(Instruction *I, |
4951 | SmallPtrSetImpl<Instruction *> &Insts) { |
4952 | unsigned NumUses = 0; |
4953 | for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) { |
4954 | if (Insts.count(dyn_cast<Instruction>(*Use))) |
4955 | ++NumUses; |
4956 | if (NumUses > 1) |
4957 | return true; |
4958 | } |
4959 | |
4960 | return false; |
4961 | } |
4962 | |
4963 | static bool areAllUsesIn(Instruction *I, SmallPtrSetImpl<Instruction *> &Set) { |
4964 | for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) |
4965 | if (!Set.count(dyn_cast<Instruction>(*Use))) |
4966 | return false; |
4967 | return true; |
4968 | } |
4969 | |
4970 | bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, |
4971 | ReductionKind Kind) { |
4972 | if (Phi->getNumIncomingValues() != 2) |
4973 | return false; |
4974 | |
4975 | // Reduction variables are only found in the loop header block. |
4976 | if (Phi->getParent() != TheLoop->getHeader()) |
4977 | return false; |
4978 | |
4979 | // Obtain the reduction start value from the value that comes from the loop |
4980 | // preheader. |
4981 | Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()); |
4982 | |
4983 | // ExitInstruction is the single value which is used outside the loop. |
4984 | // We only allow for a single reduction value to be used outside the loop. |
4985 | // This includes users of the reduction, variables (which form a cycle |
4986 | // which ends in the phi node). |
4987 | Instruction *ExitInstruction = nullptr; |
4988 | // Indicates that we found a reduction operation in our scan. |
4989 | bool FoundReduxOp = false; |
4990 | |
4991 | // We start with the PHI node and scan for all of the users of this |
4992 | // instruction. All users must be instructions that can be used as reduction |
4993 | // variables (such as ADD). We must have a single out-of-block user. The cycle |
4994 | // must include the original PHI. |
4995 | bool FoundStartPHI = false; |
4996 | |
4997 | // To recognize min/max patterns formed by a icmp select sequence, we store |
4998 | // the number of instruction we saw from the recognized min/max pattern, |
4999 | // to make sure we only see exactly the two instructions. |
5000 | unsigned NumCmpSelectPatternInst = 0; |
5001 | ReductionInstDesc ReduxDesc(false, nullptr); |
5002 | |
5003 | SmallPtrSet<Instruction *, 8> VisitedInsts; |
5004 | SmallVector<Instruction *, 8> Worklist; |
5005 | Worklist.push_back(Phi); |
5006 | VisitedInsts.insert(Phi); |
5007 | |
5008 | // A value in the reduction can be used: |
5009 | // - By the reduction: |
5010 | // - Reduction operation: |
5011 | // - One use of reduction value (safe). |
5012 | // - Multiple use of reduction value (not safe). |
5013 | // - PHI: |
5014 | // - All uses of the PHI must be the reduction (safe). |
5015 | // - Otherwise, not safe. |
5016 | // - By one instruction outside of the loop (safe). |
5017 | // - By further instructions outside of the loop (not safe). |
5018 | // - By an instruction that is not part of the reduction (not safe). |
5019 | // This is either: |
5020 | // * An instruction type other than PHI or the reduction operation. |
5021 | // * A PHI in the header other than the initial PHI. |
5022 | while (!Worklist.empty()) { |
5023 | Instruction *Cur = Worklist.back(); |
5024 | Worklist.pop_back(); |
5025 | |
5026 | // No Users. |
5027 | // If the instruction has no users then this is a broken chain and can't be |
5028 | // a reduction variable. |
5029 | if (Cur->use_empty()) |
5030 | return false; |
5031 | |
5032 | bool IsAPhi = isa<PHINode>(Cur); |
5033 | |
5034 | // A header PHI use other than the original PHI. |
5035 | if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent()) |
5036 | return false; |
5037 | |
5038 | // Reductions of instructions such as Div, and Sub is only possible if the |
5039 | // LHS is the reduction variable. |
5040 | if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) && |
5041 | !isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) && |
5042 | !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0)))) |
5043 | return false; |
5044 | |
5045 | // Any reduction instruction must be of one of the allowed kinds. |
5046 | ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc); |
5047 | if (!ReduxDesc.IsReduction) |
5048 | return false; |
5049 | |
5050 | // A reduction operation must only have one use of the reduction value. |
5051 | if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax && |
5052 | hasMultipleUsesOf(Cur, VisitedInsts)) |
5053 | return false; |
5054 | |
5055 | // All inputs to a PHI node must be a reduction value. |
5056 | if(IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts)) |
5057 | return false; |
5058 | |
5059 | if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(Cur) || |
5060 | isa<SelectInst>(Cur))) |
5061 | ++NumCmpSelectPatternInst; |
5062 | if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) || |
5063 | isa<SelectInst>(Cur))) |
5064 | ++NumCmpSelectPatternInst; |
5065 | |
5066 | // Check whether we found a reduction operator. |
5067 | FoundReduxOp |= !IsAPhi; |
5068 | |
5069 | // Process users of current instruction. Push non-PHI nodes after PHI nodes |
5070 | // onto the stack. This way we are going to have seen all inputs to PHI |
5071 | // nodes once we get to them. |
5072 | SmallVector<Instruction *, 8> NonPHIs; |
5073 | SmallVector<Instruction *, 8> PHIs; |
5074 | for (User *U : Cur->users()) { |
5075 | Instruction *UI = cast<Instruction>(U); |
5076 | |
5077 | // Check if we found the exit user. |
5078 | BasicBlock *Parent = UI->getParent(); |
5079 | if (!TheLoop->contains(Parent)) { |
5080 | // Exit if you find multiple outside users or if the header phi node is |
5081 | // being used. In this case the user uses the value of the previous |
5082 | // iteration, in which case we would loose "VF-1" iterations of the |
5083 | // reduction operation if we vectorize. |
5084 | if (ExitInstruction != nullptr || Cur == Phi) |
5085 | return false; |
5086 | |
5087 | // The instruction used by an outside user must be the last instruction |
5088 | // before we feed back to the reduction phi. Otherwise, we loose VF-1 |
5089 | // operations on the value. |
5090 | if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end()) |
5091 | return false; |
5092 | |
5093 | ExitInstruction = Cur; |
5094 | continue; |
5095 | } |
5096 | |
5097 | // Process instructions only once (termination). Each reduction cycle |
5098 | // value must only be used once, except by phi nodes and min/max |
5099 | // reductions which are represented as a cmp followed by a select. |
5100 | ReductionInstDesc IgnoredVal(false, nullptr); |
5101 | if (VisitedInsts.insert(UI).second) { |
5102 | if (isa<PHINode>(UI)) |
5103 | PHIs.push_back(UI); |
5104 | else |
5105 | NonPHIs.push_back(UI); |
5106 | } else if (!isa<PHINode>(UI) && |
5107 | ((!isa<FCmpInst>(UI) && |
5108 | !isa<ICmpInst>(UI) && |
5109 | !isa<SelectInst>(UI)) || |
5110 | !isMinMaxSelectCmpPattern(UI, IgnoredVal).IsReduction)) |
5111 | return false; |
5112 | |
5113 | // Remember that we completed the cycle. |
5114 | if (UI == Phi) |
5115 | FoundStartPHI = true; |
5116 | } |
5117 | Worklist.append(PHIs.begin(), PHIs.end()); |
5118 | Worklist.append(NonPHIs.begin(), NonPHIs.end()); |
5119 | } |
5120 | |
5121 | // This means we have seen one but not the other instruction of the |
5122 | // pattern or more than just a select and cmp. |
5123 | if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) && |
5124 | NumCmpSelectPatternInst != 2) |
5125 | return false; |
5126 | |
5127 | if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) |
5128 | return false; |
5129 | |
5130 | // We found a reduction var if we have reached the original phi node and we |
5131 | // only have a single instruction with out-of-loop users. |
5132 | |
5133 | // This instruction is allowed to have out-of-loop users. |
5134 | AllowedExit.insert(ExitInstruction); |
5135 | |
5136 | // Save the description of this reduction variable. |
5137 | ReductionDescriptor RD(RdxStart, ExitInstruction, Kind, |
5138 | ReduxDesc.MinMaxKind); |
5139 | Reductions[Phi] = RD; |
5140 | // We've ended the cycle. This is a reduction variable if we have an |
5141 | // outside user and it has a binary op. |
5142 | |
5143 | return true; |
5144 | } |
5145 | |
5146 | /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction |
5147 | /// pattern corresponding to a min(X, Y) or max(X, Y). |
5148 | LoopVectorizationLegality::ReductionInstDesc |
5149 | LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, |
5150 | ReductionInstDesc &Prev) { |
5151 | |
5152 | assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) &&(((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa< SelectInst>(I)) && "Expect a select instruction") ? static_cast<void> (0) : __assert_fail ("(isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) && \"Expect a select instruction\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 5153, __PRETTY_FUNCTION__)) |
5153 | "Expect a select instruction")(((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa< SelectInst>(I)) && "Expect a select instruction") ? static_cast<void> (0) : __assert_fail ("(isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) && \"Expect a select instruction\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 5153, __PRETTY_FUNCTION__)); |
5154 | Instruction *Cmp = nullptr; |
5155 | SelectInst *Select = nullptr; |
5156 | |
5157 | // We must handle the select(cmp()) as a single instruction. Advance to the |
5158 | // select. |
5159 | if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) { |
5160 | if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->user_begin()))) |
5161 | return ReductionInstDesc(false, I); |
5162 | return ReductionInstDesc(Select, Prev.MinMaxKind); |
5163 | } |
5164 | |
5165 | // Only handle single use cases for now. |
5166 | if (!(Select = dyn_cast<SelectInst>(I))) |
5167 | return ReductionInstDesc(false, I); |
5168 | if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) && |
5169 | !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0)))) |
5170 | return ReductionInstDesc(false, I); |
5171 | if (!Cmp->hasOneUse()) |
5172 | return ReductionInstDesc(false, I); |
5173 | |
5174 | Value *CmpLeft; |
5175 | Value *CmpRight; |
5176 | |
5177 | // Look for a min/max pattern. |
5178 | if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5179 | return ReductionInstDesc(Select, MRK_UIntMin); |
5180 | else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5181 | return ReductionInstDesc(Select, MRK_UIntMax); |
5182 | else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5183 | return ReductionInstDesc(Select, MRK_SIntMax); |
5184 | else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5185 | return ReductionInstDesc(Select, MRK_SIntMin); |
5186 | else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5187 | return ReductionInstDesc(Select, MRK_FloatMin); |
5188 | else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5189 | return ReductionInstDesc(Select, MRK_FloatMax); |
5190 | else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5191 | return ReductionInstDesc(Select, MRK_FloatMin); |
5192 | else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) |
5193 | return ReductionInstDesc(Select, MRK_FloatMax); |
5194 | |
5195 | return ReductionInstDesc(false, I); |
5196 | } |
5197 | |
5198 | LoopVectorizationLegality::ReductionInstDesc |
5199 | LoopVectorizationLegality::isReductionInstr(Instruction *I, |
5200 | ReductionKind Kind, |
5201 | ReductionInstDesc &Prev) { |
5202 | bool FP = I->getType()->isFloatingPointTy(); |
5203 | bool FastMath = FP && I->hasUnsafeAlgebra(); |
5204 | switch (I->getOpcode()) { |
5205 | default: |
5206 | return ReductionInstDesc(false, I); |
5207 | case Instruction::PHI: |
5208 | if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd && |
5209 | Kind != RK_FloatMinMax)) |
5210 | return ReductionInstDesc(false, I); |
5211 | return ReductionInstDesc(I, Prev.MinMaxKind); |
5212 | case Instruction::Sub: |
5213 | case Instruction::Add: |
5214 | return ReductionInstDesc(Kind == RK_IntegerAdd, I); |
5215 | case Instruction::Mul: |
5216 | return ReductionInstDesc(Kind == RK_IntegerMult, I); |
5217 | case Instruction::And: |
5218 | return ReductionInstDesc(Kind == RK_IntegerAnd, I); |
5219 | case Instruction::Or: |
5220 | return ReductionInstDesc(Kind == RK_IntegerOr, I); |
5221 | case Instruction::Xor: |
5222 | return ReductionInstDesc(Kind == RK_IntegerXor, I); |
5223 | case Instruction::FMul: |
5224 | return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I); |
5225 | case Instruction::FSub: |
5226 | case Instruction::FAdd: |
5227 | return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I); |
5228 | case Instruction::FCmp: |
5229 | case Instruction::ICmp: |
5230 | case Instruction::Select: |
5231 | if (Kind != RK_IntegerMinMax && |
5232 | (!HasFunNoNaNAttr || Kind != RK_FloatMinMax)) |
5233 | return ReductionInstDesc(false, I); |
5234 | return isMinMaxSelectCmpPattern(I, Prev); |
5235 | } |
5236 | } |
5237 | |
5238 | LoopVectorizationLegality::InductionKind |
5239 | LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { |
5240 | Type *PhiTy = Phi->getType(); |
5241 | // We only handle integer and pointer inductions variables. |
5242 | if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) |
5243 | return IK_NoInduction; |
5244 | |
5245 | // Check that the PHI is consecutive. |
5246 | const SCEV *PhiScev = SE->getSCEV(Phi); |
5247 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); |
5248 | if (!AR) { |
5249 | DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: PHI is not a poly recurrence.\n" ; } } while (0); |
5250 | return IK_NoInduction; |
5251 | } |
5252 | const SCEV *Step = AR->getStepRecurrence(*SE); |
5253 | |
5254 | // Integer inductions need to have a stride of one. |
5255 | if (PhiTy->isIntegerTy()) { |
5256 | if (Step->isOne()) |
5257 | return IK_IntInduction; |
5258 | if (Step->isAllOnesValue()) |
5259 | return IK_ReverseIntInduction; |
5260 | return IK_NoInduction; |
5261 | } |
5262 | |
5263 | // Calculate the pointer stride and check if it is consecutive. |
5264 | const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); |
5265 | if (!C) |
5266 | return IK_NoInduction; |
5267 | |
5268 | assert(PhiTy->isPointerTy() && "The PHI must be a pointer")((PhiTy->isPointerTy() && "The PHI must be a pointer" ) ? static_cast<void> (0) : __assert_fail ("PhiTy->isPointerTy() && \"The PHI must be a pointer\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 5268, __PRETTY_FUNCTION__)); |
5269 | Type *PointerElementType = PhiTy->getPointerElementType(); |
5270 | // The pointer stride cannot be determined if the pointer element type is not |
5271 | // sized. |
5272 | if (!PointerElementType->isSized()) |
5273 | return IK_NoInduction; |
5274 | |
5275 | uint64_t Size = DL->getTypeAllocSize(PointerElementType); |
5276 | if (C->getValue()->equalsInt(Size)) |
5277 | return IK_PtrInduction; |
5278 | else if (C->getValue()->equalsInt(0 - Size)) |
5279 | return IK_ReversePtrInduction; |
5280 | |
5281 | return IK_NoInduction; |
5282 | } |
5283 | |
5284 | bool LoopVectorizationLegality::isInductionVariable(const Value *V) { |
5285 | Value *In0 = const_cast<Value*>(V); |
5286 | PHINode *PN = dyn_cast_or_null<PHINode>(In0); |
5287 | if (!PN) |
5288 | return false; |
5289 | |
5290 | return Inductions.count(PN); |
5291 | } |
5292 | |
5293 | bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { |
5294 | assert(TheLoop->contains(BB) && "Unknown block used")((TheLoop->contains(BB) && "Unknown block used") ? static_cast<void> (0) : __assert_fail ("TheLoop->contains(BB) && \"Unknown block used\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 5294, __PRETTY_FUNCTION__)); |
5295 | |
5296 | // Blocks that do not dominate the latch need predication. |
5297 | BasicBlock* Latch = TheLoop->getLoopLatch(); |
5298 | return !DT->dominates(BB, Latch); |
5299 | } |
5300 | |
5301 | bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, |
5302 | SmallPtrSetImpl<Value *> &SafePtrs) { |
5303 | for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { |
5304 | // We might be able to hoist the load. |
5305 | if (it->mayReadFromMemory()) { |
5306 | LoadInst *LI = dyn_cast<LoadInst>(it); |
5307 | if (!LI || !SafePtrs.count(LI->getPointerOperand())) |
5308 | return false; |
5309 | } |
5310 | |
5311 | // We don't predicate stores at the moment. |
5312 | if (it->mayWriteToMemory()) { |
5313 | StoreInst *SI = dyn_cast<StoreInst>(it); |
5314 | // We only support predication of stores in basic blocks with one |
5315 | // predecessor. |
5316 | if (!SI || ++NumPredStores > NumberOfStoresToPredicate || |
5317 | !SafePtrs.count(SI->getPointerOperand()) || |
5318 | !SI->getParent()->getSinglePredecessor()) |
5319 | return false; |
5320 | } |
5321 | if (it->mayThrow()) |
5322 | return false; |
5323 | |
5324 | // Check that we don't have a constant expression that can trap as operand. |
5325 | for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end(); |
5326 | OI != OE; ++OI) { |
5327 | if (Constant *C = dyn_cast<Constant>(*OI)) |
5328 | if (C->canTrap()) |
5329 | return false; |
5330 | } |
5331 | |
5332 | // The instructions below can trap. |
5333 | switch (it->getOpcode()) { |
5334 | default: continue; |
5335 | case Instruction::UDiv: |
5336 | case Instruction::SDiv: |
5337 | case Instruction::URem: |
5338 | case Instruction::SRem: |
5339 | return false; |
5340 | } |
5341 | } |
5342 | |
5343 | return true; |
5344 | } |
5345 | |
5346 | LoopVectorizationCostModel::VectorizationFactor |
5347 | LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { |
5348 | // Width 1 means no vectorize |
5349 | VectorizationFactor Factor = { 1U, 0U }; |
5350 | if (OptForSize && Legal->getRuntimePointerCheck()->Need) { |
5351 | emitAnalysis(Report() << "runtime pointer checks needed. Enable vectorization of this loop with '#pragma clang loop vectorize(enable)' when compiling with -Os"); |
5352 | DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n" ; } } while (0); |
5353 | return Factor; |
5354 | } |
5355 | |
5356 | if (!EnableCondStoresVectorization && Legal->NumPredStores) { |
5357 | emitAnalysis(Report() << "store that is conditionally executed prevents vectorization"); |
5358 | DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: No vectorization. There are conditional stores.\n" ; } } while (0); |
5359 | return Factor; |
5360 | } |
5361 | |
5362 | // Find the trip count. |
5363 | unsigned TC = SE->getSmallConstantTripCount(TheLoop); |
5364 | DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found trip count: " << TC << '\n'; } } while (0); |
5365 | |
5366 | unsigned WidestType = getWidestType(); |
5367 | unsigned WidestRegister = TTI.getRegisterBitWidth(true); |
5368 | unsigned MaxSafeDepDist = -1U; |
5369 | if (Legal->getMaxSafeDepDistBytes() != -1U) |
5370 | MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; |
5371 | WidestRegister = ((WidestRegister < MaxSafeDepDist) ? |
5372 | WidestRegister : MaxSafeDepDist); |
5373 | unsigned MaxVectorSize = WidestRegister / WidestType; |
5374 | DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"; } } while (0); |
5375 | DEBUG(dbgs() << "LV: The Widest register is: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: The Widest register is: " << WidestRegister << " bits.\n"; } } while (0) |
5376 | << WidestRegister << " bits.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: The Widest register is: " << WidestRegister << " bits.\n"; } } while (0); |
5377 | |
5378 | if (MaxVectorSize == 0) { |
5379 | DEBUG(dbgs() << "LV: The target has no vector registers.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: The target has no vector registers.\n" ; } } while (0); |
5380 | MaxVectorSize = 1; |
5381 | } |
5382 | |
5383 | assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements"((MaxVectorSize <= 32 && "Did not expect to pack so many elements" " into one vector!") ? static_cast<void> (0) : __assert_fail ("MaxVectorSize <= 32 && \"Did not expect to pack so many elements\" \" into one vector!\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 5384, __PRETTY_FUNCTION__)) |
5384 | " into one vector!")((MaxVectorSize <= 32 && "Did not expect to pack so many elements" " into one vector!") ? static_cast<void> (0) : __assert_fail ("MaxVectorSize <= 32 && \"Did not expect to pack so many elements\" \" into one vector!\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 5384, __PRETTY_FUNCTION__)); |
5385 | |
5386 | unsigned VF = MaxVectorSize; |
5387 | |
5388 | // If we optimize the program for size, avoid creating the tail loop. |
5389 | if (OptForSize) { |
5390 | // If we are unable to calculate the trip count then don't try to vectorize. |
5391 | if (TC < 2) { |
5392 | emitAnalysis(Report() << "unable to calculate the loop count due to complex control flow"); |
5393 | DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Aborting. A tail loop is required in Os.\n" ; } } while (0); |
5394 | return Factor; |
5395 | } |
5396 | |
5397 | // Find the maximum SIMD width that can fit within the trip count. |
5398 | VF = TC % MaxVectorSize; |
5399 | |
5400 | if (VF == 0) |
5401 | VF = MaxVectorSize; |
5402 | |
5403 | // If the trip count that we found modulo the vectorization factor is not |
5404 | // zero then we require a tail. |
5405 | if (VF < 2) { |
5406 | emitAnalysis(Report() << "cannot optimize for size and vectorize at the " |
5407 | "same time. Enable vectorization of this loop " |
5408 | "with '#pragma clang loop vectorize(enable)' " |
5409 | "when compiling with -Os"); |
5410 | DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Aborting. A tail loop is required in Os.\n" ; } } while (0); |
5411 | return Factor; |
5412 | } |
5413 | } |
5414 | |
5415 | int UserVF = Hints->getWidth(); |
5416 | if (UserVF != 0) { |
5417 | assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two")((isPowerOf2_32(UserVF) && "VF needs to be a power of two" ) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(UserVF) && \"VF needs to be a power of two\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 5417, __PRETTY_FUNCTION__)); |
5418 | DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Using user VF " << UserVF << ".\n"; } } while (0); |
5419 | |
5420 | Factor.Width = UserVF; |
5421 | return Factor; |
5422 | } |
5423 | |
5424 | float Cost = expectedCost(1); |
5425 | #ifndef NDEBUG |
5426 | const float ScalarCost = Cost; |
5427 | #endif /* NDEBUG */ |
5428 | unsigned Width = 1; |
5429 | DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"; } } while (0); |
5430 | |
5431 | bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; |
5432 | // Ignore scalar width, because the user explicitly wants vectorization. |
5433 | if (ForceVectorization && VF > 1) { |
5434 | Width = 2; |
5435 | Cost = expectedCost(Width) / (float)Width; |
5436 | } |
5437 | |
5438 | for (unsigned i=2; i <= VF; i*=2) { |
5439 | // Notice that the vector loop needs to be executed less times, so |
5440 | // we need to divide the cost of the vector loops by the width of |
5441 | // the vector elements. |
5442 | float VectorCost = expectedCost(i) / (float)i; |
5443 | DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Vector loop of width " << i << " costs: " << (int)VectorCost << ".\n"; } } while (0) |
5444 | (int)VectorCost << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Vector loop of width " << i << " costs: " << (int)VectorCost << ".\n"; } } while (0); |
5445 | if (VectorCost < Cost) { |
5446 | Cost = VectorCost; |
5447 | Width = i; |
5448 | } |
5449 | } |
5450 | |
5451 | DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"; } } while (0) |
5452 | << "LV: Vectorization seems to be not beneficial, "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"; } } while (0) |
5453 | << "but was forced by a user.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"; } } while (0); |
5454 | DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Selecting VF: "<< Width << ".\n"; } } while (0); |
5455 | Factor.Width = Width; |
5456 | Factor.Cost = Width * Cost; |
5457 | return Factor; |
5458 | } |
5459 | |
5460 | unsigned LoopVectorizationCostModel::getWidestType() { |
5461 | unsigned MaxWidth = 8; |
5462 | |
5463 | // For each block. |
5464 | for (Loop::block_iterator bb = TheLoop->block_begin(), |
5465 | be = TheLoop->block_end(); bb != be; ++bb) { |
5466 | BasicBlock *BB = *bb; |
5467 | |
5468 | // For each instruction in the loop. |
5469 | for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { |
5470 | Type *T = it->getType(); |
5471 | |
5472 | // Ignore ephemeral values. |
5473 | if (EphValues.count(it)) |
5474 | continue; |
5475 | |
5476 | // Only examine Loads, Stores and PHINodes. |
5477 | if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it)) |
5478 | continue; |
5479 | |
5480 | // Examine PHI nodes that are reduction variables. |
5481 | if (PHINode *PN = dyn_cast<PHINode>(it)) |
5482 | if (!Legal->getReductionVars()->count(PN)) |
5483 | continue; |
5484 | |
5485 | // Examine the stored values. |
5486 | if (StoreInst *ST = dyn_cast<StoreInst>(it)) |
5487 | T = ST->getValueOperand()->getType(); |
5488 | |
5489 | // Ignore loaded pointer types and stored pointer types that are not |
5490 | // consecutive. However, we do want to take consecutive stores/loads of |
5491 | // pointer vectors into account. |
5492 | if (T->isPointerTy() && !isConsecutiveLoadOrStore(it)) |
5493 | continue; |
5494 | |
5495 | MaxWidth = std::max(MaxWidth, |
5496 | (unsigned)DL->getTypeSizeInBits(T->getScalarType())); |
5497 | } |
5498 | } |
5499 | |
5500 | return MaxWidth; |
5501 | } |
5502 | |
5503 | unsigned |
5504 | LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, |
5505 | unsigned VF, |
5506 | unsigned LoopCost) { |
5507 | |
5508 | // -- The unroll heuristics -- |
5509 | // We unroll the loop in order to expose ILP and reduce the loop overhead. |
5510 | // There are many micro-architectural considerations that we can't predict |
5511 | // at this level. For example, frontend pressure (on decode or fetch) due to |
5512 | // code size, or the number and capabilities of the execution ports. |
5513 | // |
5514 | // We use the following heuristics to select the unroll factor: |
5515 | // 1. If the code has reductions, then we unroll in order to break the cross |
5516 | // iteration dependency. |
5517 | // 2. If the loop is really small, then we unroll in order to reduce the loop |
5518 | // overhead. |
5519 | // 3. We don't unroll if we think that we will spill registers to memory due |
5520 | // to the increased register pressure. |
5521 | |
5522 | // Use the user preference, unless 'auto' is selected. |
5523 | int UserUF = Hints->getInterleave(); |
5524 | if (UserUF != 0) |
5525 | return UserUF; |
5526 | |
5527 | // When we optimize for size, we don't unroll. |
5528 | if (OptForSize) |
5529 | return 1; |
5530 | |
5531 | // We used the distance for the unroll factor. |
5532 | if (Legal->getMaxSafeDepDistBytes() != -1U) |
5533 | return 1; |
5534 | |
5535 | // Do not unroll loops with a relatively small trip count. |
5536 | unsigned TC = SE->getSmallConstantTripCount(TheLoop); |
5537 | if (TC > 1 && TC < TinyTripCountUnrollThreshold) |
5538 | return 1; |
5539 | |
5540 | unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); |
5541 | DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: The target has " << TargetNumRegisters << " registers\n"; } } while (0) |
5542 | " registers\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: The target has " << TargetNumRegisters << " registers\n"; } } while (0); |
5543 | |
5544 | if (VF == 1) { |
5545 | if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) |
5546 | TargetNumRegisters = ForceTargetNumScalarRegs; |
5547 | } else { |
5548 | if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) |
5549 | TargetNumRegisters = ForceTargetNumVectorRegs; |
5550 | } |
5551 | |
5552 | LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage(); |
5553 | // We divide by these constants so assume that we have at least one |
5554 | // instruction that uses at least one register. |
5555 | R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); |
5556 | R.NumInstructions = std::max(R.NumInstructions, 1U); |
5557 | |
5558 | // We calculate the unroll factor using the following formula. |
5559 | // Subtract the number of loop invariants from the number of available |
5560 | // registers. These registers are used by all of the unrolled instances. |
5561 | // Next, divide the remaining registers by the number of registers that is |
5562 | // required by the loop, in order to estimate how many parallel instances |
5563 | // fit without causing spills. All of this is rounded down if necessary to be |
5564 | // a power of two. We want power of two unroll factors to simplify any |
5565 | // addressing operations or alignment considerations. |
5566 | unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / |
5567 | R.MaxLocalUsers); |
5568 | |
5569 | // Don't count the induction variable as unrolled. |
5570 | if (EnableIndVarRegisterHeur) |
5571 | UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / |
5572 | std::max(1U, (R.MaxLocalUsers - 1))); |
5573 | |
5574 | // Clamp the unroll factor ranges to reasonable factors. |
5575 | unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor(); |
5576 | |
5577 | // Check if the user has overridden the unroll max. |
5578 | if (VF == 1) { |
5579 | if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) |
5580 | MaxInterleaveSize = ForceTargetMaxScalarInterleaveFactor; |
5581 | } else { |
5582 | if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) |
5583 | MaxInterleaveSize = ForceTargetMaxVectorInterleaveFactor; |
5584 | } |
5585 | |
5586 | // If we did not calculate the cost for VF (because the user selected the VF) |
5587 | // then we calculate the cost of VF here. |
5588 | if (LoopCost == 0) |
5589 | LoopCost = expectedCost(VF); |
5590 | |
5591 | // Clamp the calculated UF to be between the 1 and the max unroll factor |
5592 | // that the target allows. |
5593 | if (UF > MaxInterleaveSize) |
5594 | UF = MaxInterleaveSize; |
5595 | else if (UF < 1) |
5596 | UF = 1; |
5597 | |
5598 | // Unroll if we vectorized this loop and there is a reduction that could |
5599 | // benefit from unrolling. |
5600 | if (VF > 1 && Legal->getReductionVars()->size()) { |
5601 | DEBUG(dbgs() << "LV: Unrolling because of reductions.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Unrolling because of reductions.\n" ; } } while (0); |
5602 | return UF; |
5603 | } |
5604 | |
5605 | // Note that if we've already vectorized the loop we will have done the |
5606 | // runtime check and so unrolling won't require further checks. |
5607 | bool UnrollingRequiresRuntimePointerCheck = |
5608 | (VF == 1 && Legal->getRuntimePointerCheck()->Need); |
5609 | |
5610 | // We want to unroll small loops in order to reduce the loop overhead and |
5611 | // potentially expose ILP opportunities. |
5612 | DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Loop cost is " << LoopCost << '\n'; } } while (0); |
5613 | if (!UnrollingRequiresRuntimePointerCheck && |
5614 | LoopCost < SmallLoopCost) { |
5615 | // We assume that the cost overhead is 1 and we use the cost model |
5616 | // to estimate the cost of the loop and unroll until the cost of the |
5617 | // loop overhead is about 5% of the cost of the loop. |
5618 | unsigned SmallUF = std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); |
5619 | |
5620 | // Unroll until store/load ports (estimated by max unroll factor) are |
5621 | // saturated. |
5622 | unsigned StoresUF = UF / (Legal->NumStores ? Legal->NumStores : 1); |
5623 | unsigned LoadsUF = UF / (Legal->NumLoads ? Legal->NumLoads : 1); |
5624 | |
5625 | // If we have a scalar reduction (vector reductions are already dealt with |
5626 | // by this point), we can increase the critical path length if the loop |
5627 | // we're unrolling is inside another loop. Limit, by default to 2, so the |
5628 | // critical path only gets increased by one reduction operation. |
5629 | if (Legal->getReductionVars()->size() && |
5630 | TheLoop->getLoopDepth() > 1) { |
5631 | unsigned F = static_cast<unsigned>(MaxNestedScalarReductionUF); |
5632 | SmallUF = std::min(SmallUF, F); |
5633 | StoresUF = std::min(StoresUF, F); |
5634 | LoadsUF = std::min(LoadsUF, F); |
5635 | } |
5636 | |
5637 | if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) { |
5638 | DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Unrolling to saturate store or load ports.\n" ; } } while (0); |
5639 | return std::max(StoresUF, LoadsUF); |
5640 | } |
5641 | |
5642 | DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Unrolling to reduce branch cost.\n" ; } } while (0); |
5643 | return SmallUF; |
5644 | } |
5645 | |
5646 | DEBUG(dbgs() << "LV: Not Unrolling.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Not Unrolling.\n"; } } while (0); |
5647 | return 1; |
5648 | } |
5649 | |
5650 | LoopVectorizationCostModel::RegisterUsage |
5651 | LoopVectorizationCostModel::calculateRegisterUsage() { |
5652 | // This function calculates the register usage by measuring the highest number |
5653 | // of values that are alive at a single location. Obviously, this is a very |
5654 | // rough estimation. We scan the loop in a topological order in order and |
5655 | // assign a number to each instruction. We use RPO to ensure that defs are |
5656 | // met before their users. We assume that each instruction that has in-loop |
5657 | // users starts an interval. We record every time that an in-loop value is |
5658 | // used, so we have a list of the first and last occurrences of each |
5659 | // instruction. Next, we transpose this data structure into a multi map that |
5660 | // holds the list of intervals that *end* at a specific location. This multi |
5661 | // map allows us to perform a linear search. We scan the instructions linearly |
5662 | // and record each time that a new interval starts, by placing it in a set. |
5663 | // If we find this value in the multi-map then we remove it from the set. |
5664 | // The max register usage is the maximum size of the set. |
5665 | // We also search for instructions that are defined outside the loop, but are |
5666 | // used inside the loop. We need this number separately from the max-interval |
5667 | // usage number because when we unroll, loop-invariant values do not take |
5668 | // more register. |
5669 | LoopBlocksDFS DFS(TheLoop); |
5670 | DFS.perform(LI); |
5671 | |
5672 | RegisterUsage R; |
5673 | R.NumInstructions = 0; |
5674 | |
5675 | // Each 'key' in the map opens a new interval. The values |
5676 | // of the map are the index of the 'last seen' usage of the |
5677 | // instruction that is the key. |
5678 | typedef DenseMap<Instruction*, unsigned> IntervalMap; |
5679 | // Maps instruction to its index. |
5680 | DenseMap<unsigned, Instruction*> IdxToInstr; |
5681 | // Marks the end of each interval. |
5682 | IntervalMap EndPoint; |
5683 | // Saves the list of instruction indices that are used in the loop. |
5684 | SmallSet<Instruction*, 8> Ends; |
5685 | // Saves the list of values that are used in the loop but are |
5686 | // defined outside the loop, such as arguments and constants. |
5687 | SmallPtrSet<Value*, 8> LoopInvariants; |
5688 | |
5689 | unsigned Index = 0; |
5690 | for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), |
5691 | be = DFS.endRPO(); bb != be; ++bb) { |
5692 | R.NumInstructions += (*bb)->size(); |
5693 | for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; |
5694 | ++it) { |
5695 | Instruction *I = it; |
5696 | IdxToInstr[Index++] = I; |
5697 | |
5698 | // Save the end location of each USE. |
5699 | for (unsigned i = 0; i < I->getNumOperands(); ++i) { |
5700 | Value *U = I->getOperand(i); |
5701 | Instruction *Instr = dyn_cast<Instruction>(U); |
5702 | |
5703 | // Ignore non-instruction values such as arguments, constants, etc. |
5704 | if (!Instr) continue; |
5705 | |
5706 | // If this instruction is outside the loop then record it and continue. |
5707 | if (!TheLoop->contains(Instr)) { |
5708 | LoopInvariants.insert(Instr); |
5709 | continue; |
5710 | } |
5711 | |
5712 | // Overwrite previous end points. |
5713 | EndPoint[Instr] = Index; |
5714 | Ends.insert(Instr); |
5715 | } |
5716 | } |
5717 | } |
5718 | |
5719 | // Saves the list of intervals that end with the index in 'key'. |
5720 | typedef SmallVector<Instruction*, 2> InstrList; |
5721 | DenseMap<unsigned, InstrList> TransposeEnds; |
5722 | |
5723 | // Transpose the EndPoints to a list of values that end at each index. |
5724 | for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end(); |
5725 | it != e; ++it) |
5726 | TransposeEnds[it->second].push_back(it->first); |
5727 | |
5728 | SmallSet<Instruction*, 8> OpenIntervals; |
5729 | unsigned MaxUsage = 0; |
5730 | |
5731 | |
5732 | DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV(REG): Calculating max register usage:\n" ; } } while (0); |
5733 | for (unsigned int i = 0; i < Index; ++i) { |
5734 | Instruction *I = IdxToInstr[i]; |
5735 | // Ignore instructions that are never used within the loop. |
5736 | if (!Ends.count(I)) continue; |
5737 | |
5738 | // Ignore ephemeral values. |
5739 | if (EphValues.count(I)) |
5740 | continue; |
5741 | |
5742 | // Remove all of the instructions that end at this location. |
5743 | InstrList &List = TransposeEnds[i]; |
5744 | for (unsigned int j=0, e = List.size(); j < e; ++j) |
5745 | OpenIntervals.erase(List[j]); |
5746 | |
5747 | // Count the number of live interals. |
5748 | MaxUsage = std::max(MaxUsage, OpenIntervals.size()); |
5749 | |
5750 | DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV(REG): At #" << i << " Interval # " << OpenIntervals.size() << '\n'; } } while (0) |
5751 | OpenIntervals.size() << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV(REG): At #" << i << " Interval # " << OpenIntervals.size() << '\n'; } } while (0); |
5752 | |
5753 | // Add the current instruction to the list of open intervals. |
5754 | OpenIntervals.insert(I); |
5755 | } |
5756 | |
5757 | unsigned Invariant = LoopInvariants.size(); |
5758 | DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n'; } } while (0); |
5759 | DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'; } } while (0); |
5760 | DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n'; } } while (0); |
5761 | |
5762 | R.LoopInvariantRegs = Invariant; |
5763 | R.MaxLocalUsers = MaxUsage; |
5764 | return R; |
5765 | } |
5766 | |
5767 | unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { |
5768 | unsigned Cost = 0; |
5769 | |
5770 | // For each block. |
5771 | for (Loop::block_iterator bb = TheLoop->block_begin(), |
5772 | be = TheLoop->block_end(); bb != be; ++bb) { |
5773 | unsigned BlockCost = 0; |
5774 | BasicBlock *BB = *bb; |
5775 | |
5776 | // For each instruction in the old loop. |
5777 | for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { |
5778 | // Skip dbg intrinsics. |
5779 | if (isa<DbgInfoIntrinsic>(it)) |
5780 | continue; |
5781 | |
5782 | // Ignore ephemeral values. |
5783 | if (EphValues.count(it)) |
5784 | continue; |
5785 | |
5786 | unsigned C = getInstructionCost(it, VF); |
5787 | |
5788 | // Check if we should override the cost. |
5789 | if (ForceTargetInstructionCost.getNumOccurrences() > 0) |
5790 | C = ForceTargetInstructionCost; |
5791 | |
5792 | BlockCost += C; |
5793 | DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an estimated cost of " << C << " for VF " << VF << " For instruction: " << *it << '\n'; } } while (0) |
5794 | VF << " For instruction: " << *it << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-vectorize")) { dbgs() << "LV: Found an estimated cost of " << C << " for VF " << VF << " For instruction: " << *it << '\n'; } } while (0); |
5795 | } |
5796 | |
5797 | // We assume that if-converted blocks have a 50% chance of being executed. |
5798 | // When the code is scalar then some of the blocks are avoided due to CF. |
5799 | // When the code is vectorized we execute all code paths. |
5800 | if (VF == 1 && Legal->blockNeedsPredication(*bb)) |
5801 | BlockCost /= 2; |
5802 | |
5803 | Cost += BlockCost; |
5804 | } |
5805 | |
5806 | return Cost; |
5807 | } |
5808 | |
5809 | /// \brief Check whether the address computation for a non-consecutive memory |
5810 | /// access looks like an unlikely candidate for being merged into the indexing |
5811 | /// mode. |
5812 | /// |
5813 | /// We look for a GEP which has one index that is an induction variable and all |
5814 | /// other indices are loop invariant. If the stride of this access is also |
5815 | /// within a small bound we decide that this address computation can likely be |
5816 | /// merged into the addressing mode. |
5817 | /// In all other cases, we identify the address computation as complex. |
5818 | static bool isLikelyComplexAddressComputation(Value *Ptr, |
5819 | LoopVectorizationLegality *Legal, |
5820 | ScalarEvolution *SE, |
5821 | const Loop *TheLoop) { |
5822 | GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); |
5823 | if (!Gep) |
5824 | return true; |
5825 | |
5826 | // We are looking for a gep with all loop invariant indices except for one |
5827 | // which should be an induction variable. |
5828 | unsigned NumOperands = Gep->getNumOperands(); |
5829 | for (unsigned i = 1; i < NumOperands; ++i) { |
5830 | Value *Opd = Gep->getOperand(i); |
5831 | if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && |
5832 | !Legal->isInductionVariable(Opd)) |
5833 | return true; |
5834 | } |
5835 | |
5836 | // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step |
5837 | // can likely be merged into the address computation. |
5838 | unsigned MaxMergeDistance = 64; |
5839 | |
5840 | const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr)); |
5841 | if (!AddRec) |
5842 | return true; |
5843 | |
5844 | // Check the step is constant. |
5845 | const SCEV *Step = AddRec->getStepRecurrence(*SE); |
5846 | // Calculate the pointer stride and check if it is consecutive. |
5847 | const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); |
5848 | if (!C) |
5849 | return true; |
5850 | |
5851 | const APInt &APStepVal = C->getValue()->getValue(); |
5852 | |
5853 | // Huge step value - give up. |
5854 | if (APStepVal.getBitWidth() > 64) |
5855 | return true; |
5856 | |
5857 | int64_t StepVal = APStepVal.getSExtValue(); |
5858 | |
5859 | return StepVal > MaxMergeDistance; |
5860 | } |
5861 | |
5862 | static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { |
5863 | if (Legal->hasStride(I->getOperand(0)) || Legal->hasStride(I->getOperand(1))) |
5864 | return true; |
5865 | return false; |
5866 | } |
5867 | |
5868 | unsigned |
5869 | LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { |
5870 | // If we know that this instruction will remain uniform, check the cost of |
5871 | // the scalar version. |
5872 | if (Legal->isUniformAfterVectorization(I)) |
5873 | VF = 1; |
5874 | |
5875 | Type *RetTy = I->getType(); |
5876 | Type *VectorTy = ToVectorTy(RetTy, VF); |
5877 | |
5878 | // TODO: We need to estimate the cost of intrinsic calls. |
5879 | switch (I->getOpcode()) { |
5880 | case Instruction::GetElementPtr: |
5881 | // We mark this instruction as zero-cost because the cost of GEPs in |
5882 | // vectorized code depends on whether the corresponding memory instruction |
5883 | // is scalarized or not. Therefore, we handle GEPs with the memory |
5884 | // instruction cost. |
5885 | return 0; |
5886 | case Instruction::Br: { |
5887 | return TTI.getCFInstrCost(I->getOpcode()); |
5888 | } |
5889 | case Instruction::PHI: |
5890 | //TODO: IF-converted IFs become selects. |
5891 | return 0; |
5892 | case Instruction::Add: |
5893 | case Instruction::FAdd: |
5894 | case Instruction::Sub: |
5895 | case Instruction::FSub: |
5896 | case Instruction::Mul: |
5897 | case Instruction::FMul: |
5898 | case Instruction::UDiv: |
5899 | case Instruction::SDiv: |
5900 | case Instruction::FDiv: |
5901 | case Instruction::URem: |
5902 | case Instruction::SRem: |
5903 | case Instruction::FRem: |
5904 | case Instruction::Shl: |
5905 | case Instruction::LShr: |
5906 | case Instruction::AShr: |
5907 | case Instruction::And: |
5908 | case Instruction::Or: |
5909 | case Instruction::Xor: { |
5910 | // Since we will replace the stride by 1 the multiplication should go away. |
5911 | if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) |
5912 | return 0; |
5913 | // Certain instructions can be cheaper to vectorize if they have a constant |
5914 | // second vector operand. One example of this are shifts on x86. |
5915 | TargetTransformInfo::OperandValueKind Op1VK = |
5916 | TargetTransformInfo::OK_AnyValue; |
5917 | TargetTransformInfo::OperandValueKind Op2VK = |
5918 | TargetTransformInfo::OK_AnyValue; |
5919 | TargetTransformInfo::OperandValueProperties Op1VP = |
5920 | TargetTransformInfo::OP_None; |
5921 | TargetTransformInfo::OperandValueProperties Op2VP = |
5922 | TargetTransformInfo::OP_None; |
5923 | Value *Op2 = I->getOperand(1); |
5924 | |
5925 | // Check for a splat of a constant or for a non uniform vector of constants. |
5926 | if (isa<ConstantInt>(Op2)) { |
5927 | ConstantInt *CInt = cast<ConstantInt>(Op2); |
5928 | if (CInt && CInt->getValue().isPowerOf2()) |
5929 | Op2VP = TargetTransformInfo::OP_PowerOf2; |
5930 | Op2VK = TargetTransformInfo::OK_UniformConstantValue; |
5931 | } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) { |
5932 | Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; |
5933 | Constant *SplatValue = cast<Constant>(Op2)->getSplatValue(); |
5934 | if (SplatValue) { |
5935 | ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue); |
5936 | if (CInt && CInt->getValue().isPowerOf2()) |
5937 | Op2VP = TargetTransformInfo::OP_PowerOf2; |
5938 | Op2VK = TargetTransformInfo::OK_UniformConstantValue; |
5939 | } |
5940 | } |
5941 | |
5942 | return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK, |
5943 | Op1VP, Op2VP); |
5944 | } |
5945 | case Instruction::Select: { |
5946 | SelectInst *SI = cast<SelectInst>(I); |
5947 | const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); |
5948 | bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); |
5949 | Type *CondTy = SI->getCondition()->getType(); |
5950 | if (!ScalarCond) |
5951 | CondTy = VectorType::get(CondTy, VF); |
5952 | |
5953 | return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); |
5954 | } |
5955 | case Instruction::ICmp: |
5956 | case Instruction::FCmp: { |
5957 | Type *ValTy = I->getOperand(0)->getType(); |
5958 | VectorTy = ToVectorTy(ValTy, VF); |
5959 | return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); |
5960 | } |
5961 | case Instruction::Store: |
5962 | case Instruction::Load: { |
5963 | StoreInst *SI = dyn_cast<StoreInst>(I); |
5964 | LoadInst *LI = dyn_cast<LoadInst>(I); |
5965 | Type *ValTy = (SI ? SI->getValueOperand()->getType() : |
5966 | LI->getType()); |
5967 | VectorTy = ToVectorTy(ValTy, VF); |
5968 | |
5969 | unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment(); |
5970 | unsigned AS = SI ? SI->getPointerAddressSpace() : |
5971 | LI->getPointerAddressSpace(); |
5972 | Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand(); |
5973 | // We add the cost of address computation here instead of with the gep |
5974 | // instruction because only here we know whether the operation is |
5975 | // scalarized. |
5976 | if (VF == 1) |
5977 | return TTI.getAddressComputationCost(VectorTy) + |
5978 | TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); |
5979 | |
5980 | // Scalarized loads/stores. |
5981 | int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); |
5982 | bool Reverse = ConsecutiveStride < 0; |
5983 | unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy); |
5984 | unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF; |
5985 | if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) { |
5986 | bool IsComplexComputation = |
5987 | isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop); |
5988 | unsigned Cost = 0; |
5989 | // The cost of extracting from the value vector and pointer vector. |
5990 | Type *PtrTy = ToVectorTy(Ptr->getType(), VF); |
5991 | for (unsigned i = 0; i < VF; ++i) { |
5992 | // The cost of extracting the pointer operand. |
5993 | Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i); |
5994 | // In case of STORE, the cost of ExtractElement from the vector. |
5995 | // In case of LOAD, the cost of InsertElement into the returned |
5996 | // vector. |
5997 | Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement : |
5998 | Instruction::InsertElement, |
5999 | VectorTy, i); |
6000 | } |
6001 | |
6002 | // The cost of the scalar loads/stores. |
6003 | Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation); |
6004 | Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), |
6005 | Alignment, AS); |
6006 | return Cost; |
6007 | } |
6008 | |
6009 | // Wide load/stores. |
6010 | unsigned Cost = TTI.getAddressComputationCost(VectorTy); |
6011 | Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); |
6012 | |
6013 | if (Reverse) |
6014 | Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, |
6015 | VectorTy, 0); |
6016 | return Cost; |
6017 | } |
6018 | case Instruction::ZExt: |
6019 | case Instruction::SExt: |
6020 | case Instruction::FPToUI: |
6021 | case Instruction::FPToSI: |
6022 | case Instruction::FPExt: |
6023 | case Instruction::PtrToInt: |
6024 | case Instruction::IntToPtr: |
6025 | case Instruction::SIToFP: |
6026 | case Instruction::UIToFP: |
6027 | case Instruction::Trunc: |
6028 | case Instruction::FPTrunc: |
6029 | case Instruction::BitCast: { |
6030 | // We optimize the truncation of induction variable. |
6031 | // The cost of these is the same as the scalar operation. |
6032 | if (I->getOpcode() == Instruction::Trunc && |
6033 | Legal->isInductionVariable(I->getOperand(0))) |
6034 | return TTI.getCastInstrCost(I->getOpcode(), I->getType(), |
6035 | I->getOperand(0)->getType()); |
6036 | |
6037 | Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); |
6038 | return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); |
6039 | } |
6040 | case Instruction::Call: { |
6041 | CallInst *CI = cast<CallInst>(I); |
6042 | Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); |
6043 | assert(ID && "Not an intrinsic call!")((ID && "Not an intrinsic call!") ? static_cast<void > (0) : __assert_fail ("ID && \"Not an intrinsic call!\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6043, __PRETTY_FUNCTION__)); |
6044 | Type *RetTy = ToVectorTy(CI->getType(), VF); |
6045 | SmallVector<Type*, 4> Tys; |
6046 | for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) |
6047 | Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF)); |
6048 | return TTI.getIntrinsicInstrCost(ID, RetTy, Tys); |
6049 | } |
6050 | default: { |
6051 | // We are scalarizing the instruction. Return the cost of the scalar |
6052 | // instruction, plus the cost of insert and extract into vector |
6053 | // elements, times the vector width. |
6054 | unsigned Cost = 0; |
6055 | |
6056 | if (!RetTy->isVoidTy() && VF != 1) { |
6057 | unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement, |
6058 | VectorTy); |
6059 | unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement, |
6060 | VectorTy); |
6061 | |
6062 | // The cost of inserting the results plus extracting each one of the |
6063 | // operands. |
6064 | Cost += VF * (InsCost + ExtCost * I->getNumOperands()); |
6065 | } |
6066 | |
6067 | // The cost of executing VF copies of the scalar instruction. This opcode |
6068 | // is unknown. Assume that it is the same as 'mul'. |
6069 | Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy); |
6070 | return Cost; |
6071 | } |
6072 | }// end of switch. |
6073 | } |
6074 | |
6075 | Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { |
6076 | if (Scalar->isVoidTy() || VF == 1) |
6077 | return Scalar; |
6078 | return VectorType::get(Scalar, VF); |
6079 | } |
6080 | |
// Pass identification: the address of this member is what gets handed to the
// PassInfo created by INITIALIZE_PASS_END below.
char LoopVectorize::ID = 0;
// Human-readable pass name given to the registration macros.
static const char lv_name[] = "Loop Vectorization";
// Register the pass. The BEGIN/END pair opens and closes the generated
// initialization function; every dependency line between them initializes one
// analysis group or pass on the same registry before this pass is registered.
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
INITIALIZE_PASS_DEPENDENCY(LoopInfo)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6094 | |
6095 | namespace llvm { |
6096 | Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) { |
6097 | return new LoopVectorize(NoUnrolling, AlwaysVectorize); |
6098 | } |
6099 | } |
6100 | |
6101 | bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { |
6102 | // Check for a store. |
6103 | if (StoreInst *ST = dyn_cast<StoreInst>(Inst)) |
6104 | return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0; |
6105 | |
6106 | // Check for a load. |
6107 | if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) |
6108 | return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0; |
6109 | |
6110 | return false; |
6111 | } |
6112 | |
6113 | |
6114 | void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, |
6115 | bool IfPredicateStore) { |
6116 | assert(!Instr->getType()->isAggregateType() && "Can't handle vectors")((!Instr->getType()->isAggregateType() && "Can't handle vectors" ) ? static_cast<void> (0) : __assert_fail ("!Instr->getType()->isAggregateType() && \"Can't handle vectors\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6116, __PRETTY_FUNCTION__)); |
6117 | // Holds vector parameters or scalars, in case of uniform vals. |
6118 | SmallVector<VectorParts, 4> Params; |
6119 | |
6120 | setDebugLocFromInst(Builder, Instr); |
6121 | |
6122 | // Find all of the vectorized parameters. |
6123 | for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { |
6124 | Value *SrcOp = Instr->getOperand(op); |
6125 | |
6126 | // If we are accessing the old induction variable, use the new one. |
6127 | if (SrcOp == OldInduction) { |
6128 | Params.push_back(getVectorValue(SrcOp)); |
6129 | continue; |
6130 | } |
6131 | |
6132 | // Try using previously calculated values. |
6133 | Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); |
6134 | |
6135 | // If the src is an instruction that appeared earlier in the basic block |
6136 | // then it should already be vectorized. |
6137 | if (SrcInst && OrigLoop->contains(SrcInst)) { |
6138 | assert(WidenMap.has(SrcInst) && "Source operand is unavailable")((WidenMap.has(SrcInst) && "Source operand is unavailable" ) ? static_cast<void> (0) : __assert_fail ("WidenMap.has(SrcInst) && \"Source operand is unavailable\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6138, __PRETTY_FUNCTION__)); |
6139 | // The parameter is a vector value from earlier. |
6140 | Params.push_back(WidenMap.get(SrcInst)); |
6141 | } else { |
6142 | // The parameter is a scalar from outside the loop. Maybe even a constant. |
6143 | VectorParts Scalars; |
6144 | Scalars.append(UF, SrcOp); |
6145 | Params.push_back(Scalars); |
6146 | } |
6147 | } |
6148 | |
6149 | assert(Params.size() == Instr->getNumOperands() &&((Params.size() == Instr->getNumOperands() && "Invalid number of operands" ) ? static_cast<void> (0) : __assert_fail ("Params.size() == Instr->getNumOperands() && \"Invalid number of operands\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6150, __PRETTY_FUNCTION__)) |
6150 | "Invalid number of operands")((Params.size() == Instr->getNumOperands() && "Invalid number of operands" ) ? static_cast<void> (0) : __assert_fail ("Params.size() == Instr->getNumOperands() && \"Invalid number of operands\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6150, __PRETTY_FUNCTION__)); |
6151 | |
6152 | // Does this instruction return a value ? |
6153 | bool IsVoidRetTy = Instr->getType()->isVoidTy(); |
6154 | |
6155 | Value *UndefVec = IsVoidRetTy ? nullptr : |
6156 | UndefValue::get(Instr->getType()); |
6157 | // Create a new entry in the WidenMap and initialize it to Undef or Null. |
6158 | VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); |
6159 | |
6160 | Instruction *InsertPt = Builder.GetInsertPoint(); |
6161 | BasicBlock *IfBlock = Builder.GetInsertBlock(); |
6162 | BasicBlock *CondBlock = nullptr; |
6163 | |
6164 | VectorParts Cond; |
6165 | Loop *VectorLp = nullptr; |
6166 | if (IfPredicateStore) { |
6167 | assert(Instr->getParent()->getSinglePredecessor() &&((Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks") ? static_cast<void > (0) : __assert_fail ("Instr->getParent()->getSinglePredecessor() && \"Only support single predecessor blocks\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6168, __PRETTY_FUNCTION__)) |
6168 | "Only support single predecessor blocks")((Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks") ? static_cast<void > (0) : __assert_fail ("Instr->getParent()->getSinglePredecessor() && \"Only support single predecessor blocks\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6168, __PRETTY_FUNCTION__)); |
6169 | Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), |
6170 | Instr->getParent()); |
6171 | VectorLp = LI->getLoopFor(IfBlock); |
6172 | assert(VectorLp && "Must have a loop for this block")((VectorLp && "Must have a loop for this block") ? static_cast <void> (0) : __assert_fail ("VectorLp && \"Must have a loop for this block\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6172, __PRETTY_FUNCTION__)); |
6173 | } |
6174 | |
6175 | // For each vector unroll 'part': |
6176 | for (unsigned Part = 0; Part < UF; ++Part) { |
6177 | // For each scalar that we create: |
6178 | |
6179 | // Start an "if (pred) a[i] = ..." block. |
6180 | Value *Cmp = nullptr; |
6181 | if (IfPredicateStore) { |
6182 | if (Cond[Part]->getType()->isVectorTy()) |
6183 | Cond[Part] = |
6184 | Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0)); |
6185 | Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part], |
6186 | ConstantInt::get(Cond[Part]->getType(), 1)); |
6187 | CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); |
6188 | LoopVectorBody.push_back(CondBlock); |
6189 | VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase()); |
6190 | // Update Builder with newly created basic block. |
6191 | Builder.SetInsertPoint(InsertPt); |
6192 | } |
6193 | |
6194 | Instruction *Cloned = Instr->clone(); |
6195 | if (!IsVoidRetTy) |
6196 | Cloned->setName(Instr->getName() + ".cloned"); |
6197 | // Replace the operands of the cloned instructions with extracted scalars. |
6198 | for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { |
6199 | Value *Op = Params[op][Part]; |
6200 | Cloned->setOperand(op, Op); |
6201 | } |
6202 | |
6203 | // Place the cloned scalar in the new loop. |
6204 | Builder.Insert(Cloned); |
6205 | |
6206 | // If the original scalar returns a value we need to place it in a vector |
6207 | // so that future users will be able to use it. |
6208 | if (!IsVoidRetTy) |
6209 | VecResults[Part] = Cloned; |
6210 | |
6211 | // End if-block. |
6212 | if (IfPredicateStore) { |
6213 | BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); |
6214 | LoopVectorBody.push_back(NewIfBlock); |
6215 | VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase()); |
6216 | Builder.SetInsertPoint(InsertPt); |
6217 | Instruction *OldBr = IfBlock->getTerminator(); |
6218 | BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); |
6219 | OldBr->eraseFromParent(); |
6220 | IfBlock = NewIfBlock; |
6221 | } |
6222 | } |
6223 | } |
6224 | |
6225 | void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { |
6226 | StoreInst *SI = dyn_cast<StoreInst>(Instr); |
6227 | bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent())); |
6228 | |
6229 | return scalarizeInstruction(Instr, IfPredicateStore); |
6230 | } |
6231 | |
6232 | Value *InnerLoopUnroller::reverseVector(Value *Vec) { |
6233 | return Vec; |
6234 | } |
6235 | |
6236 | Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { |
6237 | return V; |
6238 | } |
6239 | |
6240 | Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx, |
6241 | bool Negate) { |
6242 | // When unrolling and the VF is 1, we only need to add a simple scalar. |
6243 | Type *ITy = Val->getType(); |
6244 | assert(!ITy->isVectorTy() && "Val must be a scalar")((!ITy->isVectorTy() && "Val must be a scalar") ? static_cast <void> (0) : __assert_fail ("!ITy->isVectorTy() && \"Val must be a scalar\"" , "/tmp/buildd/llvm-toolchain-snapshot-3.6~svn223149/lib/Transforms/Vectorize/LoopVectorize.cpp" , 6244, __PRETTY_FUNCTION__)); |
6245 | Constant *C = ConstantInt::get(ITy, StartIdx, Negate); |
6246 | return Builder.CreateAdd(Val, C, "induction"); |
6247 | } |