LoopVectorize.cpp (LLVM 3.7.0)
//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
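// For example (an illustrative sketch, not taken from this file): with a
// vector width of 4, a scalar loop such as
//   for (i = 0; i < n; ++i) A[i] += K;
// is conceptually rewritten so that each wide iteration updates A[i:i+3]
// with a <4 x i32> splat of K, and 'i' advances by 4.
//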
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
// Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
// Data for SIMD.
//
// Other ideas/concepts are from:
// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
// Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include <algorithm>
#include <map>
#include <tuple>

using namespace llvm;
using namespace llvm::PatternMatch;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

static cl::opt<bool>
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                   cl::desc("Enable if-conversion during vectorization."));

/// We don't vectorize loops with a known constant trip count below this number.
static cl::opt<unsigned>
TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
                             cl::Hidden,
                             cl::desc("Don't vectorize loops with a constant "
                                      "trip count that is smaller than this "
                                      "value."));

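// Example invocation (illustrative only; assumes the standalone 'opt' tool):
//   opt -loop-vectorize -vectorizer-min-trip-count=4 -S input.ll
// would let loops with constant trip counts as small as 4 be considered.
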
/// This enables versioning on the strides of symbolically striding memory
/// accesses in code like the following.
///   for (i = 0; i < N; ++i)
///     A[i * Stride1] += B[i * Stride2] ...
///
/// Will be roughly translated to
///    if (Stride1 == 1 && Stride2 == 1) {
///      for (i = 0; i < N; i+=4)
///       A[i:i+3] += ...
///    } else
///      ...
static cl::opt<bool> EnableMemAccessVersioning(
    "enable-mem-access-versioning", cl::init(true), cl::Hidden,
    cl::desc("Enable symbolic stride memory access versioning"));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// Maximum factor for an interleaved memory access.
static cl::opt<unsigned> MaxInterleaveGroupFactor(
    "max-interleave-group-factor", cl::Hidden,
    cl::desc("Maximum factor for an interleaved access group (default = 8)"),
    cl::init(8));

/// We don't interleave loops with a known constant trip count below this
/// number.
static const unsigned TinyTripCountInterleaveThreshold = 128;

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(false), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

namespace {

// Forward declarations.
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class LoopVectorizeHints;

/// \brief This modifies LoopAccessReport to initialize message with
/// loop-vectorizer-specific part.
class VectorizationReport : public LoopAccessReport {
public:
  VectorizationReport(Instruction *I = nullptr)
      : LoopAccessReport("loop not vectorized: ", I) {}

  /// \brief This allows promotion of the loop-access analysis report into the
  /// loop-vectorizer report. It modifies the message to add the
  /// loop-vectorizer-specific part of the message.
  explicit VectorizationReport(const LoopAccessReport &R)
      : LoopAccessReport(Twine("loop not vectorized: ") + R.str(),
                         R.getInstr()) {}
};

/// A helper function for converting Scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
/// the scalar type.
static Type *ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}
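
// A quick sanity sketch of ToVectorTy (assumed usage, not from this file):
//   ToVectorTy(Int32Ty, 4) --> <4 x i32>
//   ToVectorTy(Int32Ty, 1) --> i32   (VF of one stays scalar)
//   ToVectorTy(VoidTy, 4)  --> void  (void has no vector form)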

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found in the loop.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                      DominatorTree *DT, const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, unsigned VecWidth,
                      unsigned UnrollFactor)
      : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
        Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
        Legal(nullptr), AddedSafetyChecks(false) {}

  // Perform the actual loop widening (vectorization).
  void vectorize(LoopVectorizationLegality *L) {
    Legal = L;
    // Create a new empty loop. Unlink the old loop and connect the new one.
    createEmptyLoop();
    // Widen each instruction in the old loop to a new one in the new loop.
    // Use the Legality module to find the induction and reduction variables.
    vectorizeLoop();
    // Register the new loop and update the analysis passes.
    updateAnalysis();
  }

  // Return true if any runtime check is added.
  bool IsSafetyChecksAdded() {
    return AddedSafetyChecks;
  }

  virtual ~InnerLoopVectorizer() {}

protected:
  /// A small list of PHINodes.
  typedef SmallVector<PHINode *, 4> PhiVector;
  /// When we unroll loops we have multiple vector values for each scalar.
  /// This data structure holds the unrolled and vectorized values that
  /// originated from one scalar instruction.
  typedef SmallVector<Value *, 2> VectorParts;
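
  // For instance (illustrative, assuming VF = 4 and UF = 2): one scalar add
  // in the original loop yields two <4 x i32> adds in the vector loop, so its
  // VectorParts entry holds two Values, one per unrolled part.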

  // When we if-convert we need to create edge masks. We have to cache values
  // so that we don't end up with exponential recursion/IR.
  typedef DenseMap<std::pair<BasicBlock *, BasicBlock *>, VectorParts>
      EdgeMaskCache;

  /// \brief Add checks for strides that were assumed to be 1.
  ///
  /// Returns the first check instruction and the last check instruction as
  /// the pair (first, last).
  std::pair<Instruction *, Instruction *> addStrideCheck(Instruction *Loc);

  /// Create an empty loop, based on the loop ranges of the old loop.
  void createEmptyLoop();
  /// Copy and widen the instructions from the old loop.
  virtual void vectorizeLoop();

  /// \brief The Loop exit block may have single value PHI nodes where the
  /// incoming value is 'Undef'. While vectorizing we only handled real values
  /// that were defined inside the loop. Here we fix the 'undef case'.
  /// See PR14725.
  void fixLCSSAPHIs();

  /// A helper function that computes the predicate of the block BB, assuming
  /// that the header block of the loop is set to True. It returns the *entry*
  /// mask for the block BB.
  VectorParts createBlockInMask(BasicBlock *BB);
  /// A helper function that computes the predicate of the edge between SRC
  /// and DST.
  VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);

  /// A helper function to vectorize a single BB within the innermost loop.
  void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VectorParts &Entry,
                           unsigned UF, unsigned VF, PhiVector *PV);

  /// Insert the new loop to the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// This instruction is un-vectorizable. Implement it as a sequence
  /// of scalars. If \p IfPredicateStore is true we need to 'hide' each
  /// scalarized instruction behind an if block predicated on the control
  /// dependence of the instruction.
  virtual void scalarizeInstruction(Instruction *Instr,
                                    bool IfPredicateStore = false);

  /// Vectorize Load and Store instructions.
  virtual void vectorizeMemoryInstruction(Instruction *Instr);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step);
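
  // For example (illustrative only): with VF = 4, StartIdx = 0 and Step = 1,
  // getStepVector(Val, 0, 1) conceptually produces Val + <0, 1, 2, 3>, the
  // per-lane induction values for one wide iteration.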

  /// When we go over instructions in the basic block we rely on previous
  /// values within the current basic block or on loop invariant values.
  /// When we widen (vectorize) values we place them in the map. If the values
  /// are not within the map, they have to be loop invariant, so we simply
  /// broadcast them into a vector.
  VectorParts &getVectorValue(Value *V);

  /// Try to vectorize the interleaved access group that \p Instr belongs to.
  void vectorizeInterleaveGroup(Instruction *Instr);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// This is a helper class that holds the vectorizer state. It maps scalar
  /// instructions to vector instructions. When the code is 'unrolled' then
  /// a single scalar value is mapped to multiple vector parts. The parts
  /// are stored in the VectorParts type.
  struct ValueMap {
    /// C'tor. UnrollFactor controls the number of vectors ('parts') that
    /// are mapped.
    ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}

    /// \return True if 'Key' is saved in the Value Map.
    bool has(Value *Key) const { return MapStorage.count(Key); }

    /// Initializes a new entry in the map. Sets all of the vector parts to the
    /// same value in 'Val'.
    /// \return A reference to a vector with splat values.
    VectorParts &splat(Value *Key, Value *Val) {
      VectorParts &Entry = MapStorage[Key];
      Entry.assign(UF, Val);
      return Entry;
    }

    ///\return A reference to the value that is stored at 'Key'.
    VectorParts &get(Value *Key) {
      VectorParts &Entry = MapStorage[Key];
      if (Entry.empty())
        Entry.resize(UF);
      assert(Entry.size() == UF);
      return Entry;
    }

  private:
    /// The unroll factor. Each entry in the map stores this number of vector
    /// elements.
    unsigned UF;

    /// Map storage. We use std::map and not DenseMap because insertions to a
    /// dense map invalidate its iterators.
    std::map<Value *, VectorParts> MapStorage;
  };

  /// The original loop.
  Loop *OrigLoop;
  /// Scev analysis to use.
  ScalarEvolution *SE;
  /// Loop Info.
  LoopInfo *LI;
  /// Dominator Tree.
  DominatorTree *DT;
  /// Alias Analysis.
  AliasAnalysis *AA;
  /// Target Library Info.
  const TargetLibraryInfo *TLI;
  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

protected:
  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;
  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;
  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;
  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;
  /// The vector loop body.
  SmallVector<BasicBlock *, 4> LoopVectorBody;
  /// The scalar loop body.
  BasicBlock *LoopScalarBody;
  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction;
  /// The induction variable of the old basic block.
  PHINode *OldInduction;
  /// Holds the extended (to the widest induction type) start index.
  Value *ExtendedIdx;
  /// Maps scalars to widened vectors.
  ValueMap WidenMap;
  EdgeMaskCache MaskCache;

  LoopVectorizationLegality *Legal;

  // Record whether runtime check is added.
  bool AddedSafetyChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                    DominatorTree *DT, const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, unsigned UnrollFactor)
      : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {}

private:
  void scalarizeInstruction(Instruction *Instr,
                            bool IfPredicateStore = false) override;
  void vectorizeMemoryInstruction(Instruction *Instr) override;
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step) override;
  Value *reverseVector(Value *Vec) override;
};

/// \brief Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

/// \brief Set the debug location in the builder using the debug location in
/// the instruction.
static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr))
    B.SetCurrentDebugLocation(Inst->getDebugLoc());
  else
    B.SetCurrentDebugLocation(DebugLoc());
}

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

/// \brief Propagate known metadata from one instruction to another.
static void propagateMetadata(Instruction *To, const Instruction *From) {
  SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
  From->getAllMetadataOtherThanDebugLoc(Metadata);

  for (auto M : Metadata) {
    unsigned Kind = M.first;

    // These are safe to transfer (this is safe for TBAA, even when we
    // if-convert, because should that metadata have had a control dependency
    // on the condition, and thus actually aliased with some other
    // non-speculated memory access when the condition was false, this would be
    // caught by the runtime overlap checks).
    if (Kind != LLVMContext::MD_tbaa &&
        Kind != LLVMContext::MD_alias_scope &&
        Kind != LLVMContext::MD_noalias &&
        Kind != LLVMContext::MD_fpmath)
      continue;

    To->setMetadata(Kind, M.second);
  }
}

/// \brief Propagate known metadata from one instruction to a vector of others.
static void propagateMetadata(SmallVectorImpl<Value *> &To,
                              const Instruction *From) {
  for (Value *V : To)
    if (Instruction *I = dyn_cast<Instruction>(V))
      propagateMetadata(I, From);
}

/// \brief The group of interleaved loads/stores sharing the same stride and
/// close to each other.
///
/// Each member in this group has an index starting from 0, and the largest
/// index should be less than the interleave factor, which is equal to the
/// absolute value of the access's stride.
///
/// E.g. An interleaved load group of factor 4:
///        for (unsigned i = 0; i < 1024; i+=4) {
///          a = A[i];                           // Member of index 0
///          b = A[i+1];                         // Member of index 1
///          d = A[i+3];                         // Member of index 3
///          ...
///        }
///
///      An interleaved store group of factor 4:
///        for (unsigned i = 0; i < 1024; i+=4) {
///          ...
///          A[i]   = a;                         // Member of index 0
///          A[i+1] = b;                         // Member of index 1
///          A[i+2] = c;                         // Member of index 2
///          A[i+3] = d;                         // Member of index 3
///        }
///
/// Note: the interleaved load group could have gaps (missing members), but
/// the interleaved store group doesn't allow gaps.
class InterleaveGroup {
public:
  InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
      : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) {
    assert(Align && "The alignment should be non-zero");

    Factor = std::abs(Stride);
    assert(Factor > 1 && "Invalid interleave factor");

    Reverse = Stride < 0;
    Members[0] = Instr;
  }

  bool isReverse() const { return Reverse; }
  unsigned getFactor() const { return Factor; }
  unsigned getAlignment() const { return Align; }
  unsigned getNumMembers() const { return Members.size(); }

  /// \brief Try to insert a new member \p Instr with index \p Index and
  /// alignment \p NewAlign. The index is related to the leader and it could be
  /// negative if it is the new leader.
  ///
  /// \returns false if the instruction doesn't belong to the group.
  bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
    assert(NewAlign && "The new member's alignment should be non-zero");

    int Key = Index + SmallestKey;

    // Skip if there is already a member with the same index.
    if (Members.count(Key))
      return false;

    if (Key > LargestKey) {
      // The largest index is always less than the interleave factor.
      if (Index >= static_cast<int>(Factor))
        return false;

      LargestKey = Key;
    } else if (Key < SmallestKey) {
      // The largest index is always less than the interleave factor.
      if (LargestKey - Key >= static_cast<int>(Factor))
        return false;

      SmallestKey = Key;
    }

    // It's always safe to select the minimum alignment.
    Align = std::min(Align, NewAlign);
    Members[Key] = Instr;
    return true;
  }

  /// \brief Get the member with the given index \p Index.
  ///
  /// \returns nullptr if the group contains no such member.
  Instruction *getMember(unsigned Index) const {
    int Key = SmallestKey + Index;
    if (!Members.count(Key))
      return nullptr;

    return Members.find(Key)->second;
  }

  /// \brief Get the index for the given member. Unlike the key in the member
  /// map, the index starts from 0.
  unsigned getIndex(Instruction *Instr) const {
    for (auto I : Members)
      if (I.second == Instr)
        return I.first - SmallestKey;

    llvm_unreachable("InterleaveGroup contains no such member");
  }

  Instruction *getInsertPos() const { return InsertPos; }
  void setInsertPos(Instruction *Inst) { InsertPos = Inst; }

private:
  unsigned Factor; // Interleave Factor.
  bool Reverse;
  unsigned Align;
  DenseMap<int, Instruction *> Members;
  int SmallestKey;
  int LargestKey;

  // To avoid breaking dependences, vectorized instructions of an interleave
  // group should be inserted at either the first load or the last store in
  // program order.
  //
  // E.g. %even = load i32             // Insert Position
  //      %add = add i32 %even         // Use of %even
  //      %odd = load i32
  //
  //      store i32 %even
  //      %odd = add i32               // Def of %odd
  //      store i32 %odd               // Insert Position
  Instruction *InsertPos;
};
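
// A usage sketch (illustrative only; LoadA0 etc. are hypothetical
// instructions): building a factor-4 load group from A[i], A[i+1], A[i+3].
//   InterleaveGroup G(LoadA0, /*Stride=*/4, /*Align=*/8); // leader, index 0
//   G.insertMember(LoadA1, /*Index=*/1, 8);               // A[i+1]
//   G.insertMember(LoadA3, /*Index=*/3, 4);               // A[i+3], gap at 2
//   // G.getFactor() == 4, G.getMember(2) == nullptr: load gaps are allowed.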

/// \brief Drive the analysis of interleaved memory accesses in the loop.
///
/// Use this class to analyze interleaved accesses only when we can vectorize
/// a loop. Otherwise it's meaningless to do analysis as the vectorization
/// on interleaved accesses is unsafe.
///
/// The analysis collects interleave groups and records the relationships
/// between the member and the group in a map.
class InterleavedAccessInfo {
public:
  InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT)
      : SE(SE), TheLoop(L), DT(DT) {}

  ~InterleavedAccessInfo() {
    SmallSet<InterleaveGroup *, 4> DelSet;
    // Avoid releasing a pointer twice.
    for (auto &I : InterleaveGroupMap)
      DelSet.insert(I.second);
    for (auto *Ptr : DelSet)
      delete Ptr;
  }

  /// \brief Analyze the interleaved accesses and collect them in interleave
  /// groups. Substitute symbolic strides using \p Strides.
  void analyzeInterleaving(const ValueToValueMap &Strides);

  /// \brief Check if \p Instr belongs to any interleave group.
  bool isInterleaved(Instruction *Instr) const {
    return InterleaveGroupMap.count(Instr);
  }

  /// \brief Get the interleave group that \p Instr belongs to.
  ///
  /// \returns nullptr if \p Instr does not belong to any group.
  InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
    if (InterleaveGroupMap.count(Instr))
      return InterleaveGroupMap.find(Instr)->second;
    return nullptr;
  }

private:
  ScalarEvolution *SE;
  Loop *TheLoop;
  DominatorTree *DT;

  /// Holds the relationships between the members and the interleave group.
  DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;

  /// \brief The descriptor for a strided memory access.
  struct StrideDescriptor {
    StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size,
                     unsigned Align)
        : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}

    StrideDescriptor() : Stride(0), Scev(nullptr), Size(0), Align(0) {}

    int Stride;       // The access's stride. It is negative for a reverse access.
    const SCEV *Scev; // The scalar expression of this access.
    unsigned Size;    // The size of the memory object.
    unsigned Align;   // The alignment of this access.
  };

  /// \brief Create a new interleave group with the given instruction \p Instr,
  /// stride \p Stride and alignment \p Align.
  ///
  /// \returns the newly created interleave group.
  InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,
                                         unsigned Align) {
    assert(!InterleaveGroupMap.count(Instr) &&
           "Already in an interleaved access group");
    InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align);
    return InterleaveGroupMap[Instr];
  }

  /// \brief Release the group and remove all the relationships.
  void releaseGroup(InterleaveGroup *Group) {
    for (unsigned i = 0; i < Group->getFactor(); i++)
      if (Instruction *Member = Group->getMember(i))
        InterleaveGroupMap.erase(Member);

    delete Group;
  }

  /// \brief Collect all the accesses with a constant stride in program order.
  void collectConstStridedAccesses(
      MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
      const ValueToValueMap &Strides);
};

/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
/// legality. This class has two main kinds of checks:
/// * Memory checks - The code in canVectorizeMemory checks if vectorization
///   will change the order of memory accesses in a way that will change the
///   correctness of the program.
/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
///   checks for a number of different conditions, such as the availability of
///   a single induction variable, that all types are supported and
///   vectorize-able, etc. This code reflects the capabilities of
///   InnerLoopVectorizer.
/// This class is also used by InnerLoopVectorizer for identifying
/// induction variables and the different reduction variables.
class LoopVectorizationLegality {
public:
  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DominatorTree *DT,
                            TargetLibraryInfo *TLI, AliasAnalysis *AA,
                            Function *F, const TargetTransformInfo *TTI,
                            LoopAccessAnalysis *LAA)
      : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F),
        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(SE, L, DT),
        Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {}

  /// This enum represents the kinds of inductions that we support.
  enum InductionKind {
    IK_NoInduction,  ///< Not an induction variable.
    IK_IntInduction, ///< Integer induction variable. Step = C.
    IK_PtrInduction  ///< Pointer induction var. Step = C / sizeof(elem).
  };

  /// A struct for saving information about induction variables.
  struct InductionInfo {
    InductionInfo(Value *Start, InductionKind K, ConstantInt *Step)
        : StartValue(Start), IK(K), StepValue(Step) {
      assert(IK != IK_NoInduction && "Not an induction");
      assert(StartValue && "StartValue is null");
      assert(StepValue && !StepValue->isZero() && "StepValue is zero");
      assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) &&
             "StartValue is not a pointer for pointer induction");
      assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) &&
             "StartValue is not an integer for integer induction");
      assert(StepValue->getType()->isIntegerTy() &&
             "StepValue is not an integer");
    }
    InductionInfo()
        : StartValue(nullptr), IK(IK_NoInduction), StepValue(nullptr) {}

    /// Get the consecutive direction. Returns:
    ///   0 - unknown or non-consecutive.
    ///   1 - consecutive and increasing.
    ///  -1 - consecutive and decreasing.
    int getConsecutiveDirection() const {
      if (StepValue && (StepValue->isOne() || StepValue->isMinusOne()))
        return StepValue->getSExtValue();
      return 0;
    }

    /// Compute the transformed value of Index at offset StartValue using step
    /// StepValue.
    /// For integer induction, returns StartValue + Index * StepValue.
    /// For pointer induction, returns StartValue[Index * StepValue].
    /// FIXME: The newly created binary instructions should contain nsw/nuw
    /// flags, which can be found from the original scalar operations.
    Value *transform(IRBuilder<> &B, Value *Index) const {
      switch (IK) {
      case IK_IntInduction:
        assert(Index->getType() == StartValue->getType() &&
               "Index type does not match StartValue type");
        if (StepValue->isMinusOne())
          return B.CreateSub(StartValue, Index);
        if (!StepValue->isOne())
          Index = B.CreateMul(Index, StepValue);
        return B.CreateAdd(StartValue, Index);

      case IK_PtrInduction:
        assert(Index->getType() == StepValue->getType() &&
               "Index type does not match StepValue type");
        if (StepValue->isMinusOne())
          Index = B.CreateNeg(Index);
        else if (!StepValue->isOne())
          Index = B.CreateMul(Index, StepValue);
        return B.CreateGEP(nullptr, StartValue, Index);

      case IK_NoInduction:
        return nullptr;
      }
      llvm_unreachable("invalid enum");
    }
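
    // Worked example (illustrative only): for an integer induction with
    // StartValue = 10 and StepValue = 2, transform(B, 3) emits
    // 10 + 3 * 2 = 16, i.e. the scalar induction value at iteration 3.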

    /// Start value.
    TrackingVH<Value> StartValue;
    /// Induction kind.
    InductionKind IK;
    /// Step value.
    ConstantInt *StepValue;
  };

  /// ReductionList contains the reduction descriptors for all
  /// of the reductions that were found in the loop.
  typedef DenseMap<PHINode *, RecurrenceDescriptor> ReductionList;

  /// InductionList saves induction variables and maps them to the
  /// induction descriptor.
  typedef MapVector<PHINode *, InductionInfo> InductionList;

  /// Returns true if it is legal to vectorize this loop.
  /// This does not mean that it is profitable to vectorize this
  /// loop, only that it is legal to do so.
  bool canVectorize();

  /// Returns the Induction variable.
  PHINode *getInduction() { return Induction; }

  /// Returns the reduction variables found in the loop.
  ReductionList *getReductionVars() { return &Reductions; }

  /// Returns the induction variables found in the loop.
  InductionList *getInductionVars() { return &Inductions; }

  /// Returns the widest induction type.
  Type *getWidestInductionType() { return WidestIndTy; }

  /// Returns True if V is an induction variable in this loop.
  bool isInductionVariable(const Value *V);

  /// Return true if the block BB needs to be predicated in order for the loop
  /// to be vectorized.
  bool blockNeedsPredication(BasicBlock *BB);

  /// Check if this pointer is consecutive when vectorizing. This happens
  /// when the last index of the GEP is the induction variable, or that the
  /// pointer itself is an induction variable.
  /// This check allows us to vectorize A[idx] into a wide load/store.
  /// Returns:
  ///   0 - Stride is unknown or non-consecutive.
  ///   1 - Address is consecutive.
  ///  -1 - Address is consecutive, and decreasing.
  int isConsecutivePtr(Value *Ptr);
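
  // For intuition (illustrative, not from this file): with induction 'i',
  //   A[i]     -> 1  (consecutive, increasing)
  //   A[N - i] -> -1 (consecutive, decreasing)
  //   A[2 * i] -> 0  (strided, not consecutive)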

  /// Returns true if the value V is uniform within the loop.
  bool isUniform(Value *V);

  /// Returns true if this instruction will remain scalar after vectorization.
  bool isUniformAfterVectorization(Instruction *I) { return Uniforms.count(I); }

  /// Returns the information that we collected about runtime memory check.
  const RuntimePointerChecking *getRuntimePointerChecking() const {
    return LAI->getRuntimePointerChecking();
  }

  const LoopAccessInfo *getLAI() const {
    return LAI;
  }

  /// \brief Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// \brief Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }

  bool hasStride(Value *V) { return StrideSet.count(V); }
  bool mustCheckStrides() { return !StrideSet.empty(); }
  SmallPtrSet<Value *, 8>::iterator strides_begin() {
    return StrideSet.begin();
  }
  SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
    return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr));
  }
  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
    return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr));
  }
  /// Returns true if the vector representation of the instruction \p I
  /// requires a mask.
  bool isMaskRequired(const Instruction *I) {
    return (MaskedOp.count(I) != 0);
  }
  unsigned getNumStores() const {
    return LAI->getNumStores();
  }
  unsigned getNumLoads() const {
    return LAI->getNumLoads();
  }
  unsigned getNumPredStores() const {
    return NumPredStores;
  }
private:
  /// Check if a single basic block loop is vectorizable.
  /// At this point we know that this is a loop with a constant trip count
  /// and we only need to check individual instructions.
  bool canVectorizeInstrs();

  /// When we vectorize loops we may change the order in which
  /// we read and write from memory. This method checks if it is
  /// legal to vectorize the code, considering only memory constraints.
  /// Returns true if the loop is vectorizable.
  bool canVectorizeMemory();

  /// Return true if we can vectorize this loop using the IF-conversion
  /// transformation.
  bool canVectorizeWithIfConvert();

  /// Collect the variables that need to stay uniform after vectorization.
  void collectLoopUniforms();

  /// Return true if all of the instructions in the block can be speculatively
  /// executed. \p SafePtrs is a list of addresses that are known to be legal
  /// and we know that we can read from them without segfault.
  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);

  /// Returns the induction kind of Phi and record the step. This function may
  /// return NoInduction if the PHI is not an induction variable.
  InductionKind isInductionVariable(PHINode *Phi, ConstantInt *&StepValue);

  /// \brief Collect memory access with loop invariant strides.
  ///
  /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop
  /// invariant.
  void collectStridedAccess(Value *LoadOrStoreInst);

  /// Report an analysis message to assist the user in diagnosing loops that
  /// are not vectorized. These are handled as LoopAccessReport rather than
  /// VectorizationReport because the << operator of VectorizationReport
  /// returns LoopAccessReport.
  void emitAnalysis(const LoopAccessReport &Message) {
    LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME);
  }

  unsigned NumPredStores;

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// Target Library Info.
  TargetLibraryInfo *TLI;
  /// Parent function.
  Function *TheFunction;
  /// Target Transform Info.
  const TargetTransformInfo *TTI;
  /// Dominator Tree.
  DominatorTree *DT;
  // LoopAccess analysis.
  LoopAccessAnalysis *LAA;
  // And the loop-accesses info corresponding to this loop. This pointer is
  // null until canVectorizeMemory sets it up.
  const LoopAccessInfo *LAI;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo InterleaveInfo;

  // --- vectorization state --- //

  /// Holds the integer induction variable. This is the counter of the
  /// loop.
  PHINode *Induction;
  /// Holds the reduction variables.
  ReductionList Reductions;
  /// Holds all of the induction variables that we found in the loop.
  /// Notice that inductions don't need to start at zero and that induction
  /// variables can be pointers.
  InductionList Inductions;
  /// Holds the widest induction type encountered.
  Type *WidestIndTy;

  /// Allowed outside users. This holds the reduction
  /// vars which can be accessed from outside the loop.
  SmallPtrSet<Value *, 4> AllowedExit;
  /// This set holds the variables which are known to be uniform after
  /// vectorization.
  SmallPtrSet<Instruction *, 4> Uniforms;

  /// Can we assume the absence of NaNs.
  bool HasFunNoNaNAttr;

  ValueToValueMap Strides;
  SmallPtrSet<Value *, 8> StrideSet;

  /// While vectorizing these instructions we have to generate a
  /// call to the appropriate masked intrinsic.
  SmallPtrSet<const Instruction *, 8> MaskedOp;
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, AssumptionCache *AC,
                             const Function *F, const LoopVectorizeHints *Hints)
      : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI),
        TheFunction(F), Hints(Hints) {
    CodeMetrics::collectEphemeralValues(L, AC, EphValues);
  }

  /// Information about vectorization costs.
  struct VectorizationFactor {
    unsigned Width; // Vector width with best cost.
    unsigned Cost;  // Cost of the loop with that width.
  };
  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to VF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(bool OptForSize);
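
  // A sketch of how the selection proceeds (illustrative; the numbers assume
  // 256-bit vector registers and i32 as the widest type): candidate widths
  // are the powers of two 1, 2, 4 and 8, and the width with the lowest
  // per-lane cost, expectedCost(VF) / VF, wins.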

  /// \return The size (in bits) of the widest type in the code that
  /// needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  unsigned getWidestType();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
                                 unsigned LoopCost);

  /// \return The most profitable unroll factor.
  /// This method finds the best unroll-factor based on register pressure and
  /// other parameters. VF and LoopCost are the selected vectorization factor
  /// and the cost of the selected VF.
  unsigned computeInterleaveCount(bool OptForSize, unsigned VF,
                                  unsigned LoopCost);

  /// \brief A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    unsigned LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    unsigned MaxLocalUsers;
    /// Holds the number of instructions in the loop.
    unsigned NumInstructions;
  };

  /// \return information about the register usage of the loop.
  RegisterUsage calculateRegisterUsage();

private:
  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  unsigned expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  unsigned getInstructionCost(Instruction *I, unsigned VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Report an analysis message to assist the user in diagnosing loops that
  /// are not vectorized. These are handled as LoopAccessReport rather than
  /// VectorizationReport because the << operator of VectorizationReport
  /// returns LoopAccessReport.
  void emitAnalysis(const LoopAccessReport &Message) {
    LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME);
  }

  /// Values used only by @llvm.assume calls.
  SmallPtrSet<const Value *, 16> EphValues;

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// Loop Info analysis.
  LoopInfo *LI;
  /// Vectorization legality.
  LoopVectorizationLegality *Legal;
  /// Vector target information.
  const TargetTransformInfo &TTI;
  /// Target Library Info.
  const TargetLibraryInfo *TLI;
  const Function *TheFunction;
  // Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;
};

/// Utility class for getting and setting loop vectorizer hints in the form
/// of loop metadata.
/// This class keeps a number of loop annotations locally (as member variables)
/// and can, upon request, write them back as metadata on the loop. It will
/// initially scan the loop for existing metadata, and will update the local
/// values based on information in the loop.
/// We cannot write all values to metadata, as the mere presence of some info,
/// for example 'force', means a decision has been made. So, we need to be
/// careful NOT to add them if the user hasn't specifically asked so.
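///
/// For reference, a hint is stored as loop metadata roughly shaped like this
/// (an illustrative sketch for a width hint of 4; exact IR may differ):
///   br i1 %cond, label %loop.latch, label %exit, !llvm.loop !0
///   !0 = distinct !{!0, !1}
///   !1 = !{!"llvm.loop.vectorize.width", i32 4}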
class LoopVectorizeHints {
  enum HintKind {
    HK_WIDTH,
    HK_UNROLL,
    HK_FORCE
  };

  /// Hint - associates name and validation with the hint value.
  struct Hint {
    const char *Name;
    unsigned Value; // This may have to change for non-numeric values.
    HintKind Kind;

    Hint(const char *Name, unsigned Value, HintKind Kind)
        : Name(Name), Value(Value), Kind(Kind) {}

    bool validate(unsigned Val) {
      switch (Kind) {
      case HK_WIDTH:
        return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
      case HK_UNROLL:
        return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
      case HK_FORCE:
        return (Val <= 1);
      }
      return false;
    }
  };

  /// Vectorization width.
  Hint Width;
  /// Vectorization interleave factor.
  Hint Interleave;
  /// Vectorization forced.
  Hint Force;

  /// Return the loop metadata prefix.
  static StringRef Prefix() { return "llvm.loop."; }

public:
  enum ForceKind {
    FK_Undefined = -1, ///< Not selected.
    FK_Disabled = 0,   ///< Forcing disabled.
    FK_Enabled = 1,    ///< Forcing enabled.
  };

  LoopVectorizeHints(const Loop *L, bool DisableInterleaving)
      : Width("vectorize.width", VectorizerParams::VectorizationFactor,
              HK_WIDTH),
        Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
        Force("vectorize.enable", FK_Undefined, HK_FORCE),
        TheLoop(L) {
    // Populate values with existing loop metadata.
    getHintsFromMetadata();

    // force-vector-interleave overrides DisableInterleaving.
    if (VectorizerParams::isInterleaveForced())
      Interleave.Value = VectorizerParams::VectorizationInterleave;

    DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
          << "LV: Interleaving disabled by the pass manager\n");
  }

  /// Mark the loop L as already vectorized by setting the width to 1.
  void setAlreadyVectorized() {
    Width.Value = Interleave.Value = 1;
    Hint Hints[] = {Width, Interleave};
    writeHintsToMetadata(Hints);
  }

  /// Dumps all the hint information.
  std::string emitRemark() const {
    VectorizationReport R;
    if (Force.Value == LoopVectorizeHints::FK_Disabled)
      R << "vectorization is explicitly disabled";
    else {
      R << "use -Rpass-analysis=loop-vectorize for more info";
      if (Force.Value == LoopVectorizeHints::FK_Enabled) {
        R << " (Force=true";
        if (Width.Value != 0)
          R << ", Vector Width=" << Width.Value;
        if (Interleave.Value != 0)
          R << ", Interleave Count=" << Interleave.Value;
        R << ")";
      }
    }

    return R.str();
  }

  unsigned getWidth() const { return Width.Value; }
  unsigned getInterleave() const { return Interleave.Value; }
  enum ForceKind getForce() const { return (ForceKind)Force.Value; }

private:
  /// Find hints specified in the loop metadata and update local values.
  void getHintsFromMetadata() {
    MDNode *LoopID = TheLoop->getLoopID();
    if (!LoopID)
      return;

    // First operand should refer to the loop id itself.
    assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
    assert(LoopID->getOperand(0) == LoopID && "invalid loop id");

    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
      const MDString *S = nullptr;
      SmallVector<Metadata *, 4> Args;

      // The expected hint is either a MDString or a MDNode with the first
      // operand a MDString.
      if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
        if (!MD || MD->getNumOperands() == 0)
          continue;
        S = dyn_cast<MDString>(MD->getOperand(0));
        for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
          Args.push_back(MD->getOperand(i));
      } else {
        S = dyn_cast<MDString>(LoopID->getOperand(i));
        assert(Args.size() == 0 && "too many arguments for MDString");
      }

      if (!S)
        continue;

      // Check if the hint starts with the loop metadata prefix.
      StringRef Name = S->getString();
      if (Args.size() == 1)
        setHint(Name, Args[0]);
    }
  }

  /// Checks string hint with one operand and set value if valid.
  void setHint(StringRef Name, Metadata *Arg) {
    if (!Name.startswith(Prefix()))
      return;
    Name = Name.substr(Prefix().size(), StringRef::npos);

    const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
    if (!C)
      return;
    unsigned Val = C->getZExtValue();

    Hint *Hints[] = {&Width, &Interleave, &Force};
    for (auto H : Hints) {
      if (Name == H->Name) {
        if (H->validate(Val))
          H->Value = Val;
        else
          DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
        break;
      }
    }
  }

  /// Create a new hint from name / value pair.
  MDNode *createHintMetadata(StringRef Name, unsigned V) const {
    LLVMContext &Context = TheLoop->getHeader()->getContext();
    Metadata *MDs[] = {MDString::get(Context, Name),
                       ConstantAsMetadata::get(
                           ConstantInt::get(Type::getInt32Ty(Context), V))};
    return MDNode::get(Context, MDs);
  }

  /// Matches metadata with hint name.
  bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
    MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
    if (!Name)
      return false;

    for (auto H : HintTypes)
      if (Name->getString().endswith(H.Name))
        return true;
    return false;
  }

  /// Sets current hints into loop metadata, keeping other values intact.
  void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
    if (HintTypes.size() == 0)
      return;

    // Reserve the first element to LoopID (see below).
    SmallVector<Metadata *, 4> MDs(1);
    // If the loop already has metadata, then ignore the existing operands.
    MDNode *LoopID = TheLoop->getLoopID();
    if (LoopID) {
      for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
        MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
        // If node in update list, ignore old value.
        if (!matchesHintMetadataName(Node, HintTypes))
          MDs.push_back(Node);
      }
    }

    // Now, add the missing hints.
    for (auto H : HintTypes)
      MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));

    // Replace current metadata node with new one.
    LLVMContext &Context = TheLoop->getHeader()->getContext();
    MDNode *NewLoopID = MDNode::get(Context, MDs);
    // Set operand 0 to refer to the loop id itself.
    NewLoopID->replaceOperandWith(0, NewLoopID);

    TheLoop->setLoopID(NewLoopID);
  }

  /// The loop these hints belong to.
  const Loop *TheLoop;
};

static void emitMissedWarning(Function *F, Loop *L,
                              const LoopVectorizeHints &LH) {
  emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F,
                               L->getStartLoc(), LH.emitRemark());

  if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
    if (LH.getWidth() != 1)
      emitLoopVectorizeWarning(
          F->getContext(), *F, L->getStartLoc(),
          "failed explicitly specified loop vectorization");
    else if (LH.getInterleave() != 1)
      emitLoopInterleaveWarning(
          F->getContext(), *F, L->getStartLoc(),
          "failed explicitly specified loop interleaving");
  }
}

static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
  if (L.empty())
    return V.push_back(&L);

  for (Loop *InnerL : L)
    addInnerLoop(*InnerL, V);
}

/// The LoopVectorize Pass.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid.
  static char ID;

  explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
      : FunctionPass(ID),
        DisableUnrolling(NoUnrolling),
        AlwaysVectorize(AlwaysVectorize) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  ScalarEvolution *SE;
  LoopInfo *LI;
  TargetTransformInfo *TTI;
  DominatorTree *DT;
  BlockFrequencyInfo *BFI;
  TargetLibraryInfo *TLI;
  AliasAnalysis *AA;
  AssumptionCache *AC;
  LoopAccessAnalysis *LAA;
  bool DisableUnrolling;
  bool AlwaysVectorize;

  BlockFrequency ColdEntryFreq;

  bool runOnFunction(Function &F) override {
    SE = &getAnalysis<ScalarEvolution>();
    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    BFI = &getAnalysis<BlockFrequencyInfo>();
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    TLI = TLIP ? &TLIP->getTLI() : nullptr;
    AA = &getAnalysis<AliasAnalysis>();
    AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    LAA = &getAnalysis<LoopAccessAnalysis>();

    // Compute some weights outside of the loop over the loops. Compute this
    // using a BranchProbability to re-use its scaling math.
    const BranchProbability ColdProb(1, 5); // 20%
    ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;

    // Don't attempt if
    // 1. the target claims to have no vector registers, and
    // 2. interleaving won't help ILP.
    //
    // The second condition is necessary because, even if the target has no
    // vector registers, loop vectorization may still enable scalar
    // interleaving.
    if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
      return false;

    // Build up a worklist of inner-loops to vectorize. This is necessary as
    // the act of vectorizing or partially unrolling a loop creates new loops
    // and can invalidate iterators across the loops.
    SmallVector<Loop *, 8> Worklist;

    for (Loop *L : *LI)
      addInnerLoop(*L, Worklist);

    LoopsAnalyzed += Worklist.size();

    // Now walk the identified inner loops, processing each loop nest in the
    // function.
    bool Changed = false;
    while (!Worklist.empty())
      Changed |= processLoop(Worklist.pop_back_val());

    return Changed;
  }
1497 
1498 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
1499  SmallVector<Metadata *, 4> MDs;
1500  // Reserve first location for self reference to the LoopID metadata node.
1501  MDs.push_back(nullptr);
1502  bool IsUnrollMetadata = false;
1503  MDNode *LoopID = L->getLoopID();
1504  if (LoopID) {
1505  // First find existing loop unrolling disable metadata.
1506  for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
1507  MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
1508  if (MD) {
1509  const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
1510  IsUnrollMetadata =
1511  S && S->getString().startswith("llvm.loop.unroll.disable");
1512  }
1513  MDs.push_back(LoopID->getOperand(i));
1514  }
1515  }
1516 
1517  if (!IsUnrollMetadata) {
1518  // Add runtime unroll disable metadata.
1519  LLVMContext &Context = L->getHeader()->getContext();
1520  SmallVector<Metadata *, 1> DisableOperands;
1521  DisableOperands.push_back(
1522  MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
1523  MDNode *DisableNode = MDNode::get(Context, DisableOperands);
1524  MDs.push_back(DisableNode);
1525  MDNode *NewLoopID = MDNode::get(Context, MDs);
1526  // Set operand 0 to refer to the loop id itself.
1527  NewLoopID->replaceOperandWith(0, NewLoopID);
1528  L->setLoopID(NewLoopID);
1529  }
1530  }
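// A hedged illustration (editorial, not part of the original file): when the
// loop had no prior metadata, the node built above shows up in the IR roughly
// as follows, with operand 0 being the self-reference installed by
// replaceOperandWith(0, NewLoopID):
//
//   br i1 %exitcond, label %exit, label %header, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}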
1531 
1532  bool processLoop(Loop *L) {
1533  assert(L->empty() && "Only process inner loops.");
1534 
1535 #ifndef NDEBUG
1536  const std::string DebugLocStr = getDebugLocString(L);
1537 #endif /* NDEBUG */
1538 
1539  DEBUG(dbgs() << "\nLV: Checking a loop in \""
1540  << L->getHeader()->getParent()->getName() << "\" from "
1541  << DebugLocStr << "\n");
1542 
1543  LoopVectorizeHints Hints(L, DisableUnrolling);
1544 
1545  DEBUG(dbgs() << "LV: Loop hints:"
1546  << " force="
1547  << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
1548  ? "disabled"
1549  : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
1550  ? "enabled"
1551  : "?")) << " width=" << Hints.getWidth()
1552  << " unroll=" << Hints.getInterleave() << "\n");
1553 
1554  // Function containing loop
1555  Function *F = L->getHeader()->getParent();
1556 
1557  // Looking at the diagnostic output is the only way to determine if a loop
1558  // was vectorized (other than looking at the IR or machine code), so it
1559  // is important to generate an optimization remark for each loop. Most of
1560  // these messages are generated by emitOptimizationRemarkAnalysis. Remarks
1561  // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are
1562  // less verbose reporting vectorized loops and unvectorized loops that may
1563  // benefit from vectorization, respectively.
1564 
1565  if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) {
1566  DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
1567  emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
1568  L->getStartLoc(), Hints.emitRemark());
1569  return false;
1570  }
1571 
1572  if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) {
1573  DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
1574  emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
1575  L->getStartLoc(), Hints.emitRemark());
1576  return false;
1577  }
1578 
1579  if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) {
1580  DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
1581  emitOptimizationRemarkAnalysis(
1582  F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
1583  "loop not vectorized: vector width and interleave count are "
1584  "explicitly set to 1");
1585  return false;
1586  }
1587 
1588  // Check the loop for a trip count threshold:
1589  // do not vectorize loops with a tiny trip count.
1590  const unsigned TC = SE->getSmallConstantTripCount(L);
1591  if (TC > 0u && TC < TinyTripCountVectorThreshold) {
1592  DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
1593  << "This loop is not worth vectorizing.");
1594  if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
1595  DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
1596  else {
1597  DEBUG(dbgs() << "\n");
1598  emitOptimizationRemarkAnalysis(
1599  F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
1600  "vectorization is not beneficial and is not explicitly forced");
1601  return false;
1602  }
1603  }
1604 
1605  // Check if it is legal to vectorize the loop.
1606  LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA);
1607  if (!LVL.canVectorize()) {
1608  DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
1609  emitMissedWarning(F, L, Hints);
1610  return false;
1611  }
1612 
1613  // Use the cost model.
1614  LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, AC, F, &Hints);
1615 
1616  // Check the function attributes to find out if this function should be
1617  // optimized for size.
1618  bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
1619  F->hasFnAttribute(Attribute::OptimizeForSize);
1620 
1621  // Compute the weighted frequency of this loop being executed and see if it
1622  // is less than 20% of the function entry baseline frequency. Note that we
1623  // always have a canonical loop here because we think we *can* vectorize.
1624  // FIXME: This is hidden behind a flag due to pervasive problems with
1625  // exactly what block frequency models.
1626  if (LoopVectorizeWithBlockFrequency) {
1627  BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
1628  if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
1629  LoopEntryFreq < ColdEntryFreq)
1630  OptForSize = true;
1631  }
1632 
1633  // Check the function attributes to see if implicit floats are allowed.
1634  // FIXME: This check doesn't seem possibly correct -- what if the loop is
1635  // an integer loop and the vector instructions selected are purely integer
1636  // vector instructions?
1637  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1638  DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
1639  "attribute is used.\n");
1641  F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
1642  "loop not vectorized due to NoImplicitFloat attribute");
1643  emitMissedWarning(F, L, Hints);
1644  return false;
1645  }
1646 
1647  // Select the optimal vectorization factor.
1648  const LoopVectorizationCostModel::VectorizationFactor VF =
1649  CM.selectVectorizationFactor(OptForSize);
1650 
1651  // Select the interleave count.
1652  unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
1653 
1654  DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
1655  << DebugLocStr << '\n');
1656  DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
1657 
1658  if (VF.Width == 1) {
1659  DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n");
1660 
1661  if (IC == 1) {
1662  emitOptimizationRemarkAnalysis(
1663  F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
1664  "not beneficial to vectorize and user disabled interleaving");
1665  return false;
1666  }
1667  DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n");
1668 
1669  // Report the unrolling decision.
1670  emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
1671  Twine("interleaved by " + Twine(IC) +
1672  " (vectorization not beneficial)"));
1673 
1674  InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC);
1675  Unroller.vectorize(&LVL);
1676  } else {
1677  // If we decided that it is *legal* to vectorize the loop then do it.
1678  InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC);
1679  LB.vectorize(&LVL);
1680  ++LoopsVectorized;
1681 
1682  // Add metadata to disable runtime unrolling of the scalar loop when there
1683  // is no runtime check for strides and memory: in that situation the
1684  // scalar remainder loop is rarely executed and not worth unrolling.
1685  if (!LB.IsSafetyChecksAdded())
1686  AddRuntimeUnrollDisableMetaData(L);
1687 
1688  // Report the vectorization decision.
1689  emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
1690  Twine("vectorized loop (vectorization width: ") +
1691  Twine(VF.Width) + ", interleaved count: " +
1692  Twine(IC) + ")");
1693  }
1694 
1695  // Mark the loop as already vectorized to avoid vectorizing again.
1696  Hints.setAlreadyVectorized();
1697 
1698  DEBUG(verifyFunction(*L->getHeader()->getParent()));
1699  return true;
1700  }
1701 
1702  void getAnalysisUsage(AnalysisUsage &AU) const override {
1703  AU.addRequired<AssumptionCacheTracker>();
1704  AU.addRequiredID(LoopSimplifyID);
1705  AU.addRequiredID(LCSSAID);
1706  AU.addRequired<BlockFrequencyInfo>();
1707  AU.addRequired<DominatorTreeWrapperPass>();
1708  AU.addRequired<LoopInfoWrapperPass>();
1709  AU.addRequired<ScalarEvolution>();
1710  AU.addRequired<TargetTransformInfoWrapperPass>();
1711  AU.addRequired<AliasAnalysis>();
1712  AU.addRequired<LoopAccessAnalysis>();
1713  AU.addPreserved<LoopInfoWrapperPass>();
1714  AU.addPreserved<DominatorTreeWrapperPass>();
1715  AU.addPreserved<AliasAnalysis>();
1716  }
1717 
1718 };
1719 
1720 } // end anonymous namespace
1721 
1722 //===----------------------------------------------------------------------===//
1723 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1724 // LoopVectorizationCostModel.
1725 //===----------------------------------------------------------------------===//
1726 
1727 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1728  // We need to place the broadcast of invariant variables outside the loop.
1729  Instruction *Instr = dyn_cast<Instruction>(V);
1730  bool NewInstr =
1731  (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(),
1732  Instr->getParent()) != LoopVectorBody.end());
1733  bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
1734 
1735  // Place the code for broadcasting invariant variables in the new preheader.
1736  IRBuilder<>::InsertPointGuard Guard(Builder);
1737  if (Invariant)
1738  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1739 
1740  // Broadcast the scalar into all locations in the vector.
1741  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1742 
1743  return Shuf;
1744 }
1745 
1746 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
1747  Value *Step) {
1748  assert(Val->getType()->isVectorTy() && "Must be a vector");
1749  assert(Val->getType()->getScalarType()->isIntegerTy() &&
1750  "Elem must be an integer");
1751  assert(Step->getType() == Val->getType()->getScalarType() &&
1752  "Step has wrong type");
1753  // Create the types.
1754  Type *ITy = Val->getType()->getScalarType();
1755  VectorType *Ty = cast<VectorType>(Val->getType());
1756  int VLen = Ty->getNumElements();
1757  SmallVector<Constant*, 8> Indices;
1758 
1759  // Create a vector of consecutive numbers from zero to VF.
1760  for (int i = 0; i < VLen; ++i)
1761  Indices.push_back(ConstantInt::get(ITy, StartIdx + i));
1762 
1763  // Add the consecutive indices to the vector value.
1764  Constant *Cv = ConstantVector::get(Indices);
1765  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1766  Step = Builder.CreateVectorSplat(VLen, Step);
1767  assert(Step->getType() == Val->getType() && "Invalid step vec");
1768  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1769  // which can be found from the original scalar operations.
1770  Step = Builder.CreateMul(Cv, Step);
1771  return Builder.CreateAdd(Val, Step, "induction");
1772 }
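// A minimal sketch of the output (editorial, not from the source): with
// VF = 4, StartIdx = 0 and a runtime step %s, the code above emits roughly:
//
//   %step.splat = splat of %s                 ; CreateVectorSplat
//   %scaled     = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %step.splat
//   %induction  = add <4 x i32> %val, %scaled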
1773 
1774 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
1775  assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
1776  // Make sure that the pointer does not point to structs.
1777  if (Ptr->getType()->getPointerElementType()->isAggregateType())
1778  return 0;
1779 
1780  // If this value is a pointer induction variable we know it is consecutive.
1781  PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
1782  if (Phi && Inductions.count(Phi)) {
1783  InductionInfo II = Inductions[Phi];
1784  return II.getConsecutiveDirection();
1785  }
1786 
1787  GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
1788  if (!Gep)
1789  return 0;
1790 
1791  unsigned NumOperands = Gep->getNumOperands();
1792  Value *GpPtr = Gep->getPointerOperand();
1793  // If this GEP value is a consecutive pointer induction variable and all of
1794  // the indices are constant, then we know it is consecutive.
1795  Phi = dyn_cast<PHINode>(GpPtr);
1796  if (Phi && Inductions.count(Phi)) {
1797 
1798  // Make sure that the pointer does not point to structs.
1799  PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
1800  if (GepPtrType->getElementType()->isAggregateType())
1801  return 0;
1802 
1803  // Make sure that all of the index operands are loop invariant.
1804  for (unsigned i = 1; i < NumOperands; ++i)
1805  if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
1806  return 0;
1807 
1808  InductionInfo II = Inductions[Phi];
1809  return II.getConsecutiveDirection();
1810  }
1811 
1812  unsigned InductionOperand = getGEPInductionOperand(Gep);
1813 
1814  // Check that all of the gep indices are uniform except for our induction
1815  // operand.
1816  for (unsigned i = 0; i != NumOperands; ++i)
1817  if (i != InductionOperand &&
1818  !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
1819  return 0;
1820 
1821  // We can emit wide load/stores only if the last non-zero index is the
1822  // induction variable.
1823  const SCEV *Last = nullptr;
1824  if (!Strides.count(Gep))
1825  Last = SE->getSCEV(Gep->getOperand(InductionOperand));
1826  else {
1827  // Because of the multiplication by a stride we can have a s/zext cast.
1828  // We are going to replace this stride by 1 so the cast is safe to ignore.
1829  //
1830  // %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
1831  // %0 = trunc i64 %indvars.iv to i32
1832  // %mul = mul i32 %0, %Stride1
1833  // %idxprom = zext i32 %mul to i64 << Safe cast.
1834  // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom
1835  //
1836  Last = replaceSymbolicStrideSCEV(SE, Strides,
1837  Gep->getOperand(InductionOperand), Gep);
1838  if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last))
1839  Last =
1840  (C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend)
1841  ? C->getOperand()
1842  : Last;
1843  }
1844  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
1845  const SCEV *Step = AR->getStepRecurrence(*SE);
1846 
1847  // The memory is consecutive because the last index is consecutive
1848  // and all other indices are loop invariant.
1849  if (Step->isOne())
1850  return 1;
1851  if (Step->isAllOnesValue())
1852  return -1;
1853  }
1854 
1855  return 0;
1856 }
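// Editorial summary of the return values (not from the source), for pointers
// used inside "for (i = 0; i < n; ++i)":
//
//   &A[i]       ->  1   consecutive, forward
//   &A[n-i-1]   -> -1   consecutive, reverse
//   &A[2*i]     ->  0   non-consecutive; handled by scalarization or
//                       interleaved access vectorization instead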
1857 
1858 bool LoopVectorizationLegality::isUniform(Value *V) {
1859  return LAI->isUniform(V);
1860 }
1861 
1862 InnerLoopVectorizer::VectorParts&
1863 InnerLoopVectorizer::getVectorValue(Value *V) {
1864  assert(V != Induction && "The new induction variable should not be used.");
1865  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1866 
1867  // If we have a stride that is replaced by one, do it here.
1868  if (Legal->hasStride(V))
1869  V = ConstantInt::get(V->getType(), 1);
1870 
1871  // If we have this scalar in the map, return it.
1872  if (WidenMap.has(V))
1873  return WidenMap.get(V);
1874 
1875  // If this scalar is unknown, assume that it is a constant or that it is
1876  // loop invariant. Broadcast V and save the value for future uses.
1877  Value *B = getBroadcastInstrs(V);
1878  return WidenMap.splat(V, B);
1879 }
1880 
1881 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
1882  assert(Vec->getType()->isVectorTy() && "Invalid type");
1883  SmallVector<Constant*, 8> ShuffleMask;
1884  for (unsigned i = 0; i < VF; ++i)
1885  ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
1886 
1887  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
1888  ConstantVector::get(ShuffleMask),
1889  "reverse");
1890 }
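// Illustration (editorial): for VF = 4 the mask built above is <3, 2, 1, 0>,
// so the emitted shuffle is roughly:
//
//   %reverse = shufflevector <4 x i32> %vec, <4 x i32> undef,
//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>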
1891 
1892 // Get a mask to interleave \p NumVec vectors into a wide vector.
1893 // I.e. <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...>
1894 // E.g. For 2 interleaved vectors, if VF is 4, the mask is:
1895 // <0, 4, 1, 5, 2, 6, 3, 7>
1896 static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF,
1897  unsigned NumVec) {
1898  SmallVector<Constant *, 16> Mask;
1899  for (unsigned i = 0; i < VF; i++)
1900  for (unsigned j = 0; j < NumVec; j++)
1901  Mask.push_back(Builder.getInt32(j * VF + i));
1902 
1903  return ConstantVector::get(Mask);
1904 }
1905 
1906 // Get the strided mask starting from index \p Start.
1907 // I.e. <Start, Start + Stride, ..., Start + Stride*(VF-1)>
1908 static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start,
1909  unsigned Stride, unsigned VF) {
1910  SmallVector<Constant *, 16> Mask;
1911  for (unsigned i = 0; i < VF; i++)
1912  Mask.push_back(Builder.getInt32(Start + i * Stride));
1913 
1914  return ConstantVector::get(Mask);
1915 }
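// Illustration (editorial): getStridedMask(Builder, 1, 3, 4) returns
// <1, 4, 7, 10>, which is exactly the mask used to extract the G elements
// from the factor-3 R,G,B wide load in vectorizeInterleaveGroup below.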
1916 
1917 // Get a mask of two parts: The first part consists of sequential integers
1918 // starting from 0, The second part consists of UNDEFs.
1919 // I.e. <0, 1, 2, ..., NumInt - 1, undef, ..., undef>
1920 static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt,
1921  unsigned NumUndef) {
1922  SmallVector<Constant *, 16> Mask;
1923  for (unsigned i = 0; i < NumInt; i++)
1924  Mask.push_back(Builder.getInt32(i));
1925 
1926  Constant *Undef = UndefValue::get(Builder.getInt32Ty());
1927  for (unsigned i = 0; i < NumUndef; i++)
1928  Mask.push_back(Undef);
1929 
1930  return ConstantVector::get(Mask);
1931 }
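// Illustration (editorial): getSequentialMask(Builder, 4, 4) returns
// <0, 1, 2, 3, undef, undef, undef, undef>; ConcatenateTwoVectors below uses
// such masks to widen the shorter input before the final concatenation.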
1932 
1933 // Concatenate two vectors with the same element type. The 2nd vector should
1934 // not have more elements than the 1st vector. If the 2nd vector has fewer
1935 // elements, extend it with UNDEFs.
1936 static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1,
1937  Value *V2) {
1938  VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType());
1939  VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType());
1940  assert(VecTy1 && VecTy2 &&
1941  VecTy1->getScalarType() == VecTy2->getScalarType() &&
1942  "Expect two vectors with the same element type");
1943 
1944  unsigned NumElts1 = VecTy1->getNumElements();
1945  unsigned NumElts2 = VecTy2->getNumElements();
1946  assert(NumElts1 >= NumElts2 && "Unexpected: first vector has fewer elements");
1947 
1948  if (NumElts1 > NumElts2) {
1949  // Extend with UNDEFs.
1950  Constant *ExtMask =
1951  getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2);
1952  V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask);
1953  }
1954 
1955  Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0);
1956  return Builder.CreateShuffleVector(V1, V2, Mask);
1957 }
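// A worked example (editorial, not from the source): concatenating an
// <8 x i32> V1 with a <4 x i32> V2:
//   1. V2 is widened to 8 lanes with the mask <0, 1, 2, 3, undef x 4>.
//   2. The final shuffle applies the 12-wide mask <0, 1, ..., 11>, taking
//      lanes 0-7 from V1 and lanes 8-11 from the widened V2.
// The result is a single <12 x i32> value.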
1958 
1959 // Concatenate vectors in the given list. All vectors have the same type.
1960 static Value *ConcatenateVectors(IRBuilder<> &Builder,
1961  ArrayRef<Value *> InputList) {
1962  unsigned NumVec = InputList.size();
1963  assert(NumVec > 1 && "Should be at least two vectors");
1964 
1965  SmallVector<Value *, 8> ResList;
1966  ResList.append(InputList.begin(), InputList.end());
1967  do {
1968  SmallVector<Value *, 8> TmpList;
1969  for (unsigned i = 0; i < NumVec - 1; i += 2) {
1970  Value *V0 = ResList[i], *V1 = ResList[i + 1];
1971  assert((V0->getType() == V1->getType() || i == NumVec - 2) &&
1972  "Only the last vector may have a different type");
1973 
1974  TmpList.push_back(ConcatenateTwoVectors(Builder, V0, V1));
1975  }
1976 
1977  // Push the last vector if the total number of vectors is odd.
1978  if (NumVec % 2 != 0)
1979  TmpList.push_back(ResList[NumVec - 1]);
1980 
1981  ResList = TmpList;
1982  NumVec = ResList.size();
1983  } while (NumVec > 1);
1984 
1985  return ResList[0];
1986 }
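// A worked example (editorial): for three <4 x i32> inputs A, B, C the loop
// above concatenates pairwise: the first round yields [A++B as <8 x i32>, C],
// and the second round joins those into one <12 x i32>. This matches the
// %R_G.vec / %B_U.vec shuffles in the interleaved store example below.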
1987 
1988 // Try to vectorize the interleave group that \p Instr belongs to.
1989 //
1990 // E.g. Translate following interleaved load group (factor = 3):
1991 // for (i = 0; i < N; i+=3) {
1992 // R = Pic[i]; // Member of index 0
1993 // G = Pic[i+1]; // Member of index 1
1994 // B = Pic[i+2]; // Member of index 2
1995 // ... // do something to R, G, B
1996 // }
1997 // To:
1998 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
1999 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
2000 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
2001 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
2002 //
2003 // Or translate following interleaved store group (factor = 3):
2004 // for (i = 0; i < N; i+=3) {
2005 // ... do something to R, G, B
2006 // Pic[i] = R; // Member of index 0
2007 // Pic[i+1] = G; // Member of index 1
2008 // Pic[i+2] = B; // Member of index 2
2009 // }
2010 // To:
2011 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2012 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2013 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2014 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2015 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2016 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
2017  const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
2018  assert(Group && "Failed to get an interleaved access group.");
2019 
2020  // Skip if current instruction is not the insert position.
2021  if (Instr != Group->getInsertPos())
2022  return;
2023 
2024  LoadInst *LI = dyn_cast<LoadInst>(Instr);
2025  StoreInst *SI = dyn_cast<StoreInst>(Instr);
2026  Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
2027 
2028  // Prepare for the vector type of the interleaved load/store.
2029  Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
2030  unsigned InterleaveFactor = Group->getFactor();
2031  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2032  Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());
2033 
2034  // Prepare for the new pointers.
2035  setDebugLocFromInst(Builder, Ptr);
2036  VectorParts &PtrParts = getVectorValue(Ptr);
2037  SmallVector<Value *, 2> NewPtrs;
2038  unsigned Index = Group->getIndex(Instr);
2039  for (unsigned Part = 0; Part < UF; Part++) {
2040  // Extract the pointer for current instruction from the pointer vector. A
2041  // reverse access uses the pointer in the last lane.
2042  Value *NewPtr = Builder.CreateExtractElement(
2043  PtrParts[Part],
2044  Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0));
2045 
2046  // Note that the current instruction could be at any member index, so the
2047  // address needs to be adjusted to the member of index 0.
2048  //
2049  // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2050  // b = A[i]; // Member of index 0
2051  // Current pointer is pointed to A[i+1], adjust it to A[i].
2052  //
2053  // E.g. A[i+1] = a; // Member of index 1
2054  // A[i] = b; // Member of index 0
2055  // A[i+2] = c; // Member of index 2 (Current instruction)
2056  // Current pointer is pointed to A[i+2], adjust it to A[i].
2057  NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
2058 
2059  // Cast to the vector pointer type.
2060  NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2061  }
2062 
2063  setDebugLocFromInst(Builder, Instr);
2064  Value *UndefVec = UndefValue::get(VecTy);
2065 
2066  // Vectorize the interleaved load group.
2067  if (LI) {
2068  for (unsigned Part = 0; Part < UF; Part++) {
2069  Instruction *NewLoadInstr = Builder.CreateAlignedLoad(
2070  NewPtrs[Part], Group->getAlignment(), "wide.vec");
2071 
2072  for (unsigned i = 0; i < InterleaveFactor; i++) {
2073  Instruction *Member = Group->getMember(i);
2074 
2075  // Skip the gaps in the group.
2076  if (!Member)
2077  continue;
2078 
2079  Constant *StrideMask = getStridedMask(Builder, i, InterleaveFactor, VF);
2080  Value *StridedVec = Builder.CreateShuffleVector(
2081  NewLoadInstr, UndefVec, StrideMask, "strided.vec");
2082 
2083  // If this member has a different type, cast the result to that type.
2084  if (Member->getType() != ScalarTy) {
2085  VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2086  StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);
2087  }
2088 
2089  VectorParts &Entry = WidenMap.get(Member);
2090  Entry[Part] =
2091  Group->isReverse() ? reverseVector(StridedVec) : StridedVec;
2092  }
2093 
2094  propagateMetadata(NewLoadInstr, Instr);
2095  }
2096  return;
2097  }
2098 
2099  // The sub vector type for current instruction.
2100  VectorType *SubVT = VectorType::get(ScalarTy, VF);
2101 
2102  // Vectorize the interleaved store group.
2103  for (unsigned Part = 0; Part < UF; Part++) {
2104  // Collect the stored vector from each member.
2105  SmallVector<Value *, 4> StoredVecs;
2106  for (unsigned i = 0; i < InterleaveFactor; i++) {
2107  // An interleaved store group doesn't allow a gap, so each index has a member.
2108  Instruction *Member = Group->getMember(i);
2109  assert(Member && "Failed to get a member from an interleaved store group");
2110 
2111  Value *StoredVec =
2112  getVectorValue(dyn_cast<StoreInst>(Member)->getValueOperand())[Part];
2113  if (Group->isReverse())
2114  StoredVec = reverseVector(StoredVec);
2115 
2116  // If this member has a different type, cast it to a unified type.
2117  if (StoredVec->getType() != SubVT)
2118  StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT);
2119 
2120  StoredVecs.push_back(StoredVec);
2121  }
2122 
2123  // Concatenate all vectors into a wide vector.
2124  Value *WideVec = ConcatenateVectors(Builder, StoredVecs);
2125 
2126  // Interleave the elements in the wide vector.
2127  Constant *IMask = getInterleavedMask(Builder, VF, InterleaveFactor);
2128  Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2129  "interleaved.vec");
2130 
2131  Instruction *NewStoreInstr =
2132  Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
2133  propagateMetadata(NewStoreInstr, Instr);
2134  }
2135 }
2136 
2137 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
2138  // Attempt to issue a wide load.
2139  LoadInst *LI = dyn_cast<LoadInst>(Instr);
2140  StoreInst *SI = dyn_cast<StoreInst>(Instr);
2141 
2142  assert((LI || SI) && "Invalid Load/Store instruction");
2143 
2144  // Try to vectorize the interleave group if this access is interleaved.
2145  if (Legal->isAccessInterleaved(Instr))
2146  return vectorizeInterleaveGroup(Instr);
2147 
2148  Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
2149  Type *DataTy = VectorType::get(ScalarDataTy, VF);
2150  Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
2151  unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
2152  // An alignment of 0 means target ABI alignment. We need to use the scalar's
2153  // target ABI alignment in such a case.
2154  const DataLayout &DL = Instr->getModule()->getDataLayout();
2155  if (!Alignment)
2156  Alignment = DL.getABITypeAlignment(ScalarDataTy);
2157  unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2158  unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy);
2159  unsigned VectorElementSize = DL.getTypeStoreSize(DataTy) / VF;
2160 
2161  if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
2162  !Legal->isMaskRequired(SI))
2163  return scalarizeInstruction(Instr, true);
2164 
2165  if (ScalarAllocatedSize != VectorElementSize)
2166  return scalarizeInstruction(Instr);
2167 
2168  // If the pointer is loop invariant or if it is non-consecutive,
2169  // scalarize the load.
2170  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
2171  bool Reverse = ConsecutiveStride < 0;
2172  bool UniformLoad = LI && Legal->isUniform(Ptr);
2173  if (!ConsecutiveStride || UniformLoad)
2174  return scalarizeInstruction(Instr);
2175 
2176  Constant *Zero = Builder.getInt32(0);
2177  VectorParts &Entry = WidenMap.get(Instr);
2178 
2179  // Handle consecutive loads/stores.
2180  GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
2181  if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
2182  setDebugLocFromInst(Builder, Gep);
2183  Value *PtrOperand = Gep->getPointerOperand();
2184  Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
2185  FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
2186 
2187  // Create the new GEP with the new induction variable.
2188  GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
2189  Gep2->setOperand(0, FirstBasePtr);
2190  Gep2->setName("gep.indvar.base");
2191  Ptr = Builder.Insert(Gep2);
2192  } else if (Gep) {
2193  setDebugLocFromInst(Builder, Gep);
2194  assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
2195  OrigLoop) && "Base ptr must be invariant");
2196 
2197  // The last index does not have to be the induction. It can be
2198  // consecutive and be a function of the index. For example A[I+1];
2199  unsigned NumOperands = Gep->getNumOperands();
2200  unsigned InductionOperand = getGEPInductionOperand(Gep);
2201  // Create the new GEP with the new induction variable.
2202  GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
2203 
2204  for (unsigned i = 0; i < NumOperands; ++i) {
2205  Value *GepOperand = Gep->getOperand(i);
2206  Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);
2207 
2208  // Update last index or loop invariant instruction anchored in loop.
2209  if (i == InductionOperand ||
2210  (GepOperandInst && OrigLoop->contains(GepOperandInst))) {
2211  assert((i == InductionOperand ||
2212  SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&
2213  "Must be last index or loop invariant");
2214 
2215  VectorParts &GEPParts = getVectorValue(GepOperand);
2216  Value *Index = GEPParts[0];
2217  Index = Builder.CreateExtractElement(Index, Zero);
2218  Gep2->setOperand(i, Index);
2219  Gep2->setName("gep.indvar.idx");
2220  }
2221  }
2222  Ptr = Builder.Insert(Gep2);
2223  } else {
2224  // Use the induction element ptr.
2225  assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
2226  setDebugLocFromInst(Builder, Ptr);
2227  VectorParts &PtrVal = getVectorValue(Ptr);
2228  Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
2229  }
2230 
2231  VectorParts Mask = createBlockInMask(Instr->getParent());
2232  // Handle Stores:
2233  if (SI) {
2234  assert(!Legal->isUniform(SI->getPointerOperand()) &&
2235  "We do not allow storing to uniform addresses");
2236  setDebugLocFromInst(Builder, SI);
2237  // We don't want to update the value in the map as it might be used in
2238  // another expression. So don't use a reference type for "StoredVal".
2239  VectorParts StoredVal = getVectorValue(SI->getValueOperand());
2240 
2241  for (unsigned Part = 0; Part < UF; ++Part) {
2242  // Calculate the pointer for the specific unroll-part.
2243  Value *PartPtr =
2244  Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
2245 
2246  if (Reverse) {
2247  // If we store to reverse consecutive memory locations then we need
2248  // to reverse the order of elements in the stored value.
2249  StoredVal[Part] = reverseVector(StoredVal[Part]);
2250  // If the address is consecutive but reversed, then the
2251  // wide store needs to start at the last vector element.
2252  PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
2253  PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
2254  Mask[Part] = reverseVector(Mask[Part]);
2255  }
2256 
2257  Value *VecPtr = Builder.CreateBitCast(PartPtr,
2258  DataTy->getPointerTo(AddressSpace));
2259 
2260  Instruction *NewSI;
2261  if (Legal->isMaskRequired(SI))
2262  NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
2263  Mask[Part]);
2264  else
2265  NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
2266  propagateMetadata(NewSI, SI);
2267  }
2268  return;
2269  }
2270 
2271  // Handle loads.
2272  assert(LI && "Must have a load instruction");
2273  setDebugLocFromInst(Builder, LI);
2274  for (unsigned Part = 0; Part < UF; ++Part) {
2275  // Calculate the pointer for the specific unroll-part.
2276  Value *PartPtr =
2277  Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
2278 
2279  if (Reverse) {
2280  // If the address is consecutive but reversed, then the
2281  // wide load needs to start at the last vector element.
2282  PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
2283  PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
2284  Mask[Part] = reverseVector(Mask[Part]);
2285  }
2286 
2287  Instruction* NewLI;
2288  Value *VecPtr = Builder.CreateBitCast(PartPtr,
2289  DataTy->getPointerTo(AddressSpace));
2290  if (Legal->isMaskRequired(LI))
2291  NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2292  UndefValue::get(DataTy),
2293  "wide.masked.load");
2294  else
2295  NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
2296  propagateMetadata(NewLI, LI);
2297  Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
2298  }
2299 }
2300 
2301 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) {
2302  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2303  // Holds vector parameters or scalars, in case of uniform vals.
2304  SmallVector<VectorParts, 4> Params;
2305 
2306  setDebugLocFromInst(Builder, Instr);
2307 
2308  // Find all of the vectorized parameters.
2309  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2310  Value *SrcOp = Instr->getOperand(op);
2311 
2312  // If we are accessing the old induction variable, use the new one.
2313  if (SrcOp == OldInduction) {
2314  Params.push_back(getVectorValue(SrcOp));
2315  continue;
2316  }
2317 
2318  // Try using previously calculated values.
2319  Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
2320 
2321  // If the src is an instruction that appeared earlier in the basic block
2322  // then it should already be vectorized.
2323  if (SrcInst && OrigLoop->contains(SrcInst)) {
2324  assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
2325  // The parameter is a vector value from earlier.
2326  Params.push_back(WidenMap.get(SrcInst));
2327  } else {
2328  // The parameter is a scalar from outside the loop. Maybe even a constant.
2329  VectorParts Scalars;
2330  Scalars.append(UF, SrcOp);
2331  Params.push_back(Scalars);
2332  }
2333  }
2334 
2335  assert(Params.size() == Instr->getNumOperands() &&
2336  "Invalid number of operands");
2337 
2338  // Does this instruction return a value ?
2339  bool IsVoidRetTy = Instr->getType()->isVoidTy();
2340 
2341  Value *UndefVec = IsVoidRetTy ? nullptr :
2342  UndefValue::get(VectorType::get(Instr->getType(), VF));
2343  // Create a new entry in the WidenMap and initialize it to Undef or Null.
2344  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
2345 
2346  Instruction *InsertPt = Builder.GetInsertPoint();
2347  BasicBlock *IfBlock = Builder.GetInsertBlock();
2348  BasicBlock *CondBlock = nullptr;
2349 
2350  VectorParts Cond;
2351  Loop *VectorLp = nullptr;
2352  if (IfPredicateStore) {
2353  assert(Instr->getParent()->getSinglePredecessor() &&
2354  "Only support single predecessor blocks");
2355  Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
2356  Instr->getParent());
2357  VectorLp = LI->getLoopFor(IfBlock);
2358  assert(VectorLp && "Must have a loop for this block");
2359  }
2360 
2361  // For each vector unroll 'part':
2362  for (unsigned Part = 0; Part < UF; ++Part) {
2363  // For each scalar that we create:
2364  for (unsigned Width = 0; Width < VF; ++Width) {
2365 
2366  // Start if-block.
2367  Value *Cmp = nullptr;
2368  if (IfPredicateStore) {
2369  Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
2370  Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1));
2371  CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
2372  LoopVectorBody.push_back(CondBlock);
2373  VectorLp->addBasicBlockToLoop(CondBlock, *LI);
2374  // Update Builder with newly created basic block.
2375  Builder.SetInsertPoint(InsertPt);
2376  }
2377 
2378  Instruction *Cloned = Instr->clone();
2379  if (!IsVoidRetTy)
2380  Cloned->setName(Instr->getName() + ".cloned");
2381  // Replace the operands of the cloned instructions with extracted scalars.
2382  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2383  Value *Op = Params[op][Part];
2384  // Param is a vector. Need to extract the right lane.
2385  if (Op->getType()->isVectorTy())
2386  Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
2387  Cloned->setOperand(op, Op);
2388  }
2389 
2390  // Place the cloned scalar in the new loop.
2391  Builder.Insert(Cloned);
2392 
2393  // If the original scalar returns a value we need to place it in a vector
2394  // so that future users will be able to use it.
2395  if (!IsVoidRetTy)
2396  VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
2397  Builder.getInt32(Width));
2398  // End if-block.
2399  if (IfPredicateStore) {
2400  BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
2401  LoopVectorBody.push_back(NewIfBlock);
2402  VectorLp->addBasicBlockToLoop(NewIfBlock, *LI);
2403  Builder.SetInsertPoint(InsertPt);
2404  ReplaceInstWithInst(IfBlock->getTerminator(),
2405  BranchInst::Create(CondBlock, NewIfBlock, Cmp));
2406  IfBlock = NewIfBlock;
2407  }
2408  }
2409  }
2410 }
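// A hedged sketch of the emitted control flow (editorial, not from the
// source): for VF = 2, UF = 1 and a predicated store, the per-lane loop
// above produces a block chain of the form:
//
//   vector.body:  %cmp0 = icmp eq i1 (extract mask lane 0), true
//                 br i1 %cmp0, label %cond.store, label %else
//   cond.store:   cloned scalar store for lane 0
//   else:         repeat the extract/compare/branch for lane 1 ...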
2411 
2412 static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
2413  Instruction *Loc) {
2414  if (FirstInst)
2415  return FirstInst;
2416  if (Instruction *I = dyn_cast<Instruction>(V))
2417  return I->getParent() == Loc->getParent() ? I : nullptr;
2418  return nullptr;
2419 }
2420 
2421 std::pair<Instruction *, Instruction *>
2422 InnerLoopVectorizer::addStrideCheck(Instruction *Loc) {
2423  Instruction *tnullptr = nullptr;
2424  if (!Legal->mustCheckStrides())
2425  return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);
2426 
2427  IRBuilder<> ChkBuilder(Loc);
2428 
2429  // Emit checks.
2430  Value *Check = nullptr;
2431  Instruction *FirstInst = nullptr;
2432  for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(),
2433  SE = Legal->strides_end();
2434  SI != SE; ++SI) {
2435  Value *Ptr = stripIntegerCast(*SI);
2436  Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1),
2437  "stride.chk");
2438  // Store the first instruction we create.
2439  FirstInst = getFirstInst(FirstInst, C, Loc);
2440  if (Check)
2441  Check = ChkBuilder.CreateOr(Check, C);
2442  else
2443  Check = C;
2444  }
2445 
2446  // We have to do this trickery because the IRBuilder might fold the check to a
2447  // constant expression in which case there is no Instruction anchored in
2448  // the block.
2449  LLVMContext &Ctx = Loc->getContext();
2450  Instruction *TheCheck =
2451  BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx));
2452  ChkBuilder.Insert(TheCheck, "stride.not.one");
2453  FirstInst = getFirstInst(FirstInst, TheCheck, Loc);
2454 
2455  return std::make_pair(FirstInst, TheCheck);
2456 }
2457 
2458 void InnerLoopVectorizer::createEmptyLoop() {
2459  /*
2460  In this function we generate a new loop. The new loop will contain
2461  the vectorized instructions while the old loop will continue to run the
2462  scalar remainder.
2463 
2464  [ ] <-- Back-edge taken count overflow check.
2465  / |
2466  / v
2467  | [ ] <-- vector loop bypass (may consist of multiple blocks).
2468  | / |
2469  | / v
2470  || [ ] <-- vector pre header.
2471  || |
2472  || v
2473  || [ ] \
2474  || [ ]_| <-- vector loop.
2475  || |
2476  | \ v
2477  | >[ ] <--- middle-block.
2478  | / |
2479  | / v
2480  -|- >[ ] <--- new preheader.
2481  | |
2482  | v
2483  | [ ] \
2484  | [ ]_| <-- old scalar loop to handle remainder.
2485  \ |
2486  \ v
2487  >[ ] <-- exit block.
2488  ...
2489  */
2490 
2491  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2492  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2493  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2494  assert(VectorPH && "Invalid loop structure");
2495  assert(ExitBlock && "Must have an exit block");
2496 
2497  // Some loops have a single integer induction variable, while other loops
2498  // don't. One example is c++ iterators that often have multiple pointer
2499  // induction variables. In the code below we also support a case where we
2500  // don't have a single induction variable.
2501  OldInduction = Legal->getInduction();
2502  Type *IdxTy = Legal->getWidestInductionType();
2503 
2504  // Find the loop boundaries.
2505  const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);
2506  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
2507 
2508  // The exit count might have the type of i64 while the phi is i32. This can
2509  // happen if we have an induction variable that is sign extended before the
2510  // compare. The only way that we get a backedge taken count is that the
2511  // induction variable was signed and as such will not overflow. In such a case
2512  // truncation is legal.
2513  if (ExitCount->getType()->getPrimitiveSizeInBits() >
2514  IdxTy->getPrimitiveSizeInBits())
2515  ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);
2516 
2517  const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
2518  // Get the total trip count from the count by adding 1.
2519  ExitCount = SE->getAddExpr(BackedgeTakeCount,
2520  SE->getConstant(BackedgeTakeCount->getType(), 1));
2521 
2522  const DataLayout &DL = OldBasicBlock->getModule()->getDataLayout();
2523 
2524  // Expand the trip count and place the new instructions in the preheader.
2525  // Notice that the pre-header does not change, only the loop body.
2526  SCEVExpander Exp(*SE, DL, "induction");
2527 
2528  // We need to test whether the backedge-taken count is uint##_max. Adding one
2529  // to it will cause overflow and an incorrect loop trip count in the vector
2530  // body. In case of overflow we want to directly jump to the scalar remainder
2531  // loop.
2532  Value *BackedgeCount =
2533  Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(),
2534  VectorPH->getTerminator());
2535  if (BackedgeCount->getType()->isPointerTy())
2536  BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy,
2537  "backedge.ptrcnt.to.int",
2538  VectorPH->getTerminator());
2539  Instruction *CheckBCOverflow =
2540  CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount,
2541  Constant::getAllOnesValue(BackedgeCount->getType()),
2542  "backedge.overflow", VectorPH->getTerminator());
2543 
2544  // The loop index does not have to start at Zero. Find the original start
2545  // value from the induction PHI node. If we don't have an induction variable
2546  // then we know that it starts at zero.
2547  Builder.SetInsertPoint(VectorPH->getTerminator());
2548  Value *StartIdx = ExtendedIdx =
2549  OldInduction
2550  ? Builder.CreateZExt(OldInduction->getIncomingValueForBlock(VectorPH),
2551  IdxTy)
2552  : ConstantInt::get(IdxTy, 0);
2553 
2554  // Count holds the overall loop count (N).
2555  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2556  VectorPH->getTerminator());
2557 
2558  LoopBypassBlocks.push_back(VectorPH);
2559 
2560  // Split the single block loop into the two loop structure described above.
2561  BasicBlock *VecBody =
2562  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2563  BasicBlock *MiddleBlock =
2564  VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2565  BasicBlock *ScalarPH =
2566  MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2567 
2568  // Create and register the new vector loop.
2569  Loop* Lp = new Loop();
2570  Loop *ParentLoop = OrigLoop->getParentLoop();
2571 
2572  // Insert the new loop into the loop nest and register the new basic blocks
2573  // before calling any utilities such as SCEV that require valid LoopInfo.
2574  if (ParentLoop) {
2575  ParentLoop->addChildLoop(Lp);
2576  ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2577  ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2578  } else {
2579  LI->addTopLevelLoop(Lp);
2580  }
2581  Lp->addBasicBlockToLoop(VecBody, *LI);
2582 
2583  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
2584  // inside the loop.
2585  Builder.SetInsertPoint(VecBody->getFirstNonPHI());
2586 
2587  // Generate the induction variable.
2588  setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
2589  Induction = Builder.CreatePHI(IdxTy, 2, "index");
2590  // The loop step is equal to the vectorization factor (num of SIMD elements)
2591  // times the unroll factor (num of SIMD instructions).
2592  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2593 
2594  // Generate code to check that the loop's trip count that we computed by
2595  // adding one to the backedge-taken count will not overflow.
2596  BasicBlock *NewVectorPH =
2597  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "overflow.checked");
2598  if (ParentLoop)
2599  ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI);
2600  ReplaceInstWithInst(
2601  VectorPH->getTerminator(),
2602  BranchInst::Create(ScalarPH, NewVectorPH, CheckBCOverflow));
2603  VectorPH = NewVectorPH;
2604 
2605  // This is the IR builder that we use to add all of the logic for bypassing
2606  // the new vector loop.
2607  IRBuilder<> BypassBuilder(VectorPH->getTerminator());
2608  setDebugLocFromInst(BypassBuilder,
2609  getDebugLocFromInstOrOperands(OldInduction));
2610 
2611  // We may need to extend the index in case there is a type mismatch.
2612  // We know that the count starts at zero and does not overflow.
2613  if (Count->getType() != IdxTy) {
2614  // The exit count can be of pointer type. Convert it to the correct
2615  // integer type.
2616  if (ExitCount->getType()->isPointerTy())
2617  Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
2618  else
2619  Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
2620  }
2621 
2622  // Add the start index to the loop count to get the new end index.
2623  Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");
2624 
2625  // Now we need to generate the expression for N - (N % VF), which is
2626  // the part that the vectorized body will execute.
2627  Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
2628  Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
2629  Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
2630  "end.idx.rnd.down");
2631 
2632  // Now, compare the new count to zero. If it is zero skip the vector loop and
2633  // jump to the scalar loop.
2634  Value *Cmp =
2635  BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero");
2636  NewVectorPH =
2637  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph");
2638  if (ParentLoop)
2639  ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI);
2640  LoopBypassBlocks.push_back(VectorPH);
2641  ReplaceInstWithInst(VectorPH->getTerminator(),
2642  BranchInst::Create(MiddleBlock, NewVectorPH, Cmp));
2643  VectorPH = NewVectorPH;
2644 
2645  // Generate the code to check that the strides we assumed to be one are really
2646  // one. We want the new basic block to start at the first instruction in a
2647  // sequence of instructions that form a check.
2648  Instruction *StrideCheck;
2649  Instruction *FirstCheckInst;
2650  std::tie(FirstCheckInst, StrideCheck) =
2651  addStrideCheck(VectorPH->getTerminator());
2652  if (StrideCheck) {
2653  AddedSafetyChecks = true;
2654  // Create a new block containing the stride check.
2655  VectorPH->setName("vector.stridecheck");
2656  NewVectorPH =
2657  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph");
2658  if (ParentLoop)
2659  ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI);
2660  LoopBypassBlocks.push_back(VectorPH);
2661 
2662  // Replace the branch into the memory check block with a conditional branch
2663  // for the "few elements case".
2665  VectorPH->getTerminator(),
2666  BranchInst::Create(MiddleBlock, NewVectorPH, StrideCheck));
2667 
2668  VectorPH = NewVectorPH;
2669  }
2670 
2671  // Generate the code that checks in runtime if arrays overlap. We put the
2672  // checks into a separate block to make the more common case of few elements
2673  // faster.
2674  Instruction *MemRuntimeCheck;
2675  std::tie(FirstCheckInst, MemRuntimeCheck) =
2676  Legal->getLAI()->addRuntimeCheck(VectorPH->getTerminator());
2677  if (MemRuntimeCheck) {
2678  AddedSafetyChecks = true;
2679  // Create a new block containing the memory check.
2680  VectorPH->setName("vector.memcheck");
2681  NewVectorPH =
2682  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph");
2683  if (ParentLoop)
2684  ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI);
2685  LoopBypassBlocks.push_back(VectorPH);
2686 
2687  // Replace the branch into the memory check block with a conditional branch
2688  // for the "few elements case".
2690  VectorPH->getTerminator(),
2691  BranchInst::Create(MiddleBlock, NewVectorPH, MemRuntimeCheck));
2692 
2693  VectorPH = NewVectorPH;
2694  }
2695 
2696  // We are going to resume the execution of the scalar loop.
2697  // Go over all of the induction variables that we found and fix the
2698  // PHIs that are left in the scalar version of the loop.
2699  // The starting values of PHI nodes depend on the counter of the last
2700  // iteration in the vectorized loop.
2701  // If we come from a bypass edge then we need to start from the original
2702  // start value.
2703 
2704  // This variable saves the new starting index for the scalar loop.
2705  PHINode *ResumeIndex = nullptr;
2706  LoopVectorizationLegality::InductionList::iterator I, E;
2707  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
2708  // Set builder to point to last bypass block.
2709  BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
2710  for (I = List->begin(), E = List->end(); I != E; ++I) {
2711  PHINode *OrigPhi = I->first;
2712  LoopVectorizationLegality::InductionInfo II = I->second;
2713 
2714  Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType();
2715  PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val",
2716  MiddleBlock->getTerminator());
2717  // We might have extended the type of the induction variable but we need a
2718  // truncated version for the scalar loop.
2719  PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?
2720  PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
2721  MiddleBlock->getTerminator()) : nullptr;
2722 
2723  // Create phi nodes to merge from the backedge-taken check block.
2724  PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val",
2725  ScalarPH->getTerminator());
2726  BCResumeVal->addIncoming(ResumeVal, MiddleBlock);
2727 
2728  PHINode *BCTruncResumeVal = nullptr;
2729  if (OrigPhi == OldInduction) {
2730  BCTruncResumeVal =
2731  PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val",
2732  ScalarPH->getTerminator());
2733  BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock);
2734  }
2735 
2736  Value *EndValue = nullptr;
2737  switch (II.IK) {
2738  case LoopVectorizationLegality::IK_NoInduction:
2739  llvm_unreachable("Unknown induction");
2740  case LoopVectorizationLegality::IK_IntInduction: {
2741  // Handle the integer induction counter.
2742  assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
2743 
2744  // We have the canonical induction variable.
2745  if (OrigPhi == OldInduction) {
2746  // Create a truncated version of the resume value for the scalar loop,
2747  // we might have promoted the type to a larger width.
2748  EndValue =
2749  BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
2750  // The new PHI merges the original incoming value, in case of a bypass,
2751  // or the value at the end of the vectorized loop.
2752  for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
2753  TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
2754  TruncResumeVal->addIncoming(EndValue, VecBody);
2755 
2756  BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
2757 
2758  // We know what the end value is.
2759  EndValue = IdxEndRoundDown;
2760  // We also know which PHI node holds it.
2761  ResumeIndex = ResumeVal;
2762  break;
2763  }
2764 
2765  // Not the canonical induction variable - add the vector loop count to the
2766  // start value.
2767  Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
2768  II.StartValue->getType(),
2769  "cast.crd");
2770  EndValue = II.transform(BypassBuilder, CRD);
2771  EndValue->setName("ind.end");
2772  break;
2773  }
2774  case LoopVectorizationLegality::IK_PtrInduction: {
2775  Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
2776  II.StepValue->getType(),
2777  "cast.crd");
2778  EndValue = II.transform(BypassBuilder, CRD);
2779  EndValue->setName("ptr.ind.end");
2780  break;
2781  }
2782  }// end of case
2783 
2784  // The new PHI merges the original incoming value, in case of a bypass,
2785  // or the value at the end of the vectorized loop.
2786  for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) {
2787  if (OrigPhi == OldInduction)
2788  ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
2789  else
2790  ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
2791  }
2792  ResumeVal->addIncoming(EndValue, VecBody);
2793 
2794  // Fix the scalar body counter (PHI node).
2795  unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
2796 
2797  // The old induction's phi node in the scalar body needs the truncated
2798  // value.
2799  if (OrigPhi == OldInduction) {
2800  BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]);
2801  OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal);
2802  } else {
2803  BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
2804  OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
2805  }
2806  }
2807 
2808  // If we are generating a new induction variable then we also need to
2809  // generate the code that calculates the exit value. This value is not
2810  // simply the end of the counter because we may skip the vectorized body
2811  // in case of a runtime check.
2812  if (!OldInduction){
2813  assert(!ResumeIndex && "Unexpected resume value found");
2814  ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
2815  MiddleBlock->getTerminator());
2816  for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
2817  ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
2818  ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
2819  }
2820 
2821  // Make sure that we found the index where the scalar loop needs to continue.
2822  assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
2823  "Invalid resume Index");
2824 
2825  // Add a check in the middle block to see if we have completed
2826  // all of the iterations in the first vector loop.
2827  // If (N - N%VF) == N, then we *don't* need to run the remainder.
2828  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
2829  ResumeIndex, "cmp.n",
2830  MiddleBlock->getTerminator());
2831  ReplaceInstWithInst(MiddleBlock->getTerminator(),
2832  BranchInst::Create(ExitBlock, ScalarPH, CmpN));
2833 
2834  // Create i+1 and fill the PHINode.
2835  Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
2836  Induction->addIncoming(StartIdx, VectorPH);
2837  Induction->addIncoming(NextIdx, VecBody);
2838  // Create the compare.
2839  Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
2840  Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
2841 
2842  // Now we have two terminators. Remove the old one from the block.
2843  VecBody->getTerminator()->eraseFromParent();
2844 
2845  // Get ready to start creating new instructions into the vectorized body.
2846  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
2847 
2848  // Save the state.
2849  LoopVectorPreHeader = VectorPH;
2850  LoopScalarPreHeader = ScalarPH;
2851  LoopMiddleBlock = MiddleBlock;
2852  LoopExitBlock = ExitBlock;
2853  LoopVectorBody.push_back(VecBody);
2854  LoopScalarBody = OldBasicBlock;
2855 
2856  LoopVectorizeHints Hints(Lp, true);
2857  Hints.setAlreadyVectorized();
2858 }
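// A worked instance of the bypass arithmetic above (editorial, with assumed
// numbers): for VF = 4, UF = 2 (so Step = 8) and a trip count Count = 1003,
// n.mod.vf = 1003 urem 8 = 3 and n.vec = 1000, so the vector body executes
// 1000 / 8 = 125 wide iterations and the remaining 3 iterations run in the
// old scalar loop reached through the middle block.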
2859 
2860 namespace {
2861 struct CSEDenseMapInfo {
2862  static bool canHandle(Instruction *I) {
2863  return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2864  isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2865  }
2866  static inline Instruction *getEmptyKey() {
2867  return DenseMapInfo<Instruction *>::getEmptyKey();
2868  }
2869  static inline Instruction *getTombstoneKey() {
2870  return DenseMapInfo<Instruction *>::getTombstoneKey();
2871  }
2872  static unsigned getHashValue(Instruction *I) {
2873  assert(canHandle(I) && "Unknown instruction!");
2874  return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2875  I->value_op_end()));
2876  }
2877  static bool isEqual(Instruction *LHS, Instruction *RHS) {
2878  if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2879  LHS == getTombstoneKey() || RHS == getTombstoneKey())
2880  return LHS == RHS;
2881  return LHS->isIdenticalTo(RHS);
2882  }
2883 };
2884 }
2885 
2886 /// \brief Check whether this block is a predicated block.
2887 /// Due to if predication of stores we might create a sequence of "if(pred) a[i]
2888 /// = ...; " blocks. We start with one vectorized basic block. For every
2889 /// conditional block we split this vectorized block. Therefore, every second
2890 /// block will be a predicated one.
2891 static bool isPredicatedBlock(unsigned BlockNum) {
2892  return BlockNum % 2;
2893 }
2894 
2895 ///\brief Perform CSE of induction variable instructions.
2896 static void cse(SmallVector<BasicBlock *, 4> &BBs) {
2897  // Perform simple cse.
2898  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2899  for (unsigned i = 0, e = BBs.size(); i != e; ++i) {
2900  BasicBlock *BB = BBs[i];
2901  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
2902  Instruction *In = I++;
2903 
2904  if (!CSEDenseMapInfo::canHandle(In))
2905  continue;
2906 
2907  // Check if we can replace this instruction with any of the
2908  // visited instructions.
2909  if (Instruction *V = CSEMap.lookup(In)) {
2910  In->replaceAllUsesWith(V);
2911  In->eraseFromParent();
2912  continue;
2913  }
2914  // Ignore instructions in conditional blocks. We create "if (pred) a[i] =
2915  // ...;" blocks for predicated stores. Every second block is a predicated
2916  // block.
2917  if (isPredicatedBlock(i))
2918  continue;
2919 
2920  CSEMap[In] = In;
2921  }
2922  }
2923 }
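 // A minimal sketch of what the CSE above removes (hypothetical IR names):
 // after unrolling, the widened address computations for each part are often
 // byte-for-byte identical, e.g.
 //   %g0 = getelementptr i32, i32* %base, i64 %idx
 //   %g1 = getelementptr i32, i32* %base, i64 %idx   ; identical to %g0
 // The second GEP hits the CSEMap (isIdenticalTo returns true), all of its
 // uses are rewritten to %g0, and it is erased.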
2924 
2925 /// \brief Adds a 'fast' flag to floating point operations.
2926 static Value *addFastMathFlag(Value *V) {
2927  if (isa<FPMathOperator>(V)) {
2928  FastMathFlags Flags;
2929  Flags.setUnsafeAlgebra();
2930  cast<Instruction>(V)->setFastMathFlags(Flags);
2931  }
2932  return V;
2933 }
2934 
2935 /// Estimate the overhead of scalarizing a value. Insert and Extract are set if
2936 /// the result needs to be inserted and/or extracted from vectors.
2937 static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
2938  const TargetTransformInfo &TTI) {
2939  if (Ty->isVoidTy())
2940  return 0;
2941 
2942  assert(Ty->isVectorTy() && "Can only scalarize vectors");
2943  unsigned Cost = 0;
2944 
2945  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
2946  if (Insert)
2947  Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, i);
2948  if (Extract)
2949  Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, i);
2950  }
2951 
2952  return Cost;
2953 }
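 // Worked example (illustrative costs, not from any real target): for
 // Ty = <4 x float> with both Insert and Extract set, the loop sums the cost
 // of 4 insertelement and 4 extractelement operations; if each costs 1, the
 // scalarization overhead is 8.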
2954 
2955 // Estimate cost of a call instruction CI if it were vectorized with factor VF.
2956 // Return the cost of the instruction, including scalarization overhead if it's
2957 // needed. The flag NeedToScalarize shows whether the call needs to be
2958 // scalarized - i.e. either a vector version isn't available or it is too expensive.
2959 static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
2960  const TargetTransformInfo &TTI,
2961  const TargetLibraryInfo *TLI,
2962  bool &NeedToScalarize) {
2963  Function *F = CI->getCalledFunction();
2964  StringRef FnName = CI->getCalledFunction()->getName();
2965  Type *ScalarRetTy = CI->getType();
2966  SmallVector<Type *, 4> Tys, ScalarTys;
2967  for (auto &ArgOp : CI->arg_operands())
2968  ScalarTys.push_back(ArgOp->getType());
2969 
2970  // Estimate cost of scalarized vector call. The source operands are assumed
2971  // to be vectors, so we need to extract individual elements from there,
2972  // execute VF scalar calls, and then gather the result into the vector return
2973  // value.
2974  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
2975  if (VF == 1)
2976  return ScalarCallCost;
2977 
2978  // Compute corresponding vector type for return value and arguments.
2979  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
2980  for (unsigned i = 0, ie = ScalarTys.size(); i != ie; ++i)
2981  Tys.push_back(ToVectorTy(ScalarTys[i], VF));
2982 
2983  // Compute costs of unpacking argument values for the scalar calls and
2984  // packing the return values to a vector.
2985  unsigned ScalarizationCost =
2986  getScalarizationOverhead(RetTy, true, false, TTI);
2987  for (unsigned i = 0, ie = Tys.size(); i != ie; ++i)
2988  ScalarizationCost += getScalarizationOverhead(Tys[i], false, true, TTI);
2989 
2990  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
2991 
2992  // If we can't emit a vector call for this function, then the currently found
2993  // cost is the cost we need to return.
2994  NeedToScalarize = true;
2995  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
2996  return Cost;
2997 
2998  // If the corresponding vector cost is cheaper, return its cost.
2999  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3000  if (VectorCallCost < Cost) {
3001  NeedToScalarize = false;
3002  return VectorCallCost;
3003  }
3004  return Cost;
3005 }
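 // Worked example (made-up costs): for a call with VF = 4, a scalar call cost
 // of 10 and a scalarization overhead of 8, Cost = 10 * 4 + 8 = 48. If TLI
 // reports a vectorized variant whose call cost is, say, 20, the function
 // returns 20 and clears NeedToScalarize; otherwise it returns 48 with
 // NeedToScalarize still set.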
3006 
3007 // Estimate cost of an intrinsic call instruction CI if it were vectorized with
3008 // factor VF. Return the cost of the instruction, including scalarization
3009 // overhead if it's needed.
3010 static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
3011  const TargetTransformInfo &TTI,
3012  const TargetLibraryInfo *TLI) {
3013  Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
3014  assert(ID && "Expected intrinsic call!");
3015 
3016  Type *RetTy = ToVectorTy(CI->getType(), VF);
3017  SmallVector<Type *, 4> Tys;
3018  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
3019  Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
3020 
3021  return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
3022 }
3023 
3024 void InnerLoopVectorizer::vectorizeLoop() {
3025  //===------------------------------------------------===//
3026  //
3027  // Notice: any optimization or new instruction that goes
3028  // into the code below should also be implemented in
3029  // the cost-model.
3030  //
3031  //===------------------------------------------------===//
3032  Constant *Zero = Builder.getInt32(0);
3033 
3034  // In order to support reduction variables we need to be able to vectorize
3035  // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
3036  // stages. First, we create a new vector PHI node with no incoming edges.
3037  // We use this value when we vectorize all of the instructions that use the
3038  // PHI. Next, after all of the instructions in the block are complete we
3039  // add the new incoming edges to the PHI. At this point all of the
3040  // instructions in the basic block are vectorized, so we can use them to
3041  // construct the PHI.
3042  PhiVector RdxPHIsToFix;
3043 
3044  // Scan the loop in a topological order to ensure that defs are vectorized
3045  // before users.
3046  LoopBlocksDFS DFS(OrigLoop);
3047  DFS.perform(LI);
3048 
3049  // Vectorize all of the blocks in the original loop.
3050  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
3051  be = DFS.endRPO(); bb != be; ++bb)
3052  vectorizeBlockInLoop(*bb, &RdxPHIsToFix);
3053 
3054  // At this point every instruction in the original loop is widened to
3055  // a vector form. We are almost done. Now, we need to fix the PHI nodes
3056  // that we vectorized. The PHI nodes are currently empty because we did
3057  // not want to introduce cycles. Notice that the remaining PHI nodes
3058  // that we need to fix are reduction variables.
3059 
3060  // Create the 'reduced' values for each of the reduction variables.
3061  // The reduced values are the vector values that we scalarize and combine
3062  // after the loop is finished.
3063  for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
3064  it != e; ++it) {
3065  PHINode *RdxPhi = *it;
3066  assert(RdxPhi && "Unable to recover vectorized PHI");
3067 
3068  // Find the reduction variable descriptor.
3069  assert(Legal->getReductionVars()->count(RdxPhi) &&
3070  "Unable to find the reduction variable");
3071  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi];
3072 
3073  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3074  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3075  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3076  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3077  RdxDesc.getMinMaxRecurrenceKind();
3078  setDebugLocFromInst(Builder, ReductionStartValue);
3079 
3080  // We need to generate a reduction vector from the incoming scalar.
3081  // To do so, we need to generate the 'identity' vector and override
3082  // one of the elements with the incoming scalar reduction. We need
3083  // to do it in the vector-loop preheader.
3084  Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
3085 
3086  // This is the vector-clone of the value that leaves the loop.
3087  VectorParts &VectorExit = getVectorValue(LoopExitInst);
3088  Type *VecTy = VectorExit[0]->getType();
3089 
3090  // Find the reduction identity value: zero for addition, or and xor;
3091  // one for multiplication; -1 (all ones) for and.
3092  Value *Identity;
3093  Value *VectorStart;
3094  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3095  RK == RecurrenceDescriptor::RK_FloatMinMax) {
3096  // MinMax reductions have the start value as their identity.
3097  if (VF == 1) {
3098  VectorStart = Identity = ReductionStartValue;
3099  } else {
3100  VectorStart = Identity =
3101  Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3102  }
3103  } else {
3104  // Handle other reduction kinds:
3105  Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3106  RK, VecTy->getScalarType());
3107  if (VF == 1) {
3108  Identity = Iden;
3109  // This vector is the Identity vector where the first element is the
3110  // incoming scalar reduction.
3111  VectorStart = ReductionStartValue;
3112  } else {
3113  Identity = ConstantVector::getSplat(VF, Iden);
3114 
3115  // This vector is the Identity vector where the first element is the
3116  // incoming scalar reduction.
3117  VectorStart =
3118  Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3119  }
3120  }
3121 
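 // Illustrative IR for an integer add reduction with VF = 4 and scalar start
 // value %s (names hypothetical): Identity is the constant splat
 // <4 x i32> zeroinitializer, and VectorStart puts %s into lane 0 so that
 // only the first lane carries the initial sum:
 //   %rdx.start = insertelement <4 x i32> zeroinitializer, i32 %s, i32 0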
3122  // Fix the vector-loop phi.
3123 
3124  // Reductions do not have to start at zero. They can start with
3125  // any loop invariant values.
3126  VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
3127  BasicBlock *Latch = OrigLoop->getLoopLatch();
3128  Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
3129  VectorParts &Val = getVectorValue(LoopVal);
3130  for (unsigned part = 0; part < UF; ++part) {
3131  // Make sure to add the reduction start value only to the
3132  // first unroll part.
3133  Value *StartVal = (part == 0) ? VectorStart : Identity;
3134  cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal,
3135  LoopVectorPreHeader);
3136  cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
3137  LoopVectorBody.back());
3138  }
3139 
3140  // Before each round, move the insertion point right between
3141  // the PHIs and the values we are going to write.
3142  // This allows us to write both PHINodes and the extractelement
3143  // instructions.
3144  Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
3145 
3146  VectorParts RdxParts;
3147  setDebugLocFromInst(Builder, LoopExitInst);
3148  for (unsigned part = 0; part < UF; ++part) {
3149  // This PHINode contains the vectorized reduction variable, or
3150  // the initial value vector, if we bypass the vector loop.
3151  VectorParts &RdxExitVal = getVectorValue(LoopExitInst);
3152  PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
3153  Value *StartVal = (part == 0) ? VectorStart : Identity;
3154  for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
3155  NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
3156  NewPhi->addIncoming(RdxExitVal[part],
3157  LoopVectorBody.back());
3158  RdxParts.push_back(NewPhi);
3159  }
3160 
3161  // Reduce all of the unrolled parts into a single vector.
3162  Value *ReducedPartRdx = RdxParts[0];
3163  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3164  setDebugLocFromInst(Builder, ReducedPartRdx);
3165  for (unsigned part = 1; part < UF; ++part) {
3166  if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3167  // Floating point operations had to be 'fast' to enable the reduction.
3168  ReducedPartRdx = addFastMathFlag(
3169  Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
3170  ReducedPartRdx, "bin.rdx"));
3171  else
3172  ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
3173  Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]);
3174  }
3175 
3176  if (VF > 1) {
3177  // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
3178  // and vector ops, reducing the set of values being computed by half each
3179  // round.
3180  assert(isPowerOf2_32(VF) &&
3181  "Reduction emission only supported for pow2 vectors!");
3182  Value *TmpVec = ReducedPartRdx;
3183  SmallVector<Constant*, 32> ShuffleMask(VF, nullptr);
3184  for (unsigned i = VF; i != 1; i >>= 1) {
3185  // Move the upper half of the vector to the lower half.
3186  for (unsigned j = 0; j != i/2; ++j)
3187  ShuffleMask[j] = Builder.getInt32(i/2 + j);
3188 
3189  // Fill the rest of the mask with undef.
3190  std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
3191  UndefValue::get(Builder.getInt32Ty()));
3192 
3193  Value *Shuf =
3194  Builder.CreateShuffleVector(TmpVec,
3195  UndefValue::get(TmpVec->getType()),
3196  ConstantVector::get(ShuffleMask),
3197  "rdx.shuf");
3198 
3199  if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3200  // Floating point operations had to be 'fast' to enable the reduction.
3201  TmpVec = addFastMathFlag(Builder.CreateBinOp(
3202  (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
3203  else
3204  TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
3205  TmpVec, Shuf);
3206  }
3207 
3208  // The result is in the first element of the vector.
3209  ReducedPartRdx = Builder.CreateExtractElement(TmpVec,
3210  Builder.getInt32(0));
3211  }
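 // Illustration for VF = 4: the first round shuffles with mask
 // <2, 3, undef, undef> and combines lanes {0,2} and {1,3}; the second round
 // shuffles with mask <1, undef, undef, undef> and combines the remaining
 // pair, leaving the reduced value in lane 0 - log2(4) = 2 rounds in total.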
3212 
3213  // Create a phi node that merges control-flow from the backedge-taken check
3214  // block and the middle block.
3215  PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
3216  LoopScalarPreHeader->getTerminator());
3217  BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[0]);
3218  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3219 
3220  // Now, we need to fix the users of the reduction variable
3221  // inside and outside of the scalar remainder loop.
3222  // We know that the loop is in LCSSA form. We need to update the
3223  // PHI nodes in the exit blocks.
3224  for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
3225  LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
3226  PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
3227  if (!LCSSAPhi) break;
3228 
3229  // All PHINodes need to have a single entry edge, or two if
3230  // we already fixed them.
3231  assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3232 
3233  // We found our reduction value exit-PHI. Update it with the
3234  // incoming bypass edge.
3235  if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) {
3236  // Add an edge coming from the bypass.
3237  LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3238  break;
3239  }
3240  }// end of the LCSSA phi scan.
3241 
3242  // Fix the scalar loop reduction variable with the incoming reduction sum
3243  // from the vector body and from the backedge value.
3244  int IncomingEdgeBlockIdx =
3245  (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
3246  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3247  // Pick the other block.
3248  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3249  (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3250  (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3251  }// end of for each redux variable.
3252 
3253  fixLCSSAPHIs();
3254 
3255  // Remove redundant induction instructions.
3256  cse(LoopVectorBody);
3257 }
3258 
3259 void InnerLoopVectorizer::fixLCSSAPHIs() {
3260  for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
3261  LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
3262  PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
3263  if (!LCSSAPhi) break;
3264  if (LCSSAPhi->getNumIncomingValues() == 1)
3265  LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
3266  LoopMiddleBlock);
3267  }
3268 }
3269 
3270 InnerLoopVectorizer::VectorParts
3271 InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
3272  assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
3273  "Invalid edge");
3274 
3275  // Look for cached value.
3276  std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst);
3277  EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge);
3278  if (ECEntryIt != MaskCache.end())
3279  return ECEntryIt->second;
3280 
3281  VectorParts SrcMask = createBlockInMask(Src);
3282 
3283  // The terminator has to be a branch inst!
3284  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
3285  assert(BI && "Unexpected terminator found");
3286 
3287  if (BI->isConditional()) {
3288  VectorParts EdgeMask = getVectorValue(BI->getCondition());
3289 
3290  if (BI->getSuccessor(0) != Dst)
3291  for (unsigned part = 0; part < UF; ++part)
3292  EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
3293 
3294  for (unsigned part = 0; part < UF; ++part)
3295  EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
3296 
3297  MaskCache[Edge] = EdgeMask;
3298  return EdgeMask;
3299  }
3300 
3301  MaskCache[Edge] = SrcMask;
3302  return SrcMask;
3303 }
3304 
3305 InnerLoopVectorizer::VectorParts
3306 InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
3307  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
3308 
3309  // Loop incoming mask is all-one.
3310  if (OrigLoop->getHeader() == BB) {
3311  Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
3312  return getVectorValue(C);
3313  }
3314 
3315  // This is the block mask. We OR all incoming edge masks, starting with zero.
3316  Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
3317  VectorParts BlockMask = getVectorValue(Zero);
3318 
3319  // For each pred:
3320  for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
3321  VectorParts EM = createEdgeMask(*it, BB);
3322  for (unsigned part = 0; part < UF; ++part)
3323  BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
3324  }
3325 
3326  return BlockMask;
3327 }
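 // Sketch of the masks for a simple diamond "if (c) { T } else { F }" inside
 // the loop: mask(header->T) = c, mask(header->F) = !c, and the join block's
 // block-in mask is the OR of its two incoming edge masks, i.e. c | !c.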
3328 
3329 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
3330  InnerLoopVectorizer::VectorParts &Entry,
3331  unsigned UF, unsigned VF, PhiVector *PV) {
3332  PHINode* P = cast<PHINode>(PN);
3333  // Handle reduction variables:
3334  if (Legal->getReductionVars()->count(P)) {
3335  for (unsigned part = 0; part < UF; ++part) {
3336  // This is phase one of vectorizing PHIs.
3337  Type *VecTy = (VF == 1) ? PN->getType() :
3338  VectorType::get(PN->getType(), VF);
3339  Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
3340  LoopVectorBody.back()-> getFirstInsertionPt());
3341  }
3342  PV->push_back(P);
3343  return;
3344  }
3345 
3346  setDebugLocFromInst(Builder, P);
3347  // Check for PHI nodes that are lowered to vector selects.
3348  if (P->getParent() != OrigLoop->getHeader()) {
3349  // We know that all PHIs in non-header blocks are converted into
3350  // selects, so we don't have to worry about the insertion order and we
3351  // can just use the builder.
3352  // At this point we generate the predication tree. There may be
3353  // duplications since this is a simple recursive scan, but future
3354  // optimizations will clean it up.
3355 
3356  unsigned NumIncoming = P->getNumIncomingValues();
3357 
3358  // Generate a sequence of selects of the form:
3359  // SELECT(Mask3, In3,
3360  // SELECT(Mask2, In2,
3361  // ( ...)))
3362  for (unsigned In = 0; In < NumIncoming; In++) {
3363  VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
3364  P->getParent());
3365  VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
3366 
3367  for (unsigned part = 0; part < UF; ++part) {
3368  // We might have single edge PHIs (blocks) - use an identity
3369  // 'select' for the first PHI operand.
3370  if (In == 0)
3371  Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
3372  In0[part]);
3373  else
3374  // Select between the current value and the previous incoming edge
3375  // based on the incoming mask.
3376  Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
3377  Entry[part], "predphi");
3378  }
3379  }
3380  return;
3381  }
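 // Illustrative result (VF = 4, hypothetical names): a two-input join PHI
 //   %p = phi i32 [ %t, %then ], [ %f, %else ]
 // becomes a mask-driven select per unroll part:
 //   %p.vec = select <4 x i1> %mask.else, <4 x i32> %f.vec, <4 x i32> %t.vec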
3382 
3383  // This PHINode must be an induction variable.
3384  // Make sure that we know about it.
3385  assert(Legal->getInductionVars()->count(P) &&
3386  "Not an induction variable");
3387 
3388  LoopVectorizationLegality::InductionInfo II =
3389  Legal->getInductionVars()->lookup(P);
3390 
3391  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
3392  // which can be found from the original scalar operations.
3393  switch (II.IK) {
3394  case LoopVectorizationLegality::IK_NoInduction:
3395  llvm_unreachable("Unknown induction");
3396  case LoopVectorizationLegality::IK_IntInduction: {
3397  assert(P->getType() == II.StartValue->getType() && "Types must match");
3398  Type *PhiTy = P->getType();
3399  Value *Broadcasted;
3400  if (P == OldInduction) {
3401  // Handle the canonical induction variable. We might have had to
3402  // extend the type.
3403  Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
3404  } else {
3405  // Handle other induction variables that are now based on the
3406  // canonical one.
3407  Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
3408  "normalized.idx");
3409  NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
3410  Broadcasted = II.transform(Builder, NormalizedIdx);
3411  Broadcasted->setName("offset.idx");
3412  }
3413  Broadcasted = getBroadcastInstrs(Broadcasted);
3414  // After broadcasting the induction variable we need to make the vector
3415  // consecutive by adding 0, 1, 2, etc.
3416  for (unsigned part = 0; part < UF; ++part)
3417  Entry[part] = getStepVector(Broadcasted, VF * part, II.StepValue);
3418  return;
3419  }
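 // Illustration (VF = 4, UF = 2, step 1): the broadcast of the scalar
 // induction value i yields <i, i, i, i>; getStepVector then adds
 // <0, 1, 2, 3> for part 0 and <4, 5, 6, 7> for part 1, producing the
 // per-lane induction values for the whole unrolled iteration.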
3420  case LoopVectorizationLegality::IK_PtrInduction:
3421  // Handle the pointer induction variable case.
3422  assert(P->getType()->isPointerTy() && "Unexpected type.");
3423  // This is the normalized GEP that starts counting at zero.
3424  Value *NormalizedIdx =
3425  Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx");
3426  NormalizedIdx =
3427  Builder.CreateSExtOrTrunc(NormalizedIdx, II.StepValue->getType());
3428  // This is the vector of results. Notice that we don't generate
3429  // vector geps because scalar geps result in better code.
3430  for (unsigned part = 0; part < UF; ++part) {
3431  if (VF == 1) {
3432  int EltIndex = part;
3433  Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex);
3434  Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx);
3435  Value *SclrGep = II.transform(Builder, GlobalIdx);
3436  SclrGep->setName("next.gep");
3437  Entry[part] = SclrGep;
3438  continue;
3439  }
3440 
3441  Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
3442  for (unsigned int i = 0; i < VF; ++i) {
3443  int EltIndex = i + part * VF;
3444  Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex);
3445  Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx);
3446  Value *SclrGep = II.transform(Builder, GlobalIdx);
3447  SclrGep->setName("next.gep");
3448  VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
3449  Builder.getInt32(i),
3450  "insert.gep");
3451  }
3452  Entry[part] = VecVal;
3453  }
3454  return;
3455  }
3456 }
3457 
3458 void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
3459  // For each instruction in the old loop.
3460  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
3461  VectorParts &Entry = WidenMap.get(it);
3462  switch (it->getOpcode()) {
3463  case Instruction::Br:
3464  // Nothing to do for PHIs and BR, since we already took care of the
3465  // loop control flow instructions.
3466  continue;
3467  case Instruction::PHI: {
3468  // Vectorize PHINodes.
3469  widenPHIInstruction(it, Entry, UF, VF, PV);
3470  continue;
3471  }// End of PHI.
3472 
3473  case Instruction::Add:
3474  case Instruction::FAdd:
3475  case Instruction::Sub:
3476  case Instruction::FSub:
3477  case Instruction::Mul:
3478  case Instruction::FMul:
3479  case Instruction::UDiv:
3480  case Instruction::SDiv:
3481  case Instruction::FDiv:
3482  case Instruction::URem:
3483  case Instruction::SRem:
3484  case Instruction::FRem:
3485  case Instruction::Shl:
3486  case Instruction::LShr:
3487  case Instruction::AShr:
3488  case Instruction::And:
3489  case Instruction::Or:
3490  case Instruction::Xor: {
3491  // Just widen binops.
3492  BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
3493  setDebugLocFromInst(Builder, BinOp);
3494  VectorParts &A = getVectorValue(it->getOperand(0));
3495  VectorParts &B = getVectorValue(it->getOperand(1));
3496 
3497  // Use this vector value for all users of the original instruction.
3498  for (unsigned Part = 0; Part < UF; ++Part) {
3499  Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
3500 
3501  if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
3502  VecOp->copyIRFlags(BinOp);
3503 
3504  Entry[Part] = V;
3505  }
3506 
3507  propagateMetadata(Entry, it);
3508  break;
3509  }
3510  case Instruction::Select: {
3511  // Widen selects.
3512  // If the selector is loop invariant we can create a select
3513  // instruction with a scalar condition. Otherwise, use vector-select.
3514  bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
3515  OrigLoop);
3516  setDebugLocFromInst(Builder, it);
3517 
3518  // The condition can be loop invariant but still defined inside the
3519  // loop. This means that we can't just use the original 'cond' value.
3520  // We have to take the 'vectorized' value and pick the first lane.
3521  // Instcombine will make this a no-op.
3522  VectorParts &Cond = getVectorValue(it->getOperand(0));
3523  VectorParts &Op0 = getVectorValue(it->getOperand(1));
3524  VectorParts &Op1 = getVectorValue(it->getOperand(2));
3525 
3526  Value *ScalarCond = (VF == 1) ? Cond[0] :
3527  Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
3528 
3529  for (unsigned Part = 0; Part < UF; ++Part) {
3530  Entry[Part] = Builder.CreateSelect(
3531  InvariantCond ? ScalarCond : Cond[Part],
3532  Op0[Part],
3533  Op1[Part]);
3534  }
3535 
3536  propagateMetadata(Entry, it);
3537  break;
3538  }
3539 
3540  case Instruction::ICmp:
3541  case Instruction::FCmp: {
3542  // Widen compares. Generate vector compares.
3543  bool FCmp = (it->getOpcode() == Instruction::FCmp);
3544  CmpInst *Cmp = dyn_cast<CmpInst>(it);
3545  setDebugLocFromInst(Builder, it);
3546  VectorParts &A = getVectorValue(it->getOperand(0));
3547  VectorParts &B = getVectorValue(it->getOperand(1));
3548  for (unsigned Part = 0; Part < UF; ++Part) {
3549  Value *C = nullptr;
3550  if (FCmp)
3551  C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
3552  else
3553  C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
3554  Entry[Part] = C;
3555  }
3556 
3557  propagateMetadata(Entry, it);
3558  break;
3559  }
3560 
3561  case Instruction::Store:
3562  case Instruction::Load:
3563  vectorizeMemoryInstruction(it);
3564  break;
3565  case Instruction::ZExt:
3566  case Instruction::SExt:
3567  case Instruction::FPToUI:
3568  case Instruction::FPToSI:
3569  case Instruction::FPExt:
3570  case Instruction::PtrToInt:
3571  case Instruction::IntToPtr:
3572  case Instruction::SIToFP:
3573  case Instruction::UIToFP:
3574  case Instruction::Trunc:
3575  case Instruction::FPTrunc:
3576  case Instruction::BitCast: {
3577  CastInst *CI = dyn_cast<CastInst>(it);
3578  setDebugLocFromInst(Builder, it);
3579  /// Optimize the special case where the source is the induction
3580  /// variable. Notice that we can only optimize the 'trunc' case
3581  /// because: a. FP conversions lose precision, b. sext/zext may wrap,
3582  /// c. other casts depend on pointer size.
3583  if (CI->getOperand(0) == OldInduction &&
3584  it->getOpcode() == Instruction::Trunc) {
3585  Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
3586  CI->getType());
3587  Value *Broadcasted = getBroadcastInstrs(ScalarCast);
3588  LoopVectorizationLegality::InductionInfo II =
3589  Legal->getInductionVars()->lookup(OldInduction);
3590  Constant *Step =
3591  ConstantInt::getSigned(CI->getType(), II.StepValue->getSExtValue());
3592  for (unsigned Part = 0; Part < UF; ++Part)
3593  Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
3594  propagateMetadata(Entry, it);
3595  break;
3596  }
3597  /// Vectorize casts.
3598  Type *DestTy = (VF == 1) ? CI->getType() :
3599  VectorType::get(CI->getType(), VF);
3600 
3601  VectorParts &A = getVectorValue(it->getOperand(0));
3602  for (unsigned Part = 0; Part < UF; ++Part)
3603  Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
3604  propagateMetadata(Entry, it);
3605  break;
3606  }
3607 
3608  case Instruction::Call: {
3609  // Ignore dbg intrinsics.
3610  if (isa<DbgInfoIntrinsic>(it))
3611  break;
3612  setDebugLocFromInst(Builder, it);
3613 
3614  Module *M = BB->getParent()->getParent();
3615  CallInst *CI = cast<CallInst>(it);
3616 
3617  StringRef FnName = CI->getCalledFunction()->getName();
3618  Function *F = CI->getCalledFunction();
3619  Type *RetTy = ToVectorTy(CI->getType(), VF);
3620  SmallVector<Type *, 4> Tys;
3621  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
3622  Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
3623 
3624  Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
3625  if (ID &&
3626  (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
3627  ID == Intrinsic::lifetime_start)) {
3628  scalarizeInstruction(it);
3629  break;
3630  }
3631  // The flag shows whether we use Intrinsic or a usual Call for vectorized
3632  // version of the instruction.
3633  // Is it beneficial to perform intrinsic call compared to lib call?
3634  bool NeedToScalarize;
3635  unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
3636  bool UseVectorIntrinsic =
3637  ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
3638  if (!UseVectorIntrinsic && NeedToScalarize) {
3639  scalarizeInstruction(it);
3640  break;
3641  }
3642 
3643  for (unsigned Part = 0; Part < UF; ++Part) {
3644  SmallVector<Value *, 4> Args;
3645  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
3646  Value *Arg = CI->getArgOperand(i);
3647  // Some intrinsics have a scalar argument - don't replace it with a
3648  // vector.
3649  if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
3650  VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
3651  Arg = VectorArg[Part];
3652  }
3653  Args.push_back(Arg);
3654  }
3655 
3656  Function *VectorF;
3657  if (UseVectorIntrinsic) {
3658  // Use vector version of the intrinsic.
3659  Type *TysForDecl[] = {CI->getType()};
3660  if (VF > 1)
3661  TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
3662  VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
3663  } else {
3664  // Use vector version of the library call.
3665  StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
3666  assert(!VFnName.empty() && "Vector function name is empty.");
3667  VectorF = M->getFunction(VFnName);
3668  if (!VectorF) {
3669  // Generate a declaration
3670  FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
3671  VectorF =
3672  Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
3673  VectorF->copyAttributesFrom(F);
3674  }
3675  }
3676  assert(VectorF && "Can't create vector function.");
3677  Entry[Part] = Builder.CreateCall(VectorF, Args);
3678  }
3679 
3680  propagateMetadata(Entry, it);
3681  break;
3682  }
3683 
3684  default:
3685  // All other instructions are unsupported. Scalarize them.
3686  scalarizeInstruction(it);
3687  break;
3688  }// end of switch.
3689  }// end of for_each instr.
3690 }
3691 
3692 void InnerLoopVectorizer::updateAnalysis() {
3693  // Forget the original basic block.
3694  SE->forgetLoop(OrigLoop);
3695 
3696  // Update the dominator tree information.
3697  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
3698  "Entry does not dominate exit.");
3699 
3700  for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
3701  DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
3702  DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
3703 
3704  // Due to if predication of stores we might create a sequence of "if(pred)
3705  // a[i] = ...; " blocks.
3706  for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) {
3707  if (i == 0)
3708  DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
3709  else if (isPredicatedBlock(i)) {
3710  DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]);
3711  } else {
3712  DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]);
3713  }
3714  }
3715 
3716  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);
3717  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
3718  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
3719  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
3720 
3721  DEBUG(DT->verifyDomTree());
3722 }
3723 
3724 /// \brief Check whether it is safe to if-convert this phi node.
3725 ///
3726 /// Phi nodes with constant expressions that can trap are not safe to if
3727 /// convert.
3728 static bool canIfConvertPHINodes(BasicBlock *BB) {
3729  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
3730  PHINode *Phi = dyn_cast<PHINode>(I);
3731  if (!Phi)
3732  return true;
3733  for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p)
3734  if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p)))
3735  if (C->canTrap())
3736  return false;
3737  }
3738  return true;
3739 }
3740 
3741 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
3742  if (!EnableIfConversion) {
3743  emitAnalysis(VectorizationReport() << "if-conversion is disabled");
3744  return false;
3745  }
3746 
3747  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
3748 
3749  // A list of pointers that we can safely read and write to.
3750  SmallPtrSet<Value *, 8> SafePointers;
3751 
3752  // Collect safe addresses.
3753  for (Loop::block_iterator BI = TheLoop->block_begin(),
3754  BE = TheLoop->block_end(); BI != BE; ++BI) {
3755  BasicBlock *BB = *BI;
3756 
3757  if (blockNeedsPredication(BB))
3758  continue;
3759 
3760  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
3761  if (LoadInst *LI = dyn_cast<LoadInst>(I))
3762  SafePointers.insert(LI->getPointerOperand());
3763  else if (StoreInst *SI = dyn_cast<StoreInst>(I))
3764  SafePointers.insert(SI->getPointerOperand());
3765  }
3766  }
3767 
3768  // Collect the blocks that need predication.
3769  BasicBlock *Header = TheLoop->getHeader();
3770  for (Loop::block_iterator BI = TheLoop->block_begin(),
3771  BE = TheLoop->block_end(); BI != BE; ++BI) {
3772  BasicBlock *BB = *BI;
3773 
3774  // We don't support switch statements inside loops.
3775  if (!isa<BranchInst>(BB->getTerminator())) {
3776  emitAnalysis(VectorizationReport(BB->getTerminator())
3777  << "loop contains a switch statement");
3778  return false;
3779  }
3780 
3781  // We must be able to predicate all blocks that need to be predicated.
3782  if (blockNeedsPredication(BB)) {
3783  if (!blockCanBePredicated(BB, SafePointers)) {
3784  emitAnalysis(VectorizationReport(BB->getTerminator())
3785  << "control flow cannot be substituted for a select");
3786  return false;
3787  }
3788  } else if (BB != Header && !canIfConvertPHINodes(BB)) {
3789  emitAnalysis(VectorizationReport(BB->getTerminator())
3790  << "control flow cannot be substituted for a select");
3791  return false;
3792  }
3793  }
3794 
3795  // We can if-convert this loop.
3796  return true;
3797 }
3798 
3799 bool LoopVectorizationLegality::canVectorize() {
3800  // We must have a loop in canonical form. Loops with indirectbr in them cannot
3801  // be canonicalized.
3802  if (!TheLoop->getLoopPreheader()) {
3803  emitAnalysis(
3804  VectorizationReport() <<
3805  "loop control flow is not understood by vectorizer");
3806  return false;
3807  }
3808 
3809  // We can only vectorize innermost loops.
3810  if (!TheLoop->empty()) {
3811  emitAnalysis(VectorizationReport() << "loop is not the innermost loop");
3812  return false;
3813  }
3814 
3815  // We must have a single backedge.
3816  if (TheLoop->getNumBackEdges() != 1) {
3817  emitAnalysis(
3818  VectorizationReport() <<
3819  "loop control flow is not understood by vectorizer");
3820  return false;
3821  }
3822 
3823  // We must have a single exiting block.
3824  if (!TheLoop->getExitingBlock()) {
3825  emitAnalysis(
3826  VectorizationReport() <<
3827  "loop control flow is not understood by vectorizer");
3828  return false;
3829  }
3830 
3831  // We only handle bottom-tested loops, i.e. loops in which the condition is
3832  // checked at the end of each iteration. With that we can assume that all
3833  // instructions in the loop are executed the same number of times.
3834  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
3835  emitAnalysis(
3836  VectorizationReport() <<
3837  "loop control flow is not understood by vectorizer");
3838  return false;
3839  }
3840 
3841  // We need to have a loop header.
3842  DEBUG(dbgs() << "LV: Found a loop: " <<
3843  TheLoop->getHeader()->getName() << '\n');
3844 
3845  // Check if we can if-convert non-single-bb loops.
3846  unsigned NumBlocks = TheLoop->getNumBlocks();
3847  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
3848  DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
3849  return false;
3850  }
3851 
3852  // ScalarEvolution needs to be able to find the exit count.
3853  const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
3854  if (ExitCount == SE->getCouldNotCompute()) {
3855  emitAnalysis(VectorizationReport() <<
3856  "could not determine number of loop iterations");
3857  DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
3858  return false;
3859  }
3860 
3861  // Check if we can vectorize the instructions and CFG in this loop.
3862  if (!canVectorizeInstrs()) {
3863  DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
3864  return false;
3865  }
3866 
3867  // Go over each instruction and look at memory deps.
3868  if (!canVectorizeMemory()) {
3869  DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
3870  return false;
3871  }
3872 
3873  // Collect all of the variables that remain uniform after vectorization.
3874  collectLoopUniforms();
3875 
3876  DEBUG(dbgs() << "LV: We can vectorize this loop"
3877  << (LAI->getRuntimePointerChecking()->Need
3878  ? " (with a runtime bound check)"
3879  : "")
3880  << "!\n");
3881 
3882  // Analyze interleaved memory accesses.
3883  if (EnableInterleavedMemAccesses)
3884  InterleaveInfo.analyzeInterleaving(Strides);
3885 
3886  // Okay! We can vectorize. At this point we don't have any other mem analysis
3887  // which may limit our maximum vectorization factor, so just return true with
3888  // no restrictions.
3889  return true;
3890 }
3891 
3892 static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
3893  if (Ty->isPointerTy())
3894  return DL.getIntPtrType(Ty);
3895 
3896  // It is possible that chars or shorts overflow when we ask for the loop's
3897  // trip count; work around this by changing the type size.
3898  if (Ty->getScalarSizeInBits() < 32)
3899  return Type::getInt32Ty(Ty->getContext());
3900 
3901  return Ty;
3902 }
3903 
3904 static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
3905  Ty0 = convertPointerToIntegerType(DL, Ty0);
3906  Ty1 = convertPointerToIntegerType(DL, Ty1);
3907  if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
3908  return Ty0;
3909  return Ty1;
3910 }
3911 
3912 /// \brief Check that the instruction has outside loop users and is not an
3913 /// identified reduction variable.
3914 static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
3915  SmallPtrSetImpl<Value *> &Reductions) {
3916  // Reduction instructions are allowed to have exit users. All other
3917  // instructions must not have external users.
3918  if (!Reductions.count(Inst))
3919  // Check that all of the users of the instruction are inside the loop.
3920  for (User *U : Inst->users()) {
3921  Instruction *UI = cast<Instruction>(U);
3922  // This user may be a reduction exit value.
3923  if (!TheLoop->contains(UI)) {
3924  DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
3925  return true;
3926  }
3927  }
3928  return false;
3929 }
3930 
3931 bool LoopVectorizationLegality::canVectorizeInstrs() {
3932  BasicBlock *PreHeader = TheLoop->getLoopPreheader();
3933  BasicBlock *Header = TheLoop->getHeader();
3934 
3935  // Look for the attribute signaling the absence of NaNs.
3936  Function &F = *Header->getParent();
3937  const DataLayout &DL = F.getParent()->getDataLayout();
3938  if (F.hasFnAttribute("no-nans-fp-math"))
3939  HasFunNoNaNAttr =
3940  F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
3941 
3942  // For each block in the loop.
3943  for (Loop::block_iterator bb = TheLoop->block_begin(),
3944  be = TheLoop->block_end(); bb != be; ++bb) {
3945 
3946  // Scan the instructions in the block and look for hazards.
3947  for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
3948  ++it) {
3949 
3950  if (PHINode *Phi = dyn_cast<PHINode>(it)) {
3951  Type *PhiTy = Phi->getType();
3952  // Check that this PHI type is allowed.
3953  if (!PhiTy->isIntegerTy() &&
3954  !PhiTy->isFloatingPointTy() &&
3955  !PhiTy->isPointerTy()) {
3956  emitAnalysis(VectorizationReport(it)
3957  << "loop control flow is not understood by vectorizer");
3958  DEBUG(dbgs() << "LV: Found a non-int non-pointer PHI.\n");
3959  return false;
3960  }
3961 
3962  // If this PHINode is not in the header block, then we know that we
3963  // can convert it to select during if-conversion. No need to check if
3964  // the PHIs in this block are induction or reduction variables.
3965  if (*bb != Header) {
3966  // Check that this instruction has no outside users or is an
3967  // identified reduction value with an outside user.
3968  if (!hasOutsideLoopUser(TheLoop, it, AllowedExit))
3969  continue;
3970  emitAnalysis(VectorizationReport(it) <<
3971  "value could not be identified as "
3972  "an induction or reduction variable");
3973  return false;
3974  }
3975 
3976  // We only allow if-converted PHIs with exactly two incoming values.
3977  if (Phi->getNumIncomingValues() != 2) {
3978  emitAnalysis(VectorizationReport(it)
3979  << "control flow not understood by vectorizer");
3980  DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
3981  return false;
3982  }
3983 
3984  // This is the value coming from the preheader.
3985  Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
3986  ConstantInt *StepValue = nullptr;
3987  // Check if this is an induction variable.
3988  InductionKind IK = isInductionVariable(Phi, StepValue);
3989 
3990  if (IK_NoInduction != IK) {
3991  // Get the widest type.
3992  if (!WidestIndTy)
3993  WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
3994  else
3995  WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
3996 
3997  // Int inductions are special because we only allow one IV.
3998  if (IK == IK_IntInduction && StepValue->isOne()) {
3999  // Use the phi node with the widest type as induction. Use the last
4000  // one if there are multiple (no good reason for doing this other
4001  // than it is expedient).
4002  if (!Induction || PhiTy == WidestIndTy)
4003  Induction = Phi;
4004  }
4005 
4006  DEBUG(dbgs() << "LV: Found an induction variable.\n");
4007  Inductions[Phi] = InductionInfo(StartValue, IK, StepValue);
4008 
4009  // Until we explicitly handle the case of an induction variable with
4010  // an outside loop user we have to give up vectorizing this loop.
4011  if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
4012  emitAnalysis(VectorizationReport(it) <<
4013  "use of induction value outside of the "
4014  "loop is not handled by vectorizer");
4015  return false;
4016  }
4017 
4018  continue;
4019  }
4020 
4021  if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop,
4022  Reductions[Phi])) {
4023  AllowedExit.insert(Reductions[Phi].getLoopExitInstr());
4024  continue;
4025  }
4026 
4027  emitAnalysis(VectorizationReport(it) <<
4028  "value that could not be identified as "
4029  "reduction is used outside the loop");
4030  DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
4031  return false;
4032  }// end of PHI handling
4033 
4034  // We handle calls that:
4035  // * Are debug info intrinsics.
4036  // * Have a mapping to an IR intrinsic.
4037  // * Have a vector version available.
4038  CallInst *CI = dyn_cast<CallInst>(it);
4039  if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) &&
4040  !(CI->getCalledFunction() && TLI &&
4041  TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
4042  emitAnalysis(VectorizationReport(it) <<
4043  "call instruction cannot be vectorized");
4044  DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
4045  return false;
4046  }
4047 
4048  // Intrinsics such as powi, cttz and ctlz are legal to vectorize if the
4049  // second argument is the same on every iteration (i.e. loop invariant).
4050  if (CI &&
4051  hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
4052  if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) {
4053  emitAnalysis(VectorizationReport(it)
4054  << "intrinsic instruction cannot be vectorized");
4055  DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
4056  return false;
4057  }
4058  }
4059 
4060  // Check that the instruction return type is vectorizable.
4061  // Also, we can't vectorize extractelement instructions.
4062  if ((!VectorType::isValidElementType(it->getType()) &&
4063  !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
4064  emitAnalysis(VectorizationReport(it)
4065  << "instruction return type cannot be vectorized");
4066  DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
4067  return false;
4068  }
4069 
4070  // Check that the stored type is vectorizable.
4071  if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
4072  Type *T = ST->getValueOperand()->getType();
4073  if (!VectorType::isValidElementType(T)) {
4074  emitAnalysis(VectorizationReport(ST) <<
4075  "store instruction cannot be vectorized");
4076  return false;
4077  }
4078  if (EnableMemAccessVersioning)
4079  collectStridedAccess(ST);
4080  }
4081 
4082  if (EnableMemAccessVersioning)
4083  if (LoadInst *LI = dyn_cast<LoadInst>(it))
4084  collectStridedAccess(LI);
4085 
4086  // Reduction instructions are allowed to have exit users.
4087  // All other instructions must not have external users.
4088  if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
4089  emitAnalysis(VectorizationReport(it) <<
4090  "value cannot be used outside the loop");
4091  return false;
4092  }
4093 
4094  } // next instr.
4095 
4096  }
4097 
4098  if (!Induction) {
4099  DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
4100  if (Inductions.empty()) {
4101  emitAnalysis(VectorizationReport()
4102  << "loop induction variable could not be identified");
4103  return false;
4104  }
4105  }
4106 
4107  return true;
4108 }
4109 
4110 void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
4111  Value *Ptr = nullptr;
4112  if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
4113  Ptr = LI->getPointerOperand();
4114  else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess))
4115  Ptr = SI->getPointerOperand();
4116  else
4117  return;
4118 
4119  Value *Stride = getStrideFromPointer(Ptr, SE, TheLoop);
4120  if (!Stride)
4121  return;
4122 
4123  DEBUG(dbgs() << "LV: Found a strided access that we can version");
4124  DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
4125  Strides[Ptr] = Stride;
4126  StrideSet.insert(Stride);
4127 }
4128 
4129 void LoopVectorizationLegality::collectLoopUniforms() {
4130  // We now know that the loop is vectorizable!
4131  // Collect variables that will remain uniform after vectorization.
4132  std::vector<Value*> Worklist;
4133  BasicBlock *Latch = TheLoop->getLoopLatch();
4134 
4135  // Start with the conditional branch and walk up the block.
4136  Worklist.push_back(Latch->getTerminator()->getOperand(0));
4137 
4138  // Also add all consecutive pointer values; these values will be uniform
4139  // after vectorization (and subsequent cleanup) and, until revectorization is
4140  // supported, all dependencies must also be uniform.
4141  for (Loop::block_iterator B = TheLoop->block_begin(),
4142  BE = TheLoop->block_end(); B != BE; ++B)
4143  for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end();
4144  I != IE; ++I)
4145  if (I->getType()->isPointerTy() && isConsecutivePtr(I))
4146  Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
4147 
4148  while (!Worklist.empty()) {
4149  Instruction *I = dyn_cast<Instruction>(Worklist.back());
4150  Worklist.pop_back();
4151 
4152  // Look at instructions inside this loop.
4153  // Stop when reaching PHI nodes.
4154  // TODO: we need to follow values all over the loop, not only in this block.
4155  if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
4156  continue;
4157 
4158  // This is a known uniform.
4159  Uniforms.insert(I);
4160 
4161  // Insert all operands.
4162  Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
4163  }
4164 }
4165 
4166 bool LoopVectorizationLegality::canVectorizeMemory() {
4167  LAI = &LAA->getInfo(TheLoop, Strides);
4168  auto &OptionalReport = LAI->getReport();
4169  if (OptionalReport)
4170  emitAnalysis(VectorizationReport(*OptionalReport));
4171  if (!LAI->canVectorizeMemory())
4172  return false;
4173 
4174  if (LAI->hasStoreToLoopInvariantAddress()) {
4175  emitAnalysis(
4176  VectorizationReport()
4177  << "write to a loop invariant address could not be vectorized");
4178  DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
4179  return false;
4180  }
4181 
4182  if (LAI->getNumRuntimePointerChecks() >
4183  VectorizerParams::RuntimeMemoryCheckThreshold) {
4184  emitAnalysis(VectorizationReport()
4185  << LAI->getNumRuntimePointerChecks() << " exceeds limit of "
4186  << VectorizerParams::RuntimeMemoryCheckThreshold
4187  << " dependent memory operations checked at runtime");
4188  DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
4189  return false;
4190  }
4191  return true;
4192 }
4193 
4194 LoopVectorizationLegality::InductionKind
4195 LoopVectorizationLegality::isInductionVariable(PHINode *Phi,
4196  ConstantInt *&StepValue) {
4197  if (!isInductionPHI(Phi, SE, StepValue))
4198  return IK_NoInduction;
4199 
4200  Type *PhiTy = Phi->getType();
4201  // Found an Integer induction variable.
4202  if (PhiTy->isIntegerTy())
4203  return IK_IntInduction;
4204  // Found a Pointer induction variable.
4205  return IK_PtrInduction;
4206 }
4207 
4208 bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
4209  Value *In0 = const_cast<Value*>(V);
4210  PHINode *PN = dyn_cast_or_null<PHINode>(In0);
4211  if (!PN)
4212  return false;
4213 
4214  return Inductions.count(PN);
4215 }
4216 
4217 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
4218  return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
4219 }
4220 
4221 bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
4222  SmallPtrSetImpl<Value *> &SafePtrs) {
4223 
4224  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
4225  // Check that we don't have a constant expression that can trap as operand.
4226  for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
4227  OI != OE; ++OI) {
4228  if (Constant *C = dyn_cast<Constant>(*OI))
4229  if (C->canTrap())
4230  return false;
4231  }
4232  // We might be able to hoist the load.
4233  if (it->mayReadFromMemory()) {
4234  LoadInst *LI = dyn_cast<LoadInst>(it);
4235  if (!LI)
4236  return false;
4237  if (!SafePtrs.count(LI->getPointerOperand())) {
4238  if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) {
4239  MaskedOp.insert(LI);
4240  continue;
4241  }
4242  return false;
4243  }
4244  }
4245 
4246  // We don't predicate stores at the moment.
4247  if (it->mayWriteToMemory()) {
4248  StoreInst *SI = dyn_cast<StoreInst>(it);
4249  // We only support predication of stores in basic blocks with one
4250  // predecessor.
4251  if (!SI)
4252  return false;
4253 
4254  bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
4255  bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();
4256 
4257  if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
4258  !isSinglePredecessor) {
4259  // Build a masked store if it is legal for the target, otherwise scalarize
4260  // the block.
4261  bool isLegalMaskedOp =
4262  isLegalMaskedStore(SI->getValueOperand()->getType(),
4263  SI->getPointerOperand());
4264  if (isLegalMaskedOp) {
4265  --NumPredStores;
4266  MaskedOp.insert(SI);
4267  continue;
4268  }
4269  return false;
4270  }
4271  }
4272  if (it->mayThrow())
4273  return false;
4274 
4275  // The instructions below can trap.
4276  switch (it->getOpcode()) {
4277  default: continue;
4278  case Instruction::UDiv:
4279  case Instruction::SDiv:
4280  case Instruction::URem:
4281  case Instruction::SRem:
4282  return false;
4283  }
4284  }
4285 
4286  return true;
4287 }
4288 
4289 void InterleavedAccessInfo::collectConstStridedAccesses(
4290  MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
4291  const ValueToValueMap &Strides) {
4292  // Holds load/store instructions in program order.
4293  SmallVector<Instruction *, 16> AccessList;
4294 
4295  for (auto *BB : TheLoop->getBlocks()) {
4296  bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
4297 
4298  for (auto &I : *BB) {
4299  if (!isa<LoadInst>(&I) && !isa<StoreInst>(&I))
4300  continue;
4301  // FIXME: Currently we can't handle mixed accesses and predicated accesses
4302  if (IsPred)
4303  return;
4304 
4305  AccessList.push_back(&I);
4306  }
4307  }
4308 
4309  if (AccessList.empty())
4310  return;
4311 
4312  auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
4313  for (auto I : AccessList) {
4314  LoadInst *LI = dyn_cast<LoadInst>(I);
4315  StoreInst *SI = dyn_cast<StoreInst>(I);
4316 
4317  Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
4318  int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides);
4319 
4320  // The factor of the corresponding interleave group.
4321  unsigned Factor = std::abs(Stride);
4322 
4323  // Ignore the access if the factor is too small or too large.
4324  if (Factor < 2 || Factor > MaxInterleaveGroupFactor)
4325  continue;
4326 
4327  const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
4328  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
4329  unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType());
4330 
4331  // An alignment of 0 means target ABI alignment.
4332  unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();
4333  if (!Align)
4334  Align = DL.getABITypeAlignment(PtrTy->getElementType());
4335 
4336  StrideAccesses[I] = StrideDescriptor(Stride, Scev, Size, Align);
4337  }
4338 }
4339 
4340 // Analyze interleaved accesses and collect them into interleave groups.
4341 //
4342 // Notice that the vectorization on interleaved groups will change instruction
4343 // orders and may break dependences. But the memory dependence check guarantees
4344 // that there is no overlap between two pointers of different strides, element
4345 // sizes or underlying bases.
4346 //
4347 // For pointers sharing the same stride, element size and underlying base, no
4348 // need to worry about Read-After-Write dependences and Write-After-Read
4349 // dependences.
4350 //
4351 // E.g. The RAW dependence: A[i] = a;
4352 // b = A[i];
4353 // This won't exist as it is a store-load forwarding conflict, which has
4354 // already been checked and forbidden in the dependence check.
4355 //
4356 // E.g. The WAR dependence: a = A[i]; // (1)
4357 // A[i] = b; // (2)
4358 // The store group of (2) is always inserted at or below (2), and the load group
4359 // of (1) is always inserted at or above (1). The dependence is safe.
4360 void InterleavedAccessInfo::analyzeInterleaving(
4361  const ValueToValueMap &Strides) {
4362  DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
4363 
4364  // Holds all the stride accesses.
4365  MapVector<Instruction *, StrideDescriptor> StrideAccesses;
4366  collectConstStridedAccesses(StrideAccesses, Strides);
4367 
4368  if (StrideAccesses.empty())
4369  return;
4370 
4371  // Holds all interleaved store groups temporarily.
4372  SmallSetVector<InterleaveGroup *, 4> StoreGroups;
4373 
4374  // Search the load-load/write-write pair B-A in bottom-up order and try to
4375  // insert B into the interleave group of A according to 3 rules:
4376  // 1. A and B have the same stride.
4377  // 2. A and B have the same memory object size.
4378  // 3. B belongs to the group according to the distance.
4379  //
4380  // The bottom-up order can avoid breaking the Write-After-Write dependences
4381  // between two pointers of the same base.
4382  // E.g. A[i] = a; (1)
4383  // A[i] = b; (2)
4384  // A[i+1] = c (3)
4385  // We form the group (2)+(3) in front, so (1) has to form groups with accesses
4386  // above (1), which guarantees that (1) is always above (2).
4387  for (auto I = StrideAccesses.rbegin(), E = StrideAccesses.rend(); I != E;
4388  ++I) {
4389  Instruction *A = I->first;
4390  StrideDescriptor DesA = I->second;
4391 
4392  InterleaveGroup *Group = getInterleaveGroup(A);
4393  if (!Group) {
4394  DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n');
4395  Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);
4396  }
4397 
4398  if (A->mayWriteToMemory())
4399  StoreGroups.insert(Group);
4400 
4401  for (auto II = std::next(I); II != E; ++II) {
4402  Instruction *B = II->first;
4403  StrideDescriptor DesB = II->second;
4404 
4405  // Ignore if B is already in a group or B is a different memory operation.
4406  if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory())
4407  continue;
4408 
4409  // Check the rule 1 and 2.
4410  if (DesB.Stride != DesA.Stride || DesB.Size != DesA.Size)
4411  continue;
4412 
4413  // Calculate the distance and prepare for the rule 3.
4414  const SCEVConstant *DistToA =
4415  dyn_cast<SCEVConstant>(SE->getMinusSCEV(DesB.Scev, DesA.Scev));
4416  if (!DistToA)
4417  continue;
4418 
4419  int DistanceToA = DistToA->getValue()->getValue().getSExtValue();
4420 
4421  // Skip if the distance is not a multiple of the size, as in that case A
4422  // and B cannot be in the same group.
4423  if (DistanceToA % static_cast<int>(DesA.Size))
4424  continue;
4425 
4426  // The index of B is the index of A plus B's distance to A in element-size units.
4427  int IndexB =
4428  Group->getIndex(A) + DistanceToA / static_cast<int>(DesA.Size);
4429 
4430  // Try to insert B into the group.
4431  if (Group->insertMember(B, IndexB, DesB.Align)) {
4432  DEBUG(dbgs() << "LV: Inserted:" << *B << '\n'
4433  << " into the interleave group with" << *A << '\n');
4434  InterleaveGroupMap[B] = Group;
4435 
4436  // Set the first load in program order as the insert position.
4437  if (B->mayReadFromMemory())
4438  Group->setInsertPos(B);
4439  }
4440  } // Iteration on instruction B
4441  } // Iteration on instruction A
4442 
4443  // Remove interleaved store groups with gaps.
4444  for (InterleaveGroup *Group : StoreGroups)
4445  if (Group->getNumMembers() != Group->getFactor())
4446  releaseGroup(Group);
4447 }
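// Illustrative annotation (not in the original source): a store group built
// from A[3*i] and A[3*i+1] has factor 3 but only 2 members, leaving a gap at
// index 2. Writing a full wide vector would clobber A[3*i+2], so such store
// groups are released above; load groups may keep gaps because the unused
// lanes are simply ignored.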
4448 
4449 LoopVectorizationCostModel::VectorizationFactor
4450 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
4451  // Width 1 means no vectorization.
4452  VectorizationFactor Factor = { 1U, 0U };
4453  if (OptForSize && Legal->getRuntimePointerChecking()->Need) {
4454  emitAnalysis(VectorizationReport() <<
4455  "runtime pointer checks needed. Enable vectorization of this "
4456  "loop with '#pragma clang loop vectorize(enable)' when "
4457  "compiling with -Os");
4458  DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
4459  return Factor;
4460  }
4461 
4462  if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
4463  emitAnalysis(VectorizationReport() <<
4464  "store that is conditionally executed prevents vectorization");
4465  DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
4466  return Factor;
4467  }
4468 
4469  // Find the trip count.
4470  unsigned TC = SE->getSmallConstantTripCount(TheLoop);
4471  DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4472 
4473  unsigned WidestType = getWidestType();
4474  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4475  unsigned MaxSafeDepDist = -1U;
4476  if (Legal->getMaxSafeDepDistBytes() != -1U)
4477  MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
4478  WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
4479  WidestRegister : MaxSafeDepDist);
4480  unsigned MaxVectorSize = WidestRegister / WidestType;
4481  DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
4482  DEBUG(dbgs() << "LV: The Widest register is: "
4483  << WidestRegister << " bits.\n");
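// Worked example (annotation, hypothetical target): with 256-bit vector
// registers and a widest scalar type of 32 bits, MaxVectorSize = 256 / 32 = 8,
// so the candidate widths costed below are 2, 4 and 8 plus the scalar
// baseline of 1.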
4484 
4485  if (MaxVectorSize == 0) {
4486  DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4487  MaxVectorSize = 1;
4488  }
4489 
4490  assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
4491  " into one vector!");
4492 
4493  unsigned VF = MaxVectorSize;
4494 
4495  // If we optimize the program for size, avoid creating the tail loop.
4496  if (OptForSize) {
4497  // If we are unable to calculate the trip count then don't try to vectorize.
4498  if (TC < 2) {
4499  emitAnalysis
4500  (VectorizationReport() <<
4501  "unable to calculate the loop count due to complex control flow");
4502  DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
4503  return Factor;
4504  }
4505 
4506  // Find the maximum SIMD width that can fit within the trip count.
4507  VF = TC % MaxVectorSize;
4508 
4509  if (VF == 0)
4510  VF = MaxVectorSize;
4511  else {
4512  // If the trip count that we found modulo the vectorization factor is not
4513  // zero then we require a tail.
4514  emitAnalysis(VectorizationReport() <<
4515  "cannot optimize for size and vectorize at the "
4516  "same time. Enable vectorization of this loop "
4517  "with '#pragma clang loop vectorize(enable)' "
4518  "when compiling with -Os");
4519  DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
4520  return Factor;
4521  }
4522  }
4523 
4524  int UserVF = Hints->getWidth();
4525  if (UserVF != 0) {
4526  assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
4527  DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
4528 
4529  Factor.Width = UserVF;
4530  return Factor;
4531  }
4532 
4533  float Cost = expectedCost(1);
4534 #ifndef NDEBUG
4535  const float ScalarCost = Cost;
4536 #endif /* NDEBUG */
4537  unsigned Width = 1;
4538  DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
4539 
4540  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
4541  // Ignore scalar width, because the user explicitly wants vectorization.
4542  if (ForceVectorization && VF > 1) {
4543  Width = 2;
4544  Cost = expectedCost(Width) / (float)Width;
4545  }
4546 
4547  for (unsigned i=2; i <= VF; i*=2) {
4548  // Notice that the vector loop needs to be executed fewer times, so
4549  // we need to divide the cost of the vector loop by the width of
4550  // the vector elements.
4551  float VectorCost = expectedCost(i) / (float)i;
4552  DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
4553  (int)VectorCost << ".\n");
4554  if (VectorCost < Cost) {
4555  Cost = VectorCost;
4556  Width = i;
4557  }
4558  }
4559 
4560  DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
4561  << "LV: Vectorization seems to be not beneficial, "
4562  << "but was forced by a user.\n");
4563  DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n");
4564  Factor.Width = Width;
4565  Factor.Cost = Width * Cost;
4566  return Factor;
4567 }
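// Cost-selection sketch (annotation, hypothetical costs): if expectedCost(1)
// returns 8 while expectedCost(4) returns 20, the per-lane vector cost is
// 20 / 4 = 5, which beats the scalar cost of 8, so Width = 4 is selected and
// Factor.Cost = 4 * 5 = 20.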
4568 
4569 unsigned LoopVectorizationCostModel::getWidestType() {
4570  unsigned MaxWidth = 8;
4571  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
4572 
4573  // For each block.
4574  for (Loop::block_iterator bb = TheLoop->block_begin(),
4575  be = TheLoop->block_end(); bb != be; ++bb) {
4576  BasicBlock *BB = *bb;
4577 
4578  // For each instruction in the loop.
4579  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
4580  Type *T = it->getType();
4581 
4582  // Ignore ephemeral values.
4583  if (EphValues.count(it))
4584  continue;
4585 
4586  // Only examine Loads, Stores and PHINodes.
4587  if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
4588  continue;
4589 
4590  // Examine PHI nodes that are reduction variables.
4591  if (PHINode *PN = dyn_cast<PHINode>(it))
4592  if (!Legal->getReductionVars()->count(PN))
4593  continue;
4594 
4595  // Examine the stored values.
4596  if (StoreInst *ST = dyn_cast<StoreInst>(it))
4597  T = ST->getValueOperand()->getType();
4598 
4599  // Ignore loaded pointer types and stored pointer types that are not
4600  // consecutive. However, we do want to take consecutive stores/loads of
4601  // pointer vectors into account.
4602  if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
4603  continue;
4604 
4605  MaxWidth = std::max(MaxWidth,
4606  (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
4607  }
4608  }
4609 
4610  return MaxWidth;
4611 }
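// Example (annotation): a loop that loads i8 elements but stores i32 results
// reports a widest type of 32 bits here, so a 128-bit register packs at most
// 128 / 32 = 4 lanes when the vectorization factor is chosen above.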
4612 
4613 unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
4614  unsigned VF,
4615  unsigned LoopCost) {
4616 
4617  // -- The interleave heuristics --
4618  // We interleave the loop in order to expose ILP and reduce the loop overhead.
4619  // There are many micro-architectural considerations that we can't predict
4620  // at this level. For example, frontend pressure (on decode or fetch) due to
4621  // code size, or the number and capabilities of the execution ports.
4622  //
4623  // We use the following heuristics to select the interleave count:
4624  // 1. If the code has reductions, then we interleave to break the cross
4625  // iteration dependency.
4626  // 2. If the loop is really small, then we interleave to reduce the loop
4627  // overhead.
4628  // 3. We don't interleave if we think that we will spill registers to memory
4629  // due to the increased register pressure.
4630 
4631  // Use the user preference, unless 'auto' is selected.
4632  int UserUF = Hints->getInterleave();
4633  if (UserUF != 0)
4634  return UserUF;
4635 
4636  // When we optimize for size, we don't interleave.
4637  if (OptForSize)
4638  return 1;
4639 
4640  // If a dependence distance already limited the width, do not interleave.
4641  if (Legal->getMaxSafeDepDistBytes() != -1U)
4642  return 1;
4643 
4644  // Do not interleave loops with a relatively small trip count.
4645  unsigned TC = SE->getSmallConstantTripCount(TheLoop);
4646  if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
4647  return 1;
4648 
4649  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
4650  DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters <<
4651  " registers\n");
4652 
4653  if (VF == 1) {
4654  if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4655  TargetNumRegisters = ForceTargetNumScalarRegs;
4656  } else {
4657  if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4658  TargetNumRegisters = ForceTargetNumVectorRegs;
4659  }
4660 
4661  LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
4662  // We divide by these values below, so assume that we have at least one
4663  // instruction that uses at least one register.
4664  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
4665  R.NumInstructions = std::max(R.NumInstructions, 1U);
4666 
4667  // We calculate the interleave count using the following formula.
4668  // Subtract the number of loop invariants from the number of available
4669  // registers. These registers are used by all of the interleaved instances.
4670  // Next, divide the remaining registers by the number of registers that is
4671  // required by the loop, in order to estimate how many parallel instances
4672  // fit without causing spills. All of this is rounded down if necessary to be
4673  // a power of two. We want a power-of-two interleave count to simplify any
4674  // addressing operations or alignment considerations.
4675  unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
4676  R.MaxLocalUsers);
4677 
4678  // Don't count the induction variable as interleaved.
4679  if (EnableIndVarRegisterHeur)
4680  IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
4681  std::max(1U, (R.MaxLocalUsers - 1)));
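// Worked example (annotation, hypothetical numbers): with 16 target
// registers, 2 loop-invariant values and MaxLocalUsers == 5, the
// induction-adjusted count is
// PowerOf2Floor((16 - 2 - 1) / (5 - 1)) = PowerOf2Floor(3) = 2.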
4682 
4683  // Clamp the interleave ranges to reasonable counts.
4684  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4685 
4686  // Check if the user has overridden the max.
4687  if (VF == 1) {
4688  if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4689  MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4690  } else {
4691  if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4692  MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4693  }
4694 
4695  // If we did not calculate the cost for VF (because the user selected the VF)
4696  // then we calculate the cost of VF here.
4697  if (LoopCost == 0)
4698  LoopCost = expectedCost(VF);
4699 
4700  // Clamp the calculated IC to be between 1 and the max interleave count
4701  // that the target allows.
4702  if (IC > MaxInterleaveCount)
4703  IC = MaxInterleaveCount;
4704  else if (IC < 1)
4705  IC = 1;
4706 
4707  // Interleave if we vectorized this loop and there is a reduction that could
4708  // benefit from interleaving.
4709  if (VF > 1 && Legal->getReductionVars()->size()) {
4710  DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
4711  return IC;
4712  }
4713 
4714  // Note that if we've already vectorized the loop we will have done the
4715  // runtime check and so interleaving won't require further checks.
4716  bool InterleavingRequiresRuntimePointerCheck =
4717  (VF == 1 && Legal->getRuntimePointerChecking()->Need);
4718 
4719  // We want to interleave small loops in order to reduce the loop overhead and
4720  // potentially expose ILP opportunities.
4721  DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
4722  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
4723  // We assume that the cost overhead is 1 and we use the cost model
4724  // to estimate the cost of the loop and interleave until the cost of the
4725  // loop overhead is about 5% of the cost of the loop.
4726  unsigned SmallIC =
4727  std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
4728 
4729  // Interleave until store/load ports (estimated by max interleave count) are
4730  // saturated.
4731  unsigned NumStores = Legal->getNumStores();
4732  unsigned NumLoads = Legal->getNumLoads();
4733  unsigned StoresIC = IC / (NumStores ? NumStores : 1);
4734  unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
4735 
4736  // If we have a scalar reduction (vector reductions are already dealt with
4737  // by this point), we can increase the critical path length if the loop
4738  // we're interleaving is inside another loop. Limit it, by default, to 2 so
4739  // the critical path only gets increased by one reduction operation.
4740  if (Legal->getReductionVars()->size() &&
4741  TheLoop->getLoopDepth() > 1) {
4742  unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
4743  SmallIC = std::min(SmallIC, F);
4744  StoresIC = std::min(StoresIC, F);
4745  LoadsIC = std::min(LoadsIC, F);
4746  }
4747 
4748  if (EnableLoadStoreRuntimeInterleave &&
4749  std::max(StoresIC, LoadsIC) > SmallIC) {
4750  DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n");
4751  return std::max(StoresIC, LoadsIC);
4752  }
4753 
4754  DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
4755  return SmallIC;
4756  }
4757 
4758  // Interleave if this is a large loop (small loops are already dealt with by
4759  // this point) that could benefit from interleaving.
4760 
4761  bool HasReductions = (Legal->getReductionVars()->size() > 0);
4762  if (TTI.enableAggressiveInterleaving(HasReductions)) {
4763  DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
4764  return IC;
4765  }
4766 
4767  DEBUG(dbgs() << "LV: Not Interleaving.\n");
4768  return 1;
4769 }
4770 
4771 LoopVectorizationCostModel::RegisterUsage
4772 LoopVectorizationCostModel::calculateRegisterUsage() {
4773  // This function calculates the register usage by measuring the highest number
4774  // of values that are alive at a single location. Obviously, this is a very
4775  // rough estimation. We scan the loop in topological order so that we can
4776  // assign a number to each instruction. We use RPO to ensure that defs are
4777  // met before their users. We assume that each instruction that has in-loop
4778  // users starts an interval. We record every time that an in-loop value is
4779  // used, so we have a list of the first and last occurrences of each
4780  // instruction. Next, we transpose this data structure into a multi map that
4781  // holds the list of intervals that *end* at a specific location. This multi
4782  // map allows us to perform a linear search. We scan the instructions linearly
4783  // and record each time that a new interval starts, by placing it in a set.
4784  // If we find this value in the multi-map then we remove it from the set.
4785  // The max register usage is the maximum size of the set.
4786  // We also search for instructions that are defined outside the loop, but are
4787  // used inside the loop. We need this number separately from the max-interval
4788  // usage number because when we unroll, loop-invariant values do not take
4789  // up more registers.
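// Tiny example (annotation): for the indexed sequence 0: a = load,
// 1: b = load, 2: c = load, 3: use(a, b, c), the intervals of 'a' and 'b'
// are still open when 'c' is visited, so two intervals are counted at index
// 2 and MaxLocalUsers is 2 for this snippet.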
4790  LoopBlocksDFS DFS(TheLoop);
4791  DFS.perform(LI);
4792 
4793  RegisterUsage R;
4794  R.NumInstructions = 0;
4795 
4796  // Each 'key' in the map opens a new interval. The values
4797  // of the map are the index of the 'last seen' usage of the
4798  // instruction that is the key.
4799  typedef DenseMap<Instruction*, unsigned> IntervalMap;
4800  // Maps instruction to its index.
4801  DenseMap<unsigned, Instruction*> IdxToInstr;
4802  // Marks the end of each interval.
4803  IntervalMap EndPoint;
4804  // Saves the list of instruction indices that are used in the loop.
4805  SmallSet<Instruction*, 8> Ends;
4806  // Saves the list of values that are used in the loop but are
4807  // defined outside the loop, such as arguments and constants.
4808  SmallPtrSet<Value*, 8> LoopInvariants;
4809 
4810  unsigned Index = 0;
4811  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
4812  be = DFS.endRPO(); bb != be; ++bb) {
4813  R.NumInstructions += (*bb)->size();
4814  for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
4815  ++it) {
4816  Instruction *I = it;
4817  IdxToInstr[Index++] = I;
4818 
4819  // Save the end location of each USE.
4820  for (unsigned i = 0; i < I->getNumOperands(); ++i) {
4821  Value *U = I->getOperand(i);
4822  Instruction *Instr = dyn_cast<Instruction>(U);
4823 
4824  // Ignore non-instruction values such as arguments, constants, etc.
4825  if (!Instr) continue;
4826 
4827  // If this instruction is outside the loop then record it and continue.
4828  if (!TheLoop->contains(Instr)) {
4829  LoopInvariants.insert(Instr);
4830  continue;
4831  }
4832 
4833  // Overwrite previous end points.
4834  EndPoint[Instr] = Index;
4835  Ends.insert(Instr);
4836  }
4837  }
4838  }
4839 
4840  // Saves the list of intervals that end with the index in 'key'.
4841  typedef SmallVector<Instruction*, 2> InstrList;
4842  DenseMap<unsigned, InstrList> TransposeEnds;
4843 
4844  // Transpose the EndPoints to a list of values that end at each index.
4845  for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
4846  it != e; ++it)
4847  TransposeEnds[it->second].push_back(it->first);
4848 
4849  SmallSet<Instruction*, 8> OpenIntervals;
4850  unsigned MaxUsage = 0;
4851 
4852 
4853  DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
4854  for (unsigned int i = 0; i < Index; ++i) {
4855  Instruction *I = IdxToInstr[i];
4856  // Ignore instructions that are never used within the loop.
4857  if (!Ends.count(I)) continue;
4858 
4859  // Ignore ephemeral values.
4860  if (EphValues.count(I))
4861  continue;
4862 
4863  // Remove all of the instructions that end at this location.
4864  InstrList &List = TransposeEnds[i];
4865  for (unsigned int j=0, e = List.size(); j < e; ++j)
4866  OpenIntervals.erase(List[j]);
4867 
4868  // Count the number of live intervals.
4869  MaxUsage = std::max(MaxUsage, OpenIntervals.size());
4870 
4871  DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
4872  OpenIntervals.size() << '\n');
4873 
4874  // Add the current instruction to the list of open intervals.
4875  OpenIntervals.insert(I);
4876  }
4877 
4878  unsigned Invariant = LoopInvariants.size();
4879  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n');
4880  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
4881  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');
4882 
4883  R.LoopInvariantRegs = Invariant;
4884  R.MaxLocalUsers = MaxUsage;
4885  return R;
4886 }
4887 
4888 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
4889  unsigned Cost = 0;
4890 
4891  // For each block.
4892  for (Loop::block_iterator bb = TheLoop->block_begin(),
4893  be = TheLoop->block_end(); bb != be; ++bb) {
4894  unsigned BlockCost = 0;
4895  BasicBlock *BB = *bb;
4896 
4897  // For each instruction in the old loop.
4898  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
4899  // Skip dbg intrinsics.
4900  if (isa<DbgInfoIntrinsic>(it))
4901  continue;
4902 
4903  // Ignore ephemeral values.
4904  if (EphValues.count(it))
4905  continue;
4906 
4907  unsigned C = getInstructionCost(it, VF);
4908 
4909  // Check if we should override the cost.
4910  if (ForceTargetInstructionCost.getNumOccurrences() > 0)
4911  C = ForceTargetInstructionCost;
4912 
4913  BlockCost += C;
4914  DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
4915  VF << " For instruction: " << *it << '\n');
4916  }
4917 
4918  // We assume that if-converted blocks have a 50% chance of being executed.
4919  // When the code is scalar, some of the blocks are avoided due to control
4920  // flow. When the code is vectorized we execute all code paths.
4921  if (VF == 1 && Legal->blockNeedsPredication(*bb))
4922  BlockCost /= 2;
4923 
4924  Cost += BlockCost;
4925  }
4926 
4927  return Cost;
4928 }
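// Example (annotation): an if-converted block whose instructions cost 6 in
// total contributes only 3 to the scalar (VF == 1) loop cost, reflecting the
// 50% execution assumption above, but contributes the full 6 once the loop
// is vectorized, because a vectorized loop executes all code paths.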
4929 
4930 /// \brief Check whether the address computation for a non-consecutive memory
4931 /// access looks like an unlikely candidate for being merged into the indexing
4932 /// mode.
4933 ///
4934 /// We look for a GEP which has one index that is an induction variable and all
4935 /// other indices are loop invariant. If the stride of this access is also
4936 /// within a small bound we decide that this address computation can likely be
4937 /// merged into the addressing mode.
4938 /// In all other cases, we identify the address computation as complex.
4939 static bool isLikelyComplexAddressComputation(Value *Ptr,
4940  LoopVectorizationLegality *Legal,
4941  ScalarEvolution *SE,
4942  const Loop *TheLoop) {
4943  GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
4944  if (!Gep)
4945  return true;
4946 
4947  // We are looking for a gep with all loop invariant indices except for one
4948  // which should be an induction variable.
4949  unsigned NumOperands = Gep->getNumOperands();
4950  for (unsigned i = 1; i < NumOperands; ++i) {
4951  Value *Opd = Gep->getOperand(i);
4952  if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
4953  !Legal->isInductionVariable(Opd))
4954  return true;
4955  }
4956 
4957  // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step
4958  // can likely be merged into the address computation.
4959  unsigned MaxMergeDistance = 64;
4960 
4961  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
4962  if (!AddRec)
4963  return true;
4964 
4965  // Check the step is constant.
4966  const SCEV *Step = AddRec->getStepRecurrence(*SE);
4967  // Calculate the pointer stride and check if it is consecutive.
4968  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
4969  if (!C)
4970  return true;
4971 
4972  const APInt &APStepVal = C->getValue()->getValue();
4973 
4974  // Huge step value - give up.
4975  if (APStepVal.getBitWidth() > 64)
4976  return true;
4977 
4978  int64_t StepVal = APStepVal.getSExtValue();
4979 
4980  return StepVal > MaxMergeDistance;
4981 }
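// Example (annotation): for a GEP of the form gep %A, 0, %inv, %ind whose
// access advances by a constant 4 bytes per iteration, the step fits within
// MaxMergeDistance (64) and the computation is treated as mergeable into the
// addressing mode; a constant step of 128 bytes is classified as complex.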
4982 
4983 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
4984  if (Legal->hasStride(I->getOperand(0)) || Legal->hasStride(I->getOperand(1)))
4985  return true;
4986  return false;
4987 }
4988 
4989 unsigned
4990 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
4991  // If we know that this instruction will remain uniform, check the cost of
4992  // the scalar version.
4993  if (Legal->isUniformAfterVectorization(I))
4994  VF = 1;
4995 
4996  Type *RetTy = I->getType();
4997  Type *VectorTy = ToVectorTy(RetTy, VF);
4998 
4999  // TODO: We need to estimate the cost of intrinsic calls.
5000  switch (I->getOpcode()) {
5001  case Instruction::GetElementPtr:
5002  // We mark this instruction as zero-cost because the cost of GEPs in
5003  // vectorized code depends on whether the corresponding memory instruction
5004  // is scalarized or not. Therefore, we handle GEPs with the memory
5005  // instruction cost.
5006  return 0;
5007  case Instruction::Br: {
5008  return TTI.getCFInstrCost(I->getOpcode());
5009  }
5010  case Instruction::PHI:
5011  //TODO: IF-converted IFs become selects.
5012  return 0;
5013  case Instruction::Add:
5014  case Instruction::FAdd:
5015  case Instruction::Sub:
5016  case Instruction::FSub:
5017  case Instruction::Mul:
5018  case Instruction::FMul:
5019  case Instruction::UDiv:
5020  case Instruction::SDiv:
5021  case Instruction::FDiv:
5022  case Instruction::URem:
5023  case Instruction::SRem:
5024  case Instruction::FRem:
5025  case Instruction::Shl:
5026  case Instruction::LShr:
5027  case Instruction::AShr:
5028  case Instruction::And:
5029  case Instruction::Or:
5030  case Instruction::Xor: {
5031  // Since we will replace the stride by 1 the multiplication should go away.
5032  if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
5033  return 0;
5034  // Certain instructions can be cheaper to vectorize if they have a constant
5035  // second vector operand. One example of this is shifts on x86.
5036  TargetTransformInfo::OperandValueKind Op1VK =
5037  TargetTransformInfo::OK_AnyValue;
5038  TargetTransformInfo::OperandValueKind Op2VK =
5039  TargetTransformInfo::OK_AnyValue;
5040  TargetTransformInfo::OperandValueProperties Op1VP =
5041  TargetTransformInfo::OP_None;
5042  TargetTransformInfo::OperandValueProperties Op2VP =
5043  TargetTransformInfo::OP_None;
5044  Value *Op2 = I->getOperand(1);
5045 
5046  // Check for a splat of a constant or for a non-uniform vector of constants.
5047  if (isa<ConstantInt>(Op2)) {
5048  ConstantInt *CInt = cast<ConstantInt>(Op2);
5049  if (CInt && CInt->getValue().isPowerOf2())
5050  Op2VP = TargetTransformInfo::OP_PowerOf2;
5051  Op2VK = TargetTransformInfo::OK_UniformConstantValue;
5052  } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
5053  Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
5054  Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
5055  if (SplatValue) {
5056  ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
5057  if (CInt && CInt->getValue().isPowerOf2())
5058  Op2VP = TargetTransformInfo::OP_PowerOf2;
5059  Op2VK = TargetTransformInfo::OK_UniformConstantValue;
5060  }
5061  }
5062 
5063  return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
5064  Op1VP, Op2VP);
5065  }
5066  case Instruction::Select: {
5067  SelectInst *SI = cast<SelectInst>(I);
5068  const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
5069  bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
5070  Type *CondTy = SI->getCondition()->getType();
5071  if (!ScalarCond)
5072  CondTy = VectorType::get(CondTy, VF);
5073 
5074  return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
5075  }
5076  case Instruction::ICmp:
5077  case Instruction::FCmp: {
5078  Type *ValTy = I->getOperand(0)->getType();
5079  VectorTy = ToVectorTy(ValTy, VF);
5080  return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
5081  }
5082  case Instruction::Store:
5083  case Instruction::Load: {
5084  StoreInst *SI = dyn_cast<StoreInst>(I);
5085  LoadInst *LI = dyn_cast<LoadInst>(I);
5086  Type *ValTy = (SI ? SI->getValueOperand()->getType() :
5087  LI->getType());
5088  VectorTy = ToVectorTy(ValTy, VF);
5089 
5090  unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
5091  unsigned AS = SI ? SI->getPointerAddressSpace() :
5092  LI->getPointerAddressSpace();
5093  Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
5094  // We add the cost of address computation here instead of with the gep
5095  // instruction because only here we know whether the operation is
5096  // scalarized.
5097  if (VF == 1)
5098  return TTI.getAddressComputationCost(VectorTy) +
5099  TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
5100 
5101  // For an interleaved access, calculate the total cost of the whole
5102  // interleave group.
5103  if (Legal->isAccessInterleaved(I)) {
5104  auto Group = Legal->getInterleavedAccessGroup(I);
5105  assert(Group && "Fail to get an interleaved access group.");
5106 
5107  // Only calculate the cost once at the insert position.
5108  if (Group->getInsertPos() != I)
5109  return 0;
5110 
5111  unsigned InterleaveFactor = Group->getFactor();
5112  Type *WideVecTy =
5113  VectorType::get(VectorTy->getVectorElementType(),
5114  VectorTy->getVectorNumElements() * InterleaveFactor);
5115 
5116  // Holds the indices of existing members in an interleaved load group.
5117  // An interleaved store group doesn't need this as it doesn't allow gaps.
5118  SmallVector<unsigned, 4> Indices;
5119  if (LI) {
5120  for (unsigned i = 0; i < InterleaveFactor; i++)
5121  if (Group->getMember(i))
5122  Indices.push_back(i);
5123  }
5124 
5125  // Calculate the cost of the whole interleaved group.
5126  unsigned Cost = TTI.getInterleavedMemoryOpCost(
5127  I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5128  Group->getAlignment(), AS);
5129 
5130  if (Group->isReverse())
5131  Cost +=
5132  Group->getNumMembers() *
5133  TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5134 
5135  // FIXME: The interleaved load group with a huge gap could be even more
5136  // expensive than scalar operations. Then we could ignore such group and
5137  // use scalar operations instead.
5138  return Cost;
5139  }
5140 
5141  // Scalarized loads/stores.
5142  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5143  bool Reverse = ConsecutiveStride < 0;
5144  const DataLayout &DL = I->getModule()->getDataLayout();
5145  unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy);
5146  unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF;
5147  if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
5148  bool IsComplexComputation =
5149  isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
5150  unsigned Cost = 0;
5151  // The cost of extracting from the value vector and pointer vector.
5152  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5153  for (unsigned i = 0; i < VF; ++i) {
5154  // The cost of extracting the pointer operand.
5155  Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
5156  // In case of STORE, the cost of ExtractElement from the vector.
5157  // In case of LOAD, the cost of InsertElement into the returned
5158  // vector.
5159  Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :
5160  Instruction::InsertElement,
5161  VectorTy, i);
5162  }
5163 
5164  // The cost of the scalar loads/stores.
5165  Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
5166  Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5167  Alignment, AS);
5168  return Cost;
5169  }
5170 
5171  // Wide load/stores.
5172  unsigned Cost = TTI.getAddressComputationCost(VectorTy);
5173  if (Legal->isMaskRequired(I))
5174  Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment,
5175  AS);
5176  else
5177  Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
5178 
5179  if (Reverse)
5180  Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
5181  VectorTy, 0);
5182  return Cost;
5183  }
5184  case Instruction::ZExt:
5185  case Instruction::SExt:
5186  case Instruction::FPToUI:
5187  case Instruction::FPToSI:
5188  case Instruction::FPExt:
5189  case Instruction::PtrToInt:
5190  case Instruction::IntToPtr:
5191  case Instruction::SIToFP:
5192  case Instruction::UIToFP:
5193  case Instruction::Trunc:
5194  case Instruction::FPTrunc:
5195  case Instruction::BitCast: {
5196  // We optimize the truncation of induction variables.
5197  // The cost of these is the same as the scalar operation.
5198  if (I->getOpcode() == Instruction::Trunc &&
5199  Legal->isInductionVariable(I->getOperand(0)))
5200  return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
5201  I->getOperand(0)->getType());
5202 
5203  Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
5204  return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
5205  }
5206  case Instruction::Call: {
5207  bool NeedToScalarize;
5208  CallInst *CI = cast<CallInst>(I);
5209  unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
5210  if (getIntrinsicIDForCall(CI, TLI))
5211  return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
5212  return CallCost;
5213  }
5214  default: {
5215  // We are scalarizing the instruction. Return the cost of the scalar
5216  // instruction, plus the cost of insert and extract into vector
5217  // elements, times the vector width.
5218  unsigned Cost = 0;
5219 
5220  if (!RetTy->isVoidTy() && VF != 1) {
5221  unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
5222  VectorTy);
5223  unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
5224  VectorTy);
5225 
5226  // The cost of inserting the results plus extracting each one of the
5227  // operands.
5228  Cost += VF * (InsCost + ExtCost * I->getNumOperands());
5229  }
5230 
5231  // The cost of executing VF copies of the scalar instruction. This opcode
5232  // is unknown. Assume that it is the same as 'mul'.
5233  Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
5234  return Cost;
5235  }
5236  }// end of switch.
5237 }
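// Scalarization-cost sketch (annotation, hypothetical unit costs): for
// VF == 4 and a two-operand instruction with insert and extract costs of 1
// each, the default case above adds 4 * (1 + 1 * 2) = 12 for moving values
// in and out of vectors, on top of 4 times the 'mul'-equivalent scalar cost.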
5238 
5239 char LoopVectorize::ID = 0;
5240 static const char lv_name[] = "Loop Vectorization";
5241 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
5242 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
5243 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
5244 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
5245 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)
5246 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
5247 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
5248 INITIALIZE_PASS_DEPENDENCY(LCSSA)
5249 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
5250 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
5251 INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
5252 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
5253 
5254 namespace llvm {
5255  Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
5256  return new LoopVectorize(NoUnrolling, AlwaysVectorize);
5257  }
5258 }
5259 
5260 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
5261  // Check for a store.
5262  if (StoreInst *ST = dyn_cast<StoreInst>(Inst))
5263  return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;
5264 
5265  // Check for a load.
5266  if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
5267  return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;
5268 
5269  return false;
5270 }
5271 
5272 
5273 void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
5274  bool IfPredicateStore) {
5275  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
5276  // Holds vector parameters or scalars, in case of uniform vals.
5277  SmallVector<VectorParts, 4> Params;
5278 
5279  setDebugLocFromInst(Builder, Instr);
5280 
5281  // Find all of the vectorized parameters.
5282  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
5283  Value *SrcOp = Instr->getOperand(op);
5284 
5285  // If we are accessing the old induction variable, use the new one.
5286  if (SrcOp == OldInduction) {
5287  Params.push_back(getVectorValue(SrcOp));
5288  continue;
5289  }
5290 
5291  // Try using previously calculated values.
5292  Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
5293 
5294  // If the src is an instruction that appeared earlier in the basic block
5295  // then it should already be vectorized.
5296  if (SrcInst && OrigLoop->contains(SrcInst)) {
5297  assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
5298  // The parameter is a vector value from earlier.
5299  Params.push_back(WidenMap.get(SrcInst));
5300  } else {
5301  // The parameter is a scalar from outside the loop. Maybe even a constant.
5302  VectorParts Scalars;
5303  Scalars.append(UF, SrcOp);
5304  Params.push_back(Scalars);
5305  }
5306  }
5307 
5308  assert(Params.size() == Instr->getNumOperands() &&
5309  "Invalid number of operands");
5310 
5311  // Does this instruction return a value?
5312  bool IsVoidRetTy = Instr->getType()->isVoidTy();
5313 
5314  Value *UndefVec = IsVoidRetTy ? nullptr :
5315  UndefValue::get(Instr->getType());
5316  // Create a new entry in the WidenMap and initialize it to Undef or Null.
5317  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
5318 
5319  Instruction *InsertPt = Builder.GetInsertPoint();
5320  BasicBlock *IfBlock = Builder.GetInsertBlock();
5321  BasicBlock *CondBlock = nullptr;
5322 
5323  VectorParts Cond;
5324  Loop *VectorLp = nullptr;
5325  if (IfPredicateStore) {
5326  assert(Instr->getParent()->getSinglePredecessor() &&
5327  "Only support single predecessor blocks");
5328  Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
5329  Instr->getParent());
5330  VectorLp = LI->getLoopFor(IfBlock);
5331  assert(VectorLp && "Must have a loop for this block");
5332  }
5333 
5334  // For each vector unroll 'part':
5335  for (unsigned Part = 0; Part < UF; ++Part) {
5336  // For each scalar that we create:
5337 
5338  // Start an "if (pred) a[i] = ..." block.
5339  Value *Cmp = nullptr;
5340  if (IfPredicateStore) {
5341  if (Cond[Part]->getType()->isVectorTy())
5342  Cond[Part] =
5343  Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
5344  Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
5345  ConstantInt::get(Cond[Part]->getType(), 1));
5346  CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
5347  LoopVectorBody.push_back(CondBlock);
5348  VectorLp->addBasicBlockToLoop(CondBlock, *LI);
5349  // Update Builder with newly created basic block.
5350  Builder.SetInsertPoint(InsertPt);
5351  }
5352 
5353  Instruction *Cloned = Instr->clone();
5354  if (!IsVoidRetTy)
5355  Cloned->setName(Instr->getName() + ".cloned");
5356  // Replace the operands of the cloned instructions with extracted scalars.
5357  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
5358  Value *Op = Params[op][Part];
5359  Cloned->setOperand(op, Op);
5360  }
5361 
5362  // Place the cloned scalar in the new loop.
5363  Builder.Insert(Cloned);
5364 
5365  // If the original scalar returns a value we need to place it in a vector
5366  // so that future users will be able to use it.
5367  if (!IsVoidRetTy)
5368  VecResults[Part] = Cloned;
5369 
5370  // End if-block.
5371  if (IfPredicateStore) {
5372  BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
5373  LoopVectorBody.push_back(NewIfBlock);
5374  VectorLp->addBasicBlockToLoop(NewIfBlock, *LI);
5375  Builder.SetInsertPoint(InsertPt);
5376  ReplaceInstWithInst(IfBlock->getTerminator(),
5377  BranchInst::Create(CondBlock, NewIfBlock, Cmp));
5378  IfBlock = NewIfBlock;
5379  }
5380  }
5381 }
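// Control-flow sketch (annotation): with UF == 2 and IfPredicateStore set,
// the loop above emits two "if (pred) { scalar op }" diamonds, one per
// unrolled part, each guarded by a scalar condition extracted from that
// part's edge mask.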
5382 
5383 void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
5384  StoreInst *SI = dyn_cast<StoreInst>(Instr);
5385  bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));
5386 
5387  return scalarizeInstruction(Instr, IfPredicateStore);
5388 }
5389 
5390 Value *InnerLoopUnroller::reverseVector(Value *Vec) {
5391  return Vec;
5392 }
5393 
5394 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) {
5395  return V;
5396 }
5397 
5398 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) {
5399  // When unrolling and the VF is 1, we only need to add a simple scalar.
5400  Type *ITy = Val->getType();
5401  assert(!ITy->isVectorTy() && "Val must be a scalar");
5402  Constant *C = ConstantInt::get(ITy, StartIdx);
5403  return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
5404 }
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:82
static unsigned RuntimeMemoryCheckThreshold
\brief When performing memory disambiguation checks at runtime do not make more than this number of c...
VectorType::iterator iterator
Definition: MapVector.h:39
Value * CreateGEP(Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition: IRBuilder.h:1032
static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract, const TargetTransformInfo &TTI)
Estimate the overhead of scalarizing a value.
Value * getValueOperand()
Definition: Instructions.h:406
iplist< Instruction >::iterator eraseFromParent()
eraseFromParent - This method unlinks 'this' from the containing basic block and deletes it...
Definition: Instruction.cpp:70
Intrinsic::ID getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:104
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
Definition: InstrTypes.h:649
void ReplaceInstWithInst(BasicBlock::InstListType &BIL, BasicBlock::iterator &BI, Instruction *I)
ReplaceInstWithInst - Replace the instruction specified by BI with the instruction specified by I...
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:679
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
static IntegerType * getInt1Ty(LLVMContext &C)
Definition: Type.cpp:236
void addIncoming(Value *V, BasicBlock *BB)
addIncoming - Add an incoming value to the end of the PHI list
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
const_iterator begin() const
Definition: IntervalMap.h:1100
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function. ...
Definition: Function.cpp:223
bool isOne() const
isOne - Return true if the expression is a constant one.
STATISTIC(NumFunctions,"Total number of functions")
ValueT lookup(const KeyT &Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:159
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
int getWidth()
Get the width of a number.
Definition: ScaledNumber.h:43
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:114
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:743
static Constant * getSequentialMask(IRBuilder<> &Builder, unsigned NumInt, unsigned NumUndef)
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:360
reverse_iterator rend()
Definition: MapVector.h:53
Min/max implemented in terms of select(cmp()).
Definition: LoopUtils.h:69
unsigned getNumOperands() const
Definition: User.h:138
static Value * ConcatenateVectors(IRBuilder<> &Builder, ArrayRef< Value * > InputList)
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:942
value_op_iterator value_op_begin()
Definition: User.h:209
ScalarEvolution - This class is the main scalar evolution driver.
iterator end() const
Definition: ArrayRef.h:123
bool isInductionPHI(PHINode *, ScalarEvolution *, ConstantInt *&)
Checks if the given PHINode in a loop header is an induction variable.
Definition: LoopUtils.cpp:455
bool endswith(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:224
CallInst - This class represents a function call, abstracting a target machine's calling convention...
size_type count(PtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:276
An immutable pass that tracks lazily created AssumptionCache objects.
static bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes)
Returns true if Phi is a reduction in TheLoop.
Definition: LoopUtils.cpp:314
A cache of .assume calls within a function.
StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:405
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Don't vectorize loops with a constant ""trip count that is smaller than this ""value."))
We don't vectorize loops with a known constant trip count below this number.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
getStepRecurrence - This method constructs and returns the recurrence indicating how much this expres...
Externally visible function.
Definition: GlobalValue.h:40
void initializeLoopVectorizePass(PassRegistry &)
This class implements a map that also provides access to all stored values in a deterministic order...
Definition: MapVector.h:32
bool isLoopInvariant(const SCEV *S, const Loop *L)
isLoopInvariant - Return true if the value of the given SCEV is unchanging in the specified loop...
value_op_iterator value_op_end()
Definition: User.h:212
LoopT * getParentLoop() const
Definition: LoopInfo.h:97
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:111
A debug info location.
Definition: DebugLoc.h:34
Metadata node.
Definition: Metadata.h:740
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.h:225
F(f)
LoadInst - an instruction for reading from memory.
Definition: Instructions.h:177
const_iterator end() const
Definition: IntervalMap.h:1112
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Return the function type for an intrinsic.
Definition: Function.cpp:822
#define op(i)
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: Type.cpp:216
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:1522
bool erase(const T &V)
Definition: SmallSet.h:96
void emitOptimizationRemark(LLVMContext &Ctx, const char *PassName, const Function &Fn, const DebugLoc &DLoc, const Twine &Msg)
Emit an optimization-applied message.
const SCEV * replaceSymbolicStrideSCEV(ScalarEvolution *SE, const ValueToValueMap &PtrToStride, Value *Ptr, Value *OrigPtr=nullptr)
Return the SCEV corresponding to a pointer with the symbolic stride replaced with constant one...
static Value * ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1, Value *V2)
op_iterator op_begin()
Definition: User.h:183
BlockT * getHeader() const
Definition: LoopInfo.h:96
aarch64 collect AArch64 Collect Linker Optimization Hint(LOH)"
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:726
SCEVCastExpr - This is the base class for unary cast operator classes.
Type * getPointerElementType() const
Definition: Type.h:366
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:188
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:242
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:231
static void cse(SmallVector< BasicBlock *, 4 > &BBs)
Perform cse of induction variable instructions.
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:306
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:70
std::string str() const
Return the twine contents as a std::string.
Definition: Twine.cpp:16
bool empty() const
Definition: MapVector.h:56
SelectInst - This class represents the LLVM 'select' instruction.
bool isIdenticalTo(const Instruction *I) const
isIdenticalTo - Return true if the specified instruction is exactly identical to the current one...
void emitOptimizationRemarkAnalysis(LLVMContext &Ctx, const char *PassName, const Function &Fn, const DebugLoc &DLoc, const Twine &Msg)
Emit an optimization analysis remark message.
static const unsigned MaxVectorWidth
Maximum SIMD width.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:79
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal)
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:389
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:106
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr it the function does no...
Definition: BasicBlock.cpp:116
T LLVM_ATTRIBUTE_UNUSED_RESULT pop_back_val()
Definition: SmallVector.h:406
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:98
A Use represents the edge between a Value definition and its users.
Definition: Use.h:69
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< unsigned > MaxInterleaveGroupFactor("max-interleave-group-factor", cl::Hidden, cl::desc("Maximum factor for an interleaved access group (default = 8)"), cl::init(8))
Maximum factor for an interleaved memory access.
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:75
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APInt.h:33
unsigned getNumArgOperands() const
getNumArgOperands - Return the number of call arguments.
Instruction * getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:165
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1057
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:517
unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys) const
Number of individual test Apply this number of consecutive mutations to each input exit after the first new interesting input is found the minimized corpus is saved into the first input directory Number of jobs to run If min(jobs, NumberOfCpuCores()/2)\" is used.") FUZZER_FLAG_INT(reload
size_type size() const
Definition: SmallSet.h:48
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:250
static ConstantInt * ExtractElement(Constant *V, Constant *Idx)
Type * getVectorElementType() const
Definition: Type.h:364
Instruction * clone() const
clone() - Create a copy of 'this' instruction that is identical in all ways except the following: ...
#define false
Definition: ConvertUTF.c:65
static const unsigned TinyTripCountInterleaveThreshold
We don't interleave loops with a known constant trip count below this number.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:117
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:414
Value * getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp)
Get the stride of a pointer access in a loop.
FunctionType - Class to represent function types.
Definition: DerivedTypes.h:96
static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI)
void emitOptimizationRemarkMissed(LLVMContext &Ctx, const char *PassName, const Function &Fn, const DebugLoc &DLoc, const Twine &Msg)
Emit an optimization-missed message.
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:102
bool mayReadFromMemory() const
mayReadFromMemory - Return true if this instruction may read memory.
LLVMContext & getContext() const
getContext - Return the LLVMContext in which this type was uniqued.
Definition: Type.h:125
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
addBasicBlockToLoop - This method is used by other analyses to update loop information.
Definition: LoopInfoImpl.h:187
SCEVAddRecExpr - This node represents a polynomial recurrence on the trip count of the specified loop...
bool LLVM_ATTRIBUTE_UNUSED_RESULT empty() const
Definition: SmallVector.h:57
void addChildLoop(LoopT *NewChild)
addChildLoop - Add the specified loop to be a child of this loop.
Definition: LoopInfo.h:265
#define DEBUG_TYPE
Pass * createLoopVectorizePass(bool NoUnrolling=false, bool AlwaysVectorize=true)
static bool isValidElementType(Type *ElemTy)
isValidElementType - Return true if the specified type is valid as a element type.
Definition: Type.cpp:729
BasicBlock * getSuccessor(unsigned i) const
static FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
FunctionType::get - This static method is the primary way of constructing a FunctionType.
Definition: Type.cpp:361
static Instruction * getFirstInst(Instruction *FirstInst, Value *V, Instruction *Loc)
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Definition: ArrayRef.h:31
bool isFloatingPointTy() const
isFloatingPointTy - Return true if this is one of the six floating point types
Definition: Type.h:159
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:866
StoreInst - an instruction for storing to memory.
Definition: Instructions.h:316
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:109
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:351
unsigned getNumElements() const
Return the number of elements in the Vector type.
Definition: DerivedTypes.h:432
Reverse the order of the vector.
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for ""scalar loops."))
static Constant * getInterleavedMask(IRBuilder<> &Builder, unsigned VF, unsigned NumVec)
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:318
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:67
Type * getElementType() const
Definition: DerivedTypes.h:323
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:134
PointerType - Class to represent pointers.
Definition: DerivedTypes.h:449
unsigned getNumIncomingValues() const
getNumIncomingValues - Return the number of incoming edges
static bool canIfConvertPHINodes(BasicBlock *BB)
Check whether it is safe to if-convert this phi node.
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:188
Optimization analysis message produced during vectorization.
GetElementPtrInst - an instruction for type-safe pointer arithmetic to access elements of arrays and ...
Definition: Instructions.h:830
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
#define P(N)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:325
static CmpInst * Create(OtherOps Op, unsigned short predicate, Value *S1, Value *S2, const Twine &Name="", Instruction *InsertBefore=nullptr)
Construct a compare instruction, given the opcode, the predicate and the two operands.
unsigned getAlignment() const
getAlignment - Return the alignment of the access that is being performed
Definition: Instructions.h:365
Wrapper pass for TargetTransformInfo.
BlockT * getLoopPreheader() const
getLoopPreheader - If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:108
LLVM Basic Block Representation.
Definition: BasicBlock.h:65
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar ""reduction in a nested loop."))
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:45
size_type size() const
Definition: SmallPtrSet.h:79
void getAllMetadataOtherThanDebugLoc(SmallVectorImpl< std::pair< unsigned, MDNode * >> &MDs) const
getAllMetadataOtherThanDebugLoc - This does the same thing as getAllMetadata, except that it filters ...
Definition: Instruction.h:190
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:41
Type * getType() const
getType - Return the LLVM type of this SCEV expression.
BranchInst - Conditional or Unconditional Branch instruction.
Min/max implemented in terms of select(cmp()).
Definition: LoopUtils.h:72
bool isVectorTy() const
isVectorTy - True if this is an instance of VectorType.
Definition: Type.h:226
Value handle that tracks a Value across RAUW.
Definition: ValueHandle.h:280
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:704
This is an important base class in LLVM.
Definition: Constant.h:41
const Value * getCondition() const
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1339
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:32
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static Type * getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1)
static Value * addFastMathFlag(Value *V)
Adds a 'fast' flag to floating point operations.
#define H(x, y, z)
Definition: MD5.cpp:53
APInt Or(const APInt &LHS, const APInt &RHS)
Bitwise OR function for APInt.
Definition: APInt.h:1895
char & LCSSAID
Definition: LCSSA.cpp:312
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:264
APInt Xor(const APInt &LHS, const APInt &RHS)
Bitwise XOR function for APInt.
Definition: APInt.h:1900
static bool isLikelyComplexAddressComputation(Value *Ptr, LoopVectorizationLegality *Legal, ScalarEvolution *SE, const Loop *TheLoop)
Check whether the address computation for a non-consecutive memory access looks like an unlikely cand...
static Constant * getStridedMask(IRBuilder<> &Builder, unsigned Start, unsigned Stride, unsigned VF)
Interval::pred_iterator pred_begin(Interval *I)
pred_begin/pred_end - define methods so that Intervals may be used just like BasicBlocks can with the...
Definition: Interval.h:114
const DebugLoc & getDebugLoc() const
getDebugLoc - Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:230
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
Definition: LoopInfo.cpp:228
void setUnsafeAlgebra()
Definition: Operator.h:200
Represent the analysis usage information of a pass.
op_iterator op_end()
Definition: User.h:185
BasicBlock * getIncomingBlock(unsigned i) const
getIncomingBlock - Return incoming basic block number i.
bool contains(const LoopT *L) const
contains - Return true if the specified loop is contained within this loop.
Definition: LoopInfo.h:105
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1273
iterator begin() const
Definition: ArrayRef.h:122
std::vector< BasicBlock * >::const_reverse_iterator RPOIterator
Definition: LoopIterator.h:42
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:294
Value * getOperand(unsigned i) const
Definition: User.h:118
Interval::pred_iterator pred_end(Interval *I)
Definition: Interval.h:117
Value * getPointerOperand()
Definition: Instructions.h:284
int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, const ValueToValueMap &StridesMap)
Check the stride of the pointer and ensure that it does not wrap in the address space.
iterator begin() const
Definition: SmallPtrSet.h:286
std::pair< NoneType, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:69
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:760
static CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd)
Create a BitCast, AddrSpaceCast, or PtrToInt cast instruction.
static Constant * getAllOnesValue(Type *Ty)
Get the all ones value.
Definition: Constants.cpp:230
#define INITIALIZE_AG_DEPENDENCY(depName)
Definition: PassSupport.h:72
void append(in_iter in_start, in_iter in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:416
unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef< Type * > Tys) const
bool isPointerTy() const
isPointerTy - True if this is an instance of PointerType.
Definition: Type.h:217
static UndefValue * get(Type *T)
get() - Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1473
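For example, an undef vector commonly serves as the base of an insertelement chain; a minimal sketch:

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// An undef <4 x float> to be filled lane by lane with insertelement.
UndefValue *undefV4F(LLVMContext &Ctx) {
  return UndefValue::get(VectorType::get(Type::getFloatTy(Ctx), 4));
}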
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:519
RecurrenceKind getRecurrenceKind()
Definition: LoopUtils.h:162
PointerType * getPointerTo(unsigned AddrSpace=0)
getPointerTo - Return a pointer to the current type.
Definition: Type.cpp:764
static unsigned getRecurrenceBinOp(RecurrenceKind Kind)
Returns the opcode of binary operation corresponding to the RecurrenceKind.
Definition: LoopUtils.cpp:393
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:386
Value * stripIntegerCast(Value *V)
static void emitAnalysis(const LoopAccessReport &Message, const Function *TheFunction, const Loop *TheLoop, const char *PassName)
Emit an analysis note for PassName with the debug location from the instruction in Message if availab...
void setMetadata(unsigned KindID, MDNode *Node)
setMetadata - Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1083
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.h:460
bool mayWriteToMemory() const
mayWriteToMemory - Return true if this instruction may modify memory.
void emitLoopVectorizeWarning(LLVMContext &Ctx, const Function &Fn, const DebugLoc &DLoc, const Twine &Msg)
Emit a warning when loop vectorization is specified but fails.
char & LoopSimplifyID
OperandValueProperties
Additional properties of an operand's values.
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Definition: LoopUtils.h:58
bool isConditional() const
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:53
void setLoopID(MDNode *LoopID) const
Set the llvm.loop loop id metadata for this loop.
Definition: LoopInfo.cpp:262
IntegerType * getIntPtrType(LLVMContext &C, unsigned AddressSpace=0) const
Returns an integer type with size at least as big as that of a pointer in the given address space...
Definition: DataLayout.cpp:694
BinaryOps getOpcode() const
Definition: InstrTypes.h:323
StringRef getString() const
Definition: Metadata.cpp:375
unsigned getABITypeAlignment(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:674
static Constant * getSplat(unsigned NumElts, Constant *Elt)
getSplat - Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1162
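A sketch of broadcasting one scalar constant across all lanes:

#include "llvm/IR/Constants.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// <4 x float> with 1.0f in every element.
Constant *onesVector(LLVMContext &Ctx) {
  Constant *One = ConstantFP::get(Type::getFloatTy(Ctx), 1.0);
  return ConstantVector::getSplat(4, One);
}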
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:936
void emitLoopInterleaveWarning(LLVMContext &Ctx, const Function &Fn, const DebugLoc &DLoc, const Twine &Msg)
Emit a warning when loop interleaving is specified but fails.
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:217
See the file comment.
Definition: ValueMap.h:80
bool startswith(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:215
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:299
#define LV_NAME
This is the shared class of boolean and integer constants.
Definition: Constants.h:47
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Value * getIncomingValue(unsigned i) const
getIncomingValue - Return incoming value number i.
uint64_t getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:388
unsigned getVectorNumElements() const
Definition: Type.cpp:212
static void emitAnalysis(CallSite CS, const Twine &Msg)
Definition: Inliner.cpp:296
iterator end()
Definition: BasicBlock.h:233
unsigned getScalarSizeInBits() const LLVM_READONLY
getScalarSizeInBits - If this is a vector type, return the getPrimitiveSizeInBits value for the eleme...
Definition: Type.cpp:139
AnalysisUsage & addRequiredID(const void *ID)
Definition: Pass.cpp:276
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:57
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:861
bool isAllOnesValue() const
isAllOnesValue - Return true if the expression is a constant all-ones value.
Module.h This file contains the declarations for the Module class.
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:222
static cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for ""an instruction to a single constant value. Mostly ""useful for getting consistent testing."))
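The flags above all follow the same cl::opt pattern; a hypothetical flag in that style, with the common check for whether the user actually set it:

#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Hypothetical override knob, mirroring the style of the flags above.
static cl::opt<unsigned>
DemoCost("demo-instruction-cost", cl::init(0), cl::Hidden,
         cl::desc("Hypothetical example of a cost-override flag."));

unsigned pickCost(unsigned TargetCost) {
  // getNumOccurrences() distinguishes "left at default" from "set to 0".
  if (DemoCost.getNumOccurrences() > 0)
    return DemoCost;
  return TargetCost;
}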
Provides information about what library functions are available for the current target.
AddressSpace
Definition: NVPTXBaseInfo.h:22
static Type * convertPointerToIntegerType(const DataLayout &DL, Type *Ty)
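Judging from the declaration, a sketch of what such a helper does: map pointer types to the target's pointer-sized integer type and pass other types through.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Pointers become iN of pointer width for their address space;
// everything else is returned unchanged.
static Type *pointerToIntegerTy(const DataLayout &DL, Type *Ty) {
  if (Ty->isPointerTy())
    return DL.getIntPtrType(Ty->getContext(),
                            Ty->getPointerAddressSpace());
  return Ty;
}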
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:67
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:266
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:748
Drive the analysis of memory accesses in the loop.
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:582
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.cpp:597
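A sketch of the splat behavior noted above: handing ConstantInt::get a vector type yields a splat, while getSigned covers negative values.

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// <4 x i32> with 7 in every lane; the splat comes from the vector type.
Constant *splatOfSeven(LLVMContext &Ctx) {
  return ConstantInt::get(VectorType::get(Type::getInt32Ty(Ctx), 4), 7);
}

// -1 as a sign-extended scalar i32.
ConstantInt *minusOne(LLVMContext &Ctx) {
  return ConstantInt::getSigned(Type::getInt32Ty(Ctx), -1);
}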
Function * getCalledFunction() const
getCalledFunction - Return the function called, or null if this is an indirect function invocation...
static BranchInst * Create(BasicBlock *IfTrue, Instruction *InsertBefore=nullptr)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
reverse_iterator rbegin()
Definition: MapVector.h:51
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
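A minimal sketch of creating and wiring an induction phi; the blocks and Next are hypothetical:

#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// i32 phi with two reserved incoming edges, placed at the head of Header:
// 0 from the preheader, %next from the latch.
PHINode *makeIndVar(BasicBlock *Header, BasicBlock *Preheader,
                    BasicBlock *Latch, Value *Next) {
  PHINode *Phi = PHINode::Create(Type::getInt32Ty(Header->getContext()), 2,
                                 "iv", &Header->front());
  Phi->addIncoming(ConstantInt::get(Phi->getType(), 0), Preheader);
  Phi->addIncoming(Next, Latch);
  return Phi;
}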
ConstantInt * getValue() const
bool hasVectorInstrinsicScalarOpd(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the intrinsic has a scalar operand.
Definition: VectorUtils.cpp:62
static cl::opt< AlignMode > Align(cl::desc("Load/store alignment support"), cl::Hidden, cl::init(NoStrictAlign), cl::values(clEnumValN(StrictAlign,"aarch64-strict-align","Disallow all unaligned memory accesses"), clEnumValN(NoStrictAlign,"aarch64-no-strict-align","Allow unaligned memory accesses"), clEnumValEnd))
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:530
void setOperand(unsigned i, Value *Val)
Definition: User.h:122
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:123
Value * getArgOperand(unsigned i) const
getArgOperand/setArgOperand - Return/set the i-th call argument.
static Constant * getRecurrenceIdentity(RecurrenceKind K, Type *Tp)
Returns identity corresponding to the RecurrenceKind.
Definition: LoopUtils.cpp:367
Store the result of a depth first search within basic blocks contained by a single loop...
Definition: LoopIterator.h:38
VectorType - Class to represent vector types.
Definition: DerivedTypes.h:362
Class for arbitrary precision integers.
Definition: APInt.h:73
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:603
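A sketch of combining several fields into one hash, in the style used for access-group keys; the field names are hypothetical:

#include "llvm/ADT/Hashing.h"
using namespace llvm;

// One hash_code out of three heterogeneous fields.
hash_code hashAccess(unsigned Stride, const void *Ptr, bool IsWrite) {
  return hash_combine(Stride, Ptr, IsWrite);
}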
Value * getIncomingValueForBlock(const BasicBlock *BB) const
bool isIntegerTy() const
isIntegerTy - True if this is an instance of IntegerType.
Definition: Type.h:193
iterator_range< user_iterator > users()
Definition: Value.h:300
BasicBlock * getSinglePredecessor()
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:211
static Value * createMinMaxOp(IRBuilder<> &Builder, MinMaxRecurrenceKind RK, Value *Left, Value *Right)
Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
Definition: LoopUtils.cpp:418
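The select(cmp()) shape referred to above, in a minimal sketch for unsigned max:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// umax(L, R) spelled as the compare-plus-select the descriptor recognizes.
Value *emitUMax(IRBuilder<> &Builder, Value *L, Value *R) {
  Value *Cmp = Builder.CreateICmpUGT(L, R, "umax.cmp");
  return Builder.CreateSelect(Cmp, L, R, "umax");
}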
static const char lv_name[]
This class uses information about analyzed scalars to rewrite expressions in canonical form...
LLVM_ATTRIBUTE_UNUSED_RESULT std::enable_if< !is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:285
static const unsigned MaxInterleaveFactor
Maximum vectorization interleave count.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:481
std::vector< BlockT * >::const_iterator block_iterator
Definition: LoopInfo.h:140
Holds information about the memory runtime legality checks to verify that a group of pointers do not ...
const Type * getScalarType() const LLVM_READONLY
getScalarType - If this is a vector type, return the element type, otherwise return 'this'...
Definition: Type.cpp:51
APInt And(const APInt &LHS, const APInt &RHS)
Bitwise AND function for APInt.
Definition: APInt.h:1890
unsigned getGEPInductionOperand(const GetElementPtrInst *Gep)
Find the operand of the GEP that should be checked for consecutive stores.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1030
static cl::opt< bool > EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization."))
iterator end() const
Definition: SmallPtrSet.h:289
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:372
This analysis provides dependence information for the memory accesses of a loop.
Value * getCondition() const
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.h:217
bool isAggregateType() const
isAggregateType - Return true if the type is an aggregate type.
Definition: Type.h:260
static const size_t npos
Definition: StringRef.h:44
static unsigned getVectorCallCost(CallInst *CI, unsigned VF, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, bool &NeedToScalarize)
SCEV - This class represents an analyzed expression in the program.
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:239
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
bool isFunctionVectorizable(StringRef F, unsigned VF) const
std::error_code Check(std::error_code Err)
unsigned getAlignment() const
getAlignment - Return the alignment of the access that is being performed
Definition: Instructions.h:243
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, SmallPtrSetImpl< Value * > &Reductions)
Check that the instruction has outside loop users and is not an identified reduction variable...
#define I(x, y, z)
Definition: MD5.cpp:54
TerminatorInst * getTerminator()
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:124
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:651
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:3639
static cl::opt< unsigned, true > VectorizationFactor("force-vector-width", cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect."), cl::location(VectorizerParams::VectorizationFactor))
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for ""vectorized loops."))
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:348
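A minimal sketch; after the split, BB ends in an unconditional branch to the returned block, which starts at SplitPt:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Everything from SplitPt onward moves into the new block.
BasicBlock *splitAt(BasicBlock *BB, Instruction *SplitPt) {
  return BB->splitBasicBlock(SplitPt, "split.off");
}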
uint64_t getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type...
Definition: DataLayout.h:371
Collection of parameters shared between the Loop Vectorizer and the Loop Access Analysis.
iterator begin()
Definition: MapVector.h:46
iterator_range< op_iterator > arg_operands()
arg_operands - iteration adapter for range-for loops.
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:140
iterator end()
Definition: MapVector.h:48
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO ""heuristics minimizing code growth in cold regions and being more ""aggressive in hot regions."))
const ARM::ArchExtKind Kind
Use * op_iterator
Definition: User.h:178
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:32
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:935
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:465
unsigned getPrimitiveSizeInBits() const LLVM_READONLY
getPrimitiveSizeInBits - Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:121
uint64_t PowerOf2Floor(uint64_t A)
Returns the power of two which is less than or equal to the given value.
Definition: MathExtras.h:594
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:365
LLVM Value Representation.
Definition: Value.h:69
const SCEV * getSCEV(Value *V)
getSCEV - Return a SCEV expression for the full generality of the specified expression.
unsigned getOpcode() const
getOpcode() returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:112
static VectorType * get(Type *ElementType, unsigned NumElements)
VectorType::get - This static method is the primary way to construct an VectorType.
Definition: Type.cpp:713
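A sketch of the widening helper this enables, returning the scalar type unchanged when VF is one:

#include "llvm/IR/DerivedTypes.h"
#include <cassert>
using namespace llvm;

// i32 with VF == 8 becomes <8 x i32>; VF == 1 keeps the scalar type.
static Type *toVectorTy(Type *Scalar, unsigned VF) {
  assert(!Scalar->isVectorTy() && "expected a scalar type");
  return VF == 1 ? Scalar : VectorType::get(Scalar, VF);
}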
Disable implicit floating point insts.
Definition: Attributes.h:87
static bool blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, DominatorTree *DT)
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index=-1) const
uint64_t getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:507
bool empty() const
Definition: LoopInfo.h:135
#define DEBUG(X)
Definition: Debug.h:92
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:737
static unsigned VectorizationInterleave
Interleave factor as overridden by the user.
C - The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
bool isPowerOf2_32(uint32_t Value)
isPowerOf2_32 - This function returns true if the argument is a power of two > 0. ...
Definition: MathExtras.h:354
Convenience struct for specifying and reasoning about fast-math flags.
Definition: Operator.h:164
OperandValueKind
Additional information about an operand's possible values.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:40
A single uniqued string.
Definition: Metadata.h:508
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, const Twine &N="", Module *M=nullptr)
Definition: Function.h:121
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:203
static cl::opt< bool > EnableMemAccessVersioning("enable-mem-access-versioning", cl::init(true), cl::Hidden, cl::desc("Enable symbolic stride memory access versioning"))
This enables versioning on the strides of symbolically striding memory accesses in code like the foll...
This pass exposes codegen information to IR-level passes.
iterator getFirstInsertionPt()
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:194
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop)...
Definition: CodeMetrics.cpp:70
void setIncomingValue(unsigned i, Value *V)
static bool isInterleaveForced()
True if force-vector-interleave was specified by the user.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(false), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
Root of the metadata hierarchy.
Definition: Metadata.h:45
Value * getPointerOperand()
Definition: Instructions.h:409
int getBasicBlockIndex(const BasicBlock *BB) const
getBasicBlockIndex - Return the first index of the specified basic block in the value list for this P...
const BasicBlock * getParent() const
Definition: Instruction.h:72
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:169
unsigned getMaxSafeDepDistBytes() const
static bool isPredicatedBlock(unsigned BlockNum)
Check whether this block is a predicated block.
RecurrenceKind
This enum represents the kinds of recurrences that we support.
Definition: LoopUtils.h:62
bool isVoidTy() const
isVoidTy - Return true if this is 'void'.
Definition: Type.h:137
SCEVConstant - This class represents a constant integer value.
bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:110