1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
11 // and generates target-independent LLVM-IR.
12 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
13 // of instructions in order to estimate the profitability of vectorization.
14 //
15 // The loop vectorizer combines consecutive loop iterations into a single
16 // 'wide' iteration. After this transformation the index is incremented
17 // by the SIMD vector width, and not by one.
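//
// For example (illustrative sketch, not taken from this file): a scalar loop
//
//   for (i = 0; i < n; ++i)
//     A[i] = B[i] + K;
//
// is conceptually rewritten with a vectorization factor of 4 so that each
// wide iteration computes A[i..i+3] = B[i..i+3] + K and the induction
// variable is incremented by 4; leftover iterations run in a scalar epilogue.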
18 //
19 // This pass has four parts:
20 // 1. The main loop pass that drives the different parts.
21 // 2. LoopVectorizationLegality - A unit that checks for the legality
22 // of the vectorization.
23 // 3. InnerLoopVectorizer - A unit that performs the actual
24 // widening of instructions.
25 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
26 // of vectorization. It decides on the optimal vector width, which
27 // can be one, if vectorization is not profitable.
28 //
29 //===----------------------------------------------------------------------===//
30 //
31 // The reduction-variable vectorization is based on the paper:
32 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
33 //
34 // Variable uniformity checks are inspired by:
35 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
36 //
37 // The interleaved access vectorization is based on the paper:
38 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
39 // Data for SIMD
40 //
41 // Other ideas/concepts are from:
42 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
43 //
44 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
45 // Vectorizing Compilers.
46 //
47 //===----------------------------------------------------------------------===//
48 
50 #include "llvm/ADT/DenseMap.h"
51 #include "llvm/ADT/Hashing.h"
52 #include "llvm/ADT/MapVector.h"
53 #include "llvm/ADT/Optional.h"
54 #include "llvm/ADT/SCCIterator.h"
55 #include "llvm/ADT/SetVector.h"
56 #include "llvm/ADT/SmallPtrSet.h"
57 #include "llvm/ADT/SmallSet.h"
58 #include "llvm/ADT/SmallVector.h"
59 #include "llvm/ADT/Statistic.h"
60 #include "llvm/ADT/StringExtras.h"
63 #include "llvm/Analysis/LoopInfo.h"
65 #include "llvm/Analysis/LoopPass.h"
70 #include "llvm/IR/Constants.h"
71 #include "llvm/IR/DataLayout.h"
72 #include "llvm/IR/DebugInfo.h"
73 #include "llvm/IR/DerivedTypes.h"
74 #include "llvm/IR/DiagnosticInfo.h"
75 #include "llvm/IR/Dominators.h"
76 #include "llvm/IR/Function.h"
77 #include "llvm/IR/IRBuilder.h"
78 #include "llvm/IR/Instructions.h"
79 #include "llvm/IR/IntrinsicInst.h"
80 #include "llvm/IR/LLVMContext.h"
81 #include "llvm/IR/Module.h"
82 #include "llvm/IR/PatternMatch.h"
83 #include "llvm/IR/Type.h"
84 #include "llvm/IR/User.h"
85 #include "llvm/IR/Value.h"
86 #include "llvm/IR/ValueHandle.h"
87 #include "llvm/IR/Verifier.h"
88 #include "llvm/Pass.h"
91 #include "llvm/Support/Debug.h"
93 #include "llvm/Transforms/Scalar.h"
100 #include <algorithm>
101 #include <map>
102 #include <tuple>
103 
104 using namespace llvm;
105 using namespace llvm::PatternMatch;
106 
107 #define LV_NAME "loop-vectorize"
108 #define DEBUG_TYPE LV_NAME
109 
110 STATISTIC(LoopsVectorized, "Number of loops vectorized");
111 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
112 
113 static cl::opt<bool>
114  EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
115  cl::desc("Enable if-conversion during vectorization."));
116 
117 /// Loops with a known constant trip count below this number are vectorized only
118 /// if no scalar iteration overheads are incurred.
119 static cl::opt<unsigned> TinyTripCountVectorThreshold(
120  "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
121  cl::desc("Loops with a constant trip count that is smaller than this "
122  "value are vectorized only if no scalar iteration overheads "
123  "are incurred."));
124 
125 static cl::opt<bool> MaximizeBandwidth(
126  "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
127  cl::desc("Maximize bandwidth when selecting vectorization factor which "
128  "will be determined by the smallest type in loop."));
129 
130 static cl::opt<bool> EnableInterleavedMemAccesses(
131  "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
132  cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
133 
134 /// Maximum factor for an interleaved memory access.
135 static cl::opt<unsigned> MaxInterleaveGroupFactor(
136  "max-interleave-group-factor", cl::Hidden,
137  cl::desc("Maximum factor for an interleaved access group (default = 8)"),
138  cl::init(8));
139 
140 /// We don't interleave loops with a known constant trip count below this
141 /// number.
142 static const unsigned TinyTripCountInterleaveThreshold = 128;
143 
144 static cl::opt<unsigned> ForceTargetNumScalarRegs(
145  "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
146  cl::desc("A flag that overrides the target's number of scalar registers."));
147 
148 static cl::opt<unsigned> ForceTargetNumVectorRegs(
149  "force-target-num-vector-regs", cl::init(0), cl::Hidden,
150  cl::desc("A flag that overrides the target's number of vector registers."));
151 
152 /// Maximum vectorization interleave count.
153 static const unsigned MaxInterleaveFactor = 16;
154 
155 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
156  "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
157  cl::desc("A flag that overrides the target's max interleave factor for "
158  "scalar loops."));
159 
160 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
161  "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
162  cl::desc("A flag that overrides the target's max interleave factor for "
163  "vectorized loops."));
164 
165 static cl::opt<unsigned> ForceTargetInstructionCost(
166  "force-target-instruction-cost", cl::init(0), cl::Hidden,
167  cl::desc("A flag that overrides the target's expected cost for "
168  "an instruction to a single constant value. Mostly "
169  "useful for getting consistent testing."));
170 
171 static cl::opt<unsigned> SmallLoopCost(
172  "small-loop-cost", cl::init(20), cl::Hidden,
173  cl::desc(
174  "The cost of a loop that is considered 'small' by the interleaver."));
175 
176 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
177  "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
178  cl::desc("Enable the use of the block frequency analysis to access PGO "
179  "heuristics minimizing code growth in cold regions and being more "
180  "aggressive in hot regions."));
181 
182 // Runtime interleave loops for load/store throughput.
183 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
184  "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
185  cl::desc(
186  "Enable runtime interleaving until load/store ports are saturated"));
187 
188 /// The number of stores in a loop that are allowed to need predication.
189 static cl::opt<unsigned> NumberOfStoresToPredicate(
190  "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
191  cl::desc("Max number of stores to be predicated behind an if."));
192 
193 static cl::opt<bool> EnableIndVarRegisterHeur(
194  "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
195  cl::desc("Count the induction variable only once when interleaving"));
196 
197 static cl::opt<bool> EnableCondStoresVectorization(
198  "enable-cond-stores-vec", cl::init(true), cl::Hidden,
199  cl::desc("Enable if predication of stores during vectorization."));
200 
201 static cl::opt<unsigned> MaxNestedScalarReductionIC(
202  "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
203  cl::desc("The maximum interleave count to use when interleaving a scalar "
204  "reduction in a nested loop."));
205 
206 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
207  "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
208  cl::desc("The maximum allowed number of runtime memory checks with a "
209  "vectorize(enable) pragma."));
210 
211 static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
212  "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
213  cl::desc("The maximum number of SCEV checks allowed."));
214 
215 static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
216  "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
217  cl::desc("The maximum number of SCEV checks allowed with a "
218  "vectorize(enable) pragma"));
219 
220 /// Create an analysis remark that explains why vectorization failed
221 ///
222 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
223 /// RemarkName is the identifier for the remark. If \p I is passed it is an
224 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for
225 /// the location of the remark. \return the remark object that can be
226 /// streamed to.
227 static OptimizationRemarkAnalysis
228 createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
229  Instruction *I = nullptr) {
230  Value *CodeRegion = TheLoop->getHeader();
231  DebugLoc DL = TheLoop->getStartLoc();
232 
233  if (I) {
234  CodeRegion = I->getParent();
235  // If there is no debug location attached to the instruction, fall back to
236  // using the loop's.
237  if (I->getDebugLoc())
238  DL = I->getDebugLoc();
239  }
240 
241  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
242  R << "loop not vectorized: ";
243  return R;
244 }
245 
246 namespace {
247 
248 // Forward declarations.
249 class LoopVectorizeHints;
250 class LoopVectorizationLegality;
251 class LoopVectorizationCostModel;
252 class LoopVectorizationRequirements;
253 
254 /// Returns true if the given loop body has a cycle, excluding the loop
255 /// itself.
256 static bool hasCyclesInLoopBody(const Loop &L) {
257  if (!L.empty())
258  return true;
259 
260  for (const auto &SCC :
261  make_range(scc_iterator<Loop, LoopBodyTraits>::begin(L),
262  scc_iterator<Loop, LoopBodyTraits>::end(L))) {
263  if (SCC.size() > 1) {
264  DEBUG(dbgs() << "LVL: Detected a cycle in the loop body:\n");
265  DEBUG(L.dump());
266  return true;
267  }
268  }
269  return false;
270 }
271 
272 /// A helper function for converting Scalar types to vector types.
273 /// If the incoming type is void, we return void. If the VF is 1, we return
274 /// the scalar type.
275 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
276  if (Scalar->isVoidTy() || VF == 1)
277  return Scalar;
278  return VectorType::get(Scalar, VF);
279 }
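// Illustrative examples (assumed IR types, not from the original source):
// ToVectorTy(i32, 4) yields <4 x i32>, while ToVectorTy(i32, 1) and
// ToVectorTy(void, 8) return the input type unchanged.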
280 
281 // FIXME: The following helper functions have multiple implementations
282 // in the project. They can be effectively organized in a common Load/Store
283 // utilities unit.
284 
285 /// A helper function that returns the pointer operand of a load or store
286 /// instruction.
287 static Value *getPointerOperand(Value *I) {
288  if (auto *LI = dyn_cast<LoadInst>(I))
289  return LI->getPointerOperand();
290  if (auto *SI = dyn_cast<StoreInst>(I))
291  return SI->getPointerOperand();
292  return nullptr;
293 }
294 
295 /// A helper function that returns the type of loaded or stored value.
296 static Type *getMemInstValueType(Value *I) {
297  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
298  "Expected Load or Store instruction");
299  if (auto *LI = dyn_cast<LoadInst>(I))
300  return LI->getType();
301  return cast<StoreInst>(I)->getValueOperand()->getType();
302 }
303 
304 /// A helper function that returns the alignment of load or store instruction.
305 static unsigned getMemInstAlignment(Value *I) {
306  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
307  "Expected Load or Store instruction");
308  if (auto *LI = dyn_cast<LoadInst>(I))
309  return LI->getAlignment();
310  return cast<StoreInst>(I)->getAlignment();
311 }
312 
313 /// A helper function that returns the address space of the pointer operand of
314 /// load or store instruction.
315 static unsigned getMemInstAddressSpace(Value *I) {
316  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
317  "Expected Load or Store instruction");
318  if (auto *LI = dyn_cast<LoadInst>(I))
319  return LI->getPointerAddressSpace();
320  return cast<StoreInst>(I)->getPointerAddressSpace();
321 }
322 
323 /// A helper function that returns true if the given type is irregular. The
324 /// type is irregular if its allocated size doesn't equal the store size of an
325 /// element of the corresponding vector type at the given vectorization factor.
326 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
327 
328  // Determine if an array of VF elements of type Ty is "bitcast compatible"
329  // with a <VF x Ty> vector.
330  if (VF > 1) {
331  auto *VectorTy = VectorType::get(Ty, VF);
332  return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
333  }
334 
335  // If the vectorization factor is one, we just check if an array of type Ty
336  // requires padding between elements.
337  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
338 }
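// Illustrative, data-layout dependent example (not from the original source):
// on a typical x86-64 data layout, x86_fp80 is irregular even at VF = 1
// because its allocation size (128 bits) differs from its type size (80
// bits), whereas i32 is regular at any VF.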
339 
340 /// A helper function that returns the reciprocal of the block probability of
341 /// predicated blocks. If we return X, we are assuming the predicated block
342 /// will execute once for every X iterations of the loop header.
343 ///
344 /// TODO: We should use actual block probability here, if available. Currently,
345 /// we always assume predicated blocks have a 50% chance of executing.
346 static unsigned getReciprocalPredBlockProb() { return 2; }
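// For instance (illustrative): with the value 2 returned here, the cost model
// divides the cost of a predicated block by 2, modeling that the block runs
// on roughly half of the header's iterations.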
347 
348 /// A helper function that adds a 'fast' flag to floating-point operations.
349 static Value *addFastMathFlag(Value *V) {
350  if (isa<FPMathOperator>(V)) {
351  FastMathFlags Flags;
352  Flags.setUnsafeAlgebra();
353  cast<Instruction>(V)->setFastMathFlags(Flags);
354  }
355  return V;
356 }
357 
358 /// A helper function that returns an integer or floating-point constant with
359 /// value C.
360 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
361  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
362  : ConstantFP::get(Ty, C);
363 }
364 
365 /// InnerLoopVectorizer vectorizes loops which contain only one basic
366 /// block to a specified vectorization factor (VF).
367 /// This class performs the widening of scalars into vectors, or multiple
368 /// scalars. This class also implements the following features:
369 /// * It inserts an epilogue loop for handling loops that don't have iteration
370 /// counts that are known to be a multiple of the vectorization factor.
371 /// * It handles the code generation for reduction variables.
372 /// * Scalarization (implementation using scalars) of un-vectorizable
373 /// instructions.
374 /// InnerLoopVectorizer does not perform any vectorization-legality
375 /// checks, and relies on the caller to check for the different legality
376 /// aspects. The InnerLoopVectorizer relies on the
377 /// LoopVectorizationLegality class to provide information about the induction
378 /// and reduction variables that were found for a given vectorization factor.
379 class InnerLoopVectorizer {
380 public:
381  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
382  LoopInfo *LI, DominatorTree *DT,
383  const TargetLibraryInfo *TLI,
384  const TargetTransformInfo *TTI, AssumptionCache *AC,
385  OptimizationRemarkEmitter *ORE, unsigned VecWidth,
386  unsigned UnrollFactor, LoopVectorizationLegality *LVL,
387  LoopVectorizationCostModel *CM)
388  : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
389  AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
390  Builder(PSE.getSE()->getContext()), Induction(nullptr),
391  OldInduction(nullptr), VectorLoopValueMap(UnrollFactor, VecWidth),
392  TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM),
393  AddedSafetyChecks(false) {}
394 
395  /// Create a new empty loop. Unlink the old loop and connect the new one.
396  void createVectorizedLoopSkeleton();
397 
398  /// Vectorize a single instruction within the innermost loop.
399  void vectorizeInstruction(Instruction &I);
400 
401  /// Fix the vectorized code, taking care of header PHIs, live-outs, and more.
402  void fixVectorizedLoop();
403 
404  // Return true if any runtime check is added.
405  bool areSafetyChecksAdded() { return AddedSafetyChecks; }
406 
407  virtual ~InnerLoopVectorizer() {}
408 
409 protected:
410  /// A small list of PHINodes.
411  typedef SmallVector<PHINode *, 4> PhiVector;
412 
413  /// A type for vectorized values in the new loop. Each value from the
414  /// original loop, when vectorized, is represented by UF vector values in the
415  /// new unrolled loop, where UF is the unroll factor.
416  typedef SmallVector<Value *, 2> VectorParts;
417 
418  /// A type for scalarized values in the new loop. Each value from the
419  /// original loop, when scalarized, is represented by UF x VF scalar values
420  /// in the new unrolled loop, where UF is the unroll factor and VF is the
421  /// vectorization factor.
422  typedef SmallVector<SmallVector<Value *, 4>, 2> ScalarParts;
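  // Illustrative example (not from the original source): with UF = 2 and
  // VF = 4, a scalarized value is stored as 2 groups of 4 scalars, one scalar
  // per (unroll part, vector lane) pair.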
423 
424  // When we if-convert we need to create edge masks. We have to cache values
425  // so that we don't end up with exponential recursion/IR.
426  typedef DenseMap<std::pair<BasicBlock *, BasicBlock *>, VectorParts>
427  EdgeMaskCacheTy;
428  typedef DenseMap<BasicBlock *, VectorParts> BlockMaskCacheTy;
429 
430  /// Set up the values of the IVs correctly when exiting the vector loop.
431  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
432  Value *CountRoundDown, Value *EndValue,
433  BasicBlock *MiddleBlock);
434 
435  /// Create a new induction variable inside L.
436  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
437  Value *Step, Instruction *DL);
438 
439  /// Handle all cross-iteration phis in the header.
440  void fixCrossIterationPHIs();
441 
442  /// Fix a first-order recurrence. This is the second phase of vectorizing
443  /// this phi node.
444  void fixFirstOrderRecurrence(PHINode *Phi);
445 
446  /// Fix a reduction cross-iteration phi. This is the second phase of
447  /// vectorizing this phi node.
448  void fixReduction(PHINode *Phi);
449 
450  /// \brief The Loop exit block may have single value PHI nodes with some
451  /// incoming value. While vectorizing we only handled real values
452  /// that were defined inside the loop and we should have one value for
453  /// each predecessor of its parent basic block. See PR14725.
454  void fixLCSSAPHIs();
455 
456  /// Iteratively sink the scalarized operands of a predicated instruction into
457  /// the block that was created for it.
458  void sinkScalarOperands(Instruction *PredInst);
459 
460  /// Predicate conditional instructions that require predication on their
461  /// respective conditions.
462  void predicateInstructions();
463 
464  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
465  /// represented as.
466  void truncateToMinimalBitwidths();
467 
468  /// A helper function that computes the predicate of the block BB, assuming
469  /// that the header block of the loop is set to True. It returns the *entry*
470  /// mask for the block BB.
471  VectorParts createBlockInMask(BasicBlock *BB);
472  /// A helper function that computes the predicate of the edge between SRC
473  /// and DST.
474  VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
475 
476  /// Vectorize a single PHINode in a block. This method handles the induction
477  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
478  /// arbitrary length vectors.
479  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
480 
481  /// Insert the new loop to the loop hierarchy and pass manager
482  /// and update the analysis passes.
483  void updateAnalysis();
484 
485  /// This instruction is un-vectorizable. Implement it as a sequence
486  /// of scalars. If \p IfPredicateInstr is true we need to 'hide' each
487  /// scalarized instruction behind an if block predicated on the control
488  /// dependence of the instruction.
489  void scalarizeInstruction(Instruction *Instr, bool IfPredicateInstr = false);
490 
491  /// Vectorize Load and Store instructions.
492  virtual void vectorizeMemoryInstruction(Instruction *Instr);
493 
494  /// Create a broadcast instruction. This method generates a broadcast
495  /// instruction (shuffle) for loop invariant values and for the induction
496  /// value. If this is the induction variable then we extend it to N, N+1, ...
497  /// this is needed because each iteration in the loop corresponds to a SIMD
498  /// element.
499  virtual Value *getBroadcastInstrs(Value *V);
500 
501  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
502  /// to each vector element of Val. The sequence starts at StartIdx.
503  /// \p Opcode is relevant for FP induction variable.
504  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
505  Instruction::BinaryOps Opcode =
506  Instruction::BinaryOpsEnd);
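  // Illustrative example (not from the original source): for a splat
  // <4 x i32> value %v, StartIdx = 0 and Step = 1, the result is
  // <%v+0, %v+1, %v+2, %v+3>, i.e. one step per SIMD lane.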
507 
508  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
509  /// variable on which to base the steps, \p Step is the size of the step, and
510  /// \p EntryVal is the value from the original loop that maps to the steps.
511  /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
512  /// can be a truncate instruction).
513  void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal,
514  const InductionDescriptor &ID);
515 
516  /// Create a vector induction phi node based on an existing scalar one. \p
517  /// EntryVal is the value from the original loop that maps to the vector phi
518  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
519  /// truncate instruction, instead of widening the original IV, we widen a
520  /// version of the IV truncated to \p EntryVal's type.
521  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
522  Value *Step, Instruction *EntryVal);
523 
524  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
525  /// is provided, the integer induction variable will first be truncated to
526  /// the corresponding type.
527  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
528 
529  /// Returns true if an instruction \p I should be scalarized instead of
530  /// vectorized for the chosen vectorization factor.
531  bool shouldScalarizeInstruction(Instruction *I) const;
532 
533  /// Returns true if we should generate a scalar version of \p IV.
534  bool needsScalarInduction(Instruction *IV) const;
535 
536  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
537  /// vector or scalar value on-demand if one is not yet available. When
538  /// vectorizing a loop, we visit the definition of an instruction before its
539  /// uses. When visiting the definition, we either vectorize or scalarize the
540  /// instruction, creating an entry for it in the corresponding map. (In some
541  /// cases, such as induction variables, we will create both vector and scalar
542  /// entries.) Then, as we encounter uses of the definition, we derive values
543  /// for each scalar or vector use unless such a value is already available.
544  /// For example, if we scalarize a definition and one of its uses is vector,
545  /// we build the required vector on-demand with an insertelement sequence
546  /// when visiting the use. Otherwise, if the use is scalar, we can use the
547  /// existing scalar definition.
548  ///
549  /// Return a value in the new loop corresponding to \p V from the original
550  /// loop at unroll index \p Part. If the value has already been vectorized,
551  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
552  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
553  /// a new vector value on-demand by inserting the scalar values into a vector
554  /// with an insertelement sequence. If the value has been neither vectorized
555  /// nor scalarized, it must be loop invariant, so we simply broadcast the
556  /// value into a vector.
557  Value *getOrCreateVectorValue(Value *V, unsigned Part);
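  // Illustrative example (not from the original source): if %a was scalarized
  // with UF = 1 and VF = 4 and a later user needs it as a vector,
  // getOrCreateVectorValue(%a, 0) assembles the four scalar lane values with
  // an insertelement sequence and caches the result for subsequent uses.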
558 
559  /// Return a value in the new loop corresponding to \p V from the original
560  /// loop at unroll index \p Part and vector index \p Lane. If the value has
561  /// been vectorized but not scalarized, the necessary extractelement
562  /// instruction will be generated.
563  Value *getOrCreateScalarValue(Value *V, unsigned Part, unsigned Lane);
564 
565  /// Try to vectorize the interleaved access group that \p Instr belongs to.
566  void vectorizeInterleaveGroup(Instruction *Instr);
567 
568  /// Generate a shuffle sequence that will reverse the vector Vec.
569  virtual Value *reverseVector(Value *Vec);
570 
571  /// Returns (and creates if needed) the original loop trip count.
572  Value *getOrCreateTripCount(Loop *NewLoop);
573 
574  /// Returns (and creates if needed) the trip count of the widened loop.
575  Value *getOrCreateVectorTripCount(Loop *NewLoop);
576 
577  /// Emit a bypass check to see if the vector trip count is zero, including if
578  /// it overflows.
579  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
580  /// Emit a bypass check to see if all of the SCEV assumptions we've
581  /// had to make are correct.
582  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
583  /// Emit bypass checks to check any memory assumptions we may have made.
584  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
585 
586  /// Add additional metadata to \p To that was not present on \p Orig.
587  ///
588  /// Currently this is used to add the noalias annotations based on the
589  /// inserted memchecks. Use this for instructions that are *cloned* into the
590  /// vector loop.
591  void addNewMetadata(Instruction *To, const Instruction *Orig);
592 
593  /// Add metadata from one instruction to another.
594  ///
595  /// This includes both the original MDs from \p From and additional ones (\see
596  /// addNewMetadata). Use this for *newly created* instructions in the vector
597  /// loop.
598  void addMetadata(Instruction *To, Instruction *From);
599 
600  /// \brief Similar to the previous function but it adds the metadata to a
601  /// vector of instructions.
602  void addMetadata(ArrayRef<Value *> To, Instruction *From);
603 
604  /// \brief Set the debug location in the builder using the debug location in
605  /// the instruction.
606  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
607 
608  /// This is a helper class for maintaining vectorization state. It's used for
609  /// mapping values from the original loop to their corresponding values in
610  /// the new loop. Two mappings are maintained: one for vectorized values and
611  /// one for scalarized values. Vectorized values are represented with UF
612  /// vector values in the new loop, and scalarized values are represented with
613  /// UF x VF scalar values in the new loop. UF and VF are the unroll and
614  /// vectorization factors, respectively.
615  ///
616  /// Entries can be added to either map with setVectorValue and setScalarValue,
617  /// which assert that an entry was not already added before. If an entry is to
618  /// replace an existing one, call resetVectorValue. This is currently needed
619  /// to modify the mapped values during "fix-up" operations that occur once the
620  /// first phase of widening is complete. These operations include type
621  /// truncation and the second phase of recurrence widening.
622  ///
623  /// Entries from either map can be retrieved using the getVectorValue and
624  /// getScalarValue functions, which assert that the desired value exists.
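  // Illustrative usage (not from the original source): after widening %x with
  // UF = 2, setVectorValue(%x, 0, ...) and setVectorValue(%x, 1, ...) record
  // the two parts; getVectorValue(%x, 1) retrieves the second part, and
  // resetVectorValue(%x, 1, ...) may later replace it during fix-up.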
625 
626  struct ValueMap {
627 
628  /// Construct an empty map with the given unroll and vectorization factors.
629  ValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {}
630 
631  /// \return True if the map has any vector entry for \p Key.
632  bool hasAnyVectorValue(Value *Key) const {
633  return VectorMapStorage.count(Key);
634  }
635 
636  /// \return True if the map has a vector entry for \p Key and \p Part.
637  bool hasVectorValue(Value *Key, unsigned Part) const {
638  assert(Part < UF && "Queried Vector Part is too large.");
639  if (!hasAnyVectorValue(Key))
640  return false;
641  const VectorParts &Entry = VectorMapStorage.find(Key)->second;
642  assert(Entry.size() == UF && "VectorParts has wrong dimensions.");
643  return Entry[Part] != nullptr;
644  }
645 
646  /// \return True if the map has any scalar entry for \p Key.
647  bool hasAnyScalarValue(Value *Key) const {
648  return ScalarMapStorage.count(Key);
649  }
650 
651  /// \return True if the map has a scalar entry for \p Key, \p Part and
652  /// \p Lane.
653  bool hasScalarValue(Value *Key, unsigned Part, unsigned Lane) const {
654  assert(Part < UF && "Queried Scalar Part is too large.");
655  assert(Lane < VF && "Queried Scalar Lane is too large.");
656  if (!hasAnyScalarValue(Key))
657  return false;
658  const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
659  assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
660  assert(Entry[Part].size() == VF && "ScalarParts has wrong dimensions.");
661  return Entry[Part][Lane] != nullptr;
662  }
663 
664  /// Retrieve the existing vector value that corresponds to \p Key and
665  /// \p Part.
666  Value *getVectorValue(Value *Key, unsigned Part) {
667  assert(hasVectorValue(Key, Part) && "Getting non-existent value.");
668  return VectorMapStorage[Key][Part];
669  }
670 
671  /// Retrieve the existing scalar value that corresponds to \p Key, \p Part
672  /// and \p Lane.
673  Value *getScalarValue(Value *Key, unsigned Part, unsigned Lane) {
674  assert(hasScalarValue(Key, Part, Lane) && "Getting non-existent value.");
675  return ScalarMapStorage[Key][Part][Lane];
676  }
677 
678  /// Set a vector value associated with \p Key and \p Part. Assumes such a
679  /// value is not already set. If it is, use resetVectorValue() instead.
680  void setVectorValue(Value *Key, unsigned Part, Value *Vector) {
681  assert(!hasVectorValue(Key, Part) && "Vector value already set for part");
682  if (!VectorMapStorage.count(Key)) {
683  VectorParts Entry(UF);
684  VectorMapStorage[Key] = Entry;
685  }
686  VectorMapStorage[Key][Part] = Vector;
687  }
688 
689  /// Set a scalar value associated with \p Key for \p Part and \p Lane.
690  /// Assumes such a value is not already set.
691  void setScalarValue(Value *Key, unsigned Part, unsigned Lane,
692  Value *Scalar) {
693  assert(!hasScalarValue(Key, Part, Lane) && "Scalar value already set");
694  if (!ScalarMapStorage.count(Key)) {
695  ScalarParts Entry(UF);
696  for (unsigned Part = 0; Part < UF; ++Part)
697  Entry[Part].resize(VF, nullptr);
698  // TODO: Consider storing uniform values only per-part, as they occupy
699  // lane 0 only, keeping the other VF-1 redundant entries null.
700  ScalarMapStorage[Key] = Entry;
701  }
702  ScalarMapStorage[Key][Part][Lane] = Scalar;
703  }
704 
705  /// Reset the vector value associated with \p Key for the given \p Part.
706  /// This function can be used to update values that have already been
707  /// vectorized. This is the case for "fix-up" operations including type
708  /// truncation and the second phase of recurrence vectorization.
709  void resetVectorValue(Value *Key, unsigned Part, Value *Vector) {
710  assert(hasVectorValue(Key, Part) && "Vector value not set for part");
711  VectorMapStorage[Key][Part] = Vector;
712  }
713 
714  private:
715  /// The unroll factor. Each entry in the vector map contains UF vector
716  /// values.
717  unsigned UF;
718 
719  /// The vectorization factor. Each entry in the scalar map contains UF x VF
720  /// scalar values.
721  unsigned VF;
722 
723  /// The vector and scalar map storage. We use std::map and not DenseMap
724  /// because insertions to DenseMap invalidate its iterators.
725  std::map<Value *, VectorParts> VectorMapStorage;
726  std::map<Value *, ScalarParts> ScalarMapStorage;
727  };
728 
729  /// The original loop.
730  Loop *OrigLoop;
731  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
732  /// dynamic knowledge to simplify SCEV expressions and converts them to a
733  /// more usable form.
734  PredicatedScalarEvolution &PSE;
735  /// Loop Info.
736  LoopInfo *LI;
737  /// Dominator Tree.
738  DominatorTree *DT;
739  /// Alias Analysis.
740  AliasAnalysis *AA;
741  /// Target Library Info.
742  const TargetLibraryInfo *TLI;
743  /// Target Transform Info.
744  const TargetTransformInfo *TTI;
745  /// Assumption Cache.
746  AssumptionCache *AC;
747  /// Interface to emit optimization remarks.
748  OptimizationRemarkEmitter *ORE;
749 
750  /// \brief LoopVersioning. It's only set up (non-null) if memchecks were
751  /// used.
752  ///
753  /// This is currently only used to add no-alias metadata based on the
754  /// memchecks. The actual versioning is performed manually.
755  std::unique_ptr<LoopVersioning> LVer;
756 
757  /// The vectorization SIMD factor to use. Each vector will have this many
758  /// vector elements.
759  unsigned VF;
760 
761 protected:
762  /// The vectorization unroll factor to use. Each scalar is vectorized to this
763  /// many different vector instructions.
764  unsigned UF;
765 
766  /// The builder that we use
767  IRBuilder<> Builder;
768 
769  // --- Vectorization state ---
770 
771  /// The vector-loop preheader.
772  BasicBlock *LoopVectorPreHeader;
773  /// The scalar-loop preheader.
774  BasicBlock *LoopScalarPreHeader;
775  /// Middle Block between the vector and the scalar.
776  BasicBlock *LoopMiddleBlock;
777  /// The ExitBlock of the scalar loop.
778  BasicBlock *LoopExitBlock;
779  /// The vector loop body.
780  BasicBlock *LoopVectorBody;
781  /// The scalar loop body.
782  BasicBlock *LoopScalarBody;
783  /// A list of all bypass blocks. The first block is the entry of the loop.
784  SmallVector<BasicBlock *, 4> LoopBypassBlocks;
785 
786  /// The new Induction variable which was added to the new block.
787  PHINode *Induction;
788  /// The induction variable of the old basic block.
789  PHINode *OldInduction;
790 
791  /// Maps values from the original loop to their corresponding values in the
792  /// vectorized loop. A key value can map to either vector values, scalar
793  /// values or both kinds of values, depending on whether the key was
794  /// vectorized and scalarized.
795  ValueMap VectorLoopValueMap;
796 
797  /// Store instructions that should be predicated, as a pair
798  /// <StoreInst, Predicate>
799  SmallVector<std::pair<Instruction *, Value *>, 4> PredicatedInstructions;
800  EdgeMaskCacheTy EdgeMaskCache;
801  BlockMaskCacheTy BlockMaskCache;
802  /// Trip count of the original loop.
803  Value *TripCount;
804  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
805  Value *VectorTripCount;
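  // Illustrative example (not from the original source): with TripCount = 21,
  // VF = 4 and UF = 2, VectorTripCount is 21 - 21 % 8 = 16, so the widened
  // loop covers 16 iterations and the scalar loop handles the remaining 5.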
806 
807  /// The legality analysis.
808  LoopVectorizationLegality *Legal;
809 
810  /// The profitability analysis.
811  LoopVectorizationCostModel *Cost;
812 
813  // Record whether runtime checks are added.
814  bool AddedSafetyChecks;
815 
816  // Holds the end values for each induction variable. We save the end values
817  // so we can later fix-up the external users of the induction variables.
818  DenseMap<PHINode *, Value *> IVEndValues;
819 };
820 
821 class InnerLoopUnroller : public InnerLoopVectorizer {
822 public:
823  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
824  LoopInfo *LI, DominatorTree *DT,
825  const TargetLibraryInfo *TLI,
826  const TargetTransformInfo *TTI, AssumptionCache *AC,
827  OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
828  LoopVectorizationLegality *LVL,
829  LoopVectorizationCostModel *CM)
830  : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
831  UnrollFactor, LVL, CM) {}
832 
833 private:
834  void vectorizeMemoryInstruction(Instruction *Instr) override;
835  Value *getBroadcastInstrs(Value *V) override;
836  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
837  Instruction::BinaryOps Opcode =
838  Instruction::BinaryOpsEnd) override;
839  Value *reverseVector(Value *Vec) override;
840 };
841 
842 /// \brief Look for a meaningful debug location on the instruction or its
843 /// operands.
844 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
845  if (!I)
846  return I;
847 
848  DebugLoc Empty;
849  if (I->getDebugLoc() != Empty)
850  return I;
851 
852  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
853  if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
854  if (OpInst->getDebugLoc() != Empty)
855  return OpInst;
856  }
857 
858  return I;
859 }
860 
861 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
862  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
863  const DILocation *DIL = Inst->getDebugLoc();
864  if (DIL && Inst->getFunction()->isDebugInfoForProfiling())
865  B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF));
866  else
867  B.SetCurrentDebugLocation(DIL);
868  } else
869  B.SetCurrentDebugLocation(DebugLoc());
870 }
871 
872 #ifndef NDEBUG
873 /// \return string containing a file name and a line # for the given loop.
874 static std::string getDebugLocString(const Loop *L) {
875  std::string Result;
876  if (L) {
877  raw_string_ostream OS(Result);
878  if (const DebugLoc LoopDbgLoc = L->getStartLoc())
879  LoopDbgLoc.print(OS);
880  else
881  // Just print the module name.
882  OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
883  OS.flush();
884  }
885  return Result;
886 }
887 #endif
888 
889 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
890  const Instruction *Orig) {
891  // If the loop was versioned with memchecks, add the corresponding no-alias
892  // metadata.
893  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
894  LVer->annotateInstWithNoAlias(To, Orig);
895 }
896 
897 void InnerLoopVectorizer::addMetadata(Instruction *To,
898  Instruction *From) {
899  propagateMetadata(To, From);
900  addNewMetadata(To, From);
901 }
902 
903 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
904  Instruction *From) {
905  for (Value *V : To) {
906  if (Instruction *I = dyn_cast<Instruction>(V))
907  addMetadata(I, From);
908  }
909 }
910 
911 /// \brief The group of interleaved loads/stores sharing the same stride and
912 /// close to each other.
913 ///
914 /// Each member in this group has an index starting from 0, and the largest
915 /// index should be less than the interleave factor, which is equal to the absolute
916 /// value of the access's stride.
917 ///
918 /// E.g. An interleaved load group of factor 4:
919 /// for (unsigned i = 0; i < 1024; i+=4) {
920 /// a = A[i]; // Member of index 0
921 /// b = A[i+1]; // Member of index 1
922 /// d = A[i+3]; // Member of index 3
923 /// ...
924 /// }
925 ///
926 /// An interleaved store group of factor 4:
927 /// for (unsigned i = 0; i < 1024; i+=4) {
928 /// ...
929 /// A[i] = a; // Member of index 0
930 /// A[i+1] = b; // Member of index 1
931 /// A[i+2] = c; // Member of index 2
932 /// A[i+3] = d; // Member of index 3
933 /// }
934 ///
935 /// Note: the interleaved load group could have gaps (missing members), but
936 /// the interleaved store group doesn't allow gaps.
937 class InterleaveGroup {
938 public:
939  InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
940  : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) {
941  assert(Align && "The alignment should be non-zero");
942 
943  Factor = std::abs(Stride);
944  assert(Factor > 1 && "Invalid interleave factor");
945 
946  Reverse = Stride < 0;
947  Members[0] = Instr;
948  }
949 
950  bool isReverse() const { return Reverse; }
951  unsigned getFactor() const { return Factor; }
952  unsigned getAlignment() const { return Align; }
953  unsigned getNumMembers() const { return Members.size(); }
954 
955  /// \brief Try to insert a new member \p Instr with index \p Index and
956  /// alignment \p NewAlign. The index is related to the leader and it could be
957  /// negative if it is the new leader.
958  ///
959  /// \returns false if the instruction doesn't belong to the group.
960  bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
961  assert(NewAlign && "The new member's alignment should be non-zero");
962 
963  int Key = Index + SmallestKey;
964 
965  // Skip if there is already a member with the same index.
966  if (Members.count(Key))
967  return false;
968 
969  if (Key > LargestKey) {
970  // The largest index is always less than the interleave factor.
971  if (Index >= static_cast<int>(Factor))
972  return false;
973 
974  LargestKey = Key;
975  } else if (Key < SmallestKey) {
976  // The largest index is always less than the interleave factor.
977  if (LargestKey - Key >= static_cast<int>(Factor))
978  return false;
979 
980  SmallestKey = Key;
981  }
982 
983  // It's always safe to select the minimum alignment.
984  Align = std::min(Align, NewAlign);
985  Members[Key] = Instr;
986  return true;
987  }
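  // Illustrative example (not from the original source): for a group created
  // from leader A[i+1] (key 0) with factor 4, inserting A[i+2] with Index = 1
  // sets LargestKey = 1, and inserting A[i] with Index = -1 makes it the new
  // leader and sets SmallestKey = -1; an insertion whose distance from the
  // current keys would reach the factor is rejected.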
988 
989  /// \brief Get the member with the given index \p Index
990  ///
991  /// \returns nullptr if the group contains no such member.
992  Instruction *getMember(unsigned Index) const {
993  int Key = SmallestKey + Index;
994  if (!Members.count(Key))
995  return nullptr;
996 
997  return Members.find(Key)->second;
998  }
999 
1000  /// \brief Get the index for the given member. Unlike the key in the member
1001  /// map, the index starts from 0.
1002  unsigned getIndex(Instruction *Instr) const {
1003  for (auto I : Members)
1004  if (I.second == Instr)
1005  return I.first - SmallestKey;
1006 
1007  llvm_unreachable("InterleaveGroup contains no such member");
1008  }
1009 
1010  Instruction *getInsertPos() const { return InsertPos; }
1011  void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
1012 
1013 private:
1014  unsigned Factor; // Interleave Factor.
1015  bool Reverse;
1016  unsigned Align;
1017  DenseMap<int, Instruction *> Members;
1018  int SmallestKey;
1019  int LargestKey;
1020 
1021  // To avoid breaking dependences, vectorized instructions of an interleave
1022  // group should be inserted at either the first load or the last store in
1023  // program order.
1024  //
1025  // E.g. %even = load i32 // Insert Position
1026  // %add = add i32 %even // Use of %even
1027  // %odd = load i32
1028  //
1029  // store i32 %even
1030  // %odd = add i32 // Def of %odd
1031  // store i32 %odd // Insert Position
1032  Instruction *InsertPos;
1033 };
1034 
1035 /// \brief Drive the analysis of interleaved memory accesses in the loop.
1036 ///
1037 /// Use this class to analyze interleaved accesses only when we can vectorize
1038 /// a loop. Otherwise the analysis is meaningless, because vectorizing
1039 /// interleaved accesses would be unsafe.
1040 ///
1041 /// The analysis collects interleave groups and records the relationships
1042 /// between the member and the group in a map.
1043 class InterleavedAccessInfo {
1044 public:
1045  InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
1046  DominatorTree *DT, LoopInfo *LI)
1047  : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(nullptr),
1048  RequiresScalarEpilogue(false) {}
1049 
1050  ~InterleavedAccessInfo() {
1051  SmallSet<InterleaveGroup *, 4> DelSet;
1052  // Avoid releasing a pointer twice.
1053  for (auto &I : InterleaveGroupMap)
1054  DelSet.insert(I.second);
1055  for (auto *Ptr : DelSet)
1056  delete Ptr;
1057  }
1058 
1059  /// \brief Analyze the interleaved accesses and collect them in interleave
1060  /// groups. Substitute symbolic strides using \p Strides.
1061  void analyzeInterleaving(const ValueToValueMap &Strides);
1062 
1063  /// \brief Check if \p Instr belongs to any interleave group.
1064  bool isInterleaved(Instruction *Instr) const {
1065  return InterleaveGroupMap.count(Instr);
1066  }
1067 
1068  /// \brief Return the maximum interleave factor of all interleaved groups.
1069  unsigned getMaxInterleaveFactor() const {
1070  unsigned MaxFactor = 1;
1071  for (auto &Entry : InterleaveGroupMap)
1072  MaxFactor = std::max(MaxFactor, Entry.second->getFactor());
1073  return MaxFactor;
1074  }
1075 
1076  /// \brief Get the interleave group that \p Instr belongs to.
1077  ///
1078  /// \returns nullptr if \p Instr does not belong to any interleave group.
1079  InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
1080  if (InterleaveGroupMap.count(Instr))
1081  return InterleaveGroupMap.find(Instr)->second;
1082  return nullptr;
1083  }
1084 
1085  /// \brief Returns true if an interleaved group that may access memory
1086  /// out-of-bounds requires a scalar epilogue iteration for correctness.
1087  bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
1088 
1089  /// \brief Initialize the LoopAccessInfo used for dependence checking.
1090  void setLAI(const LoopAccessInfo *Info) { LAI = Info; }
1091 
1092 private:
1093  /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
1094  /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
1095  /// The interleaved access analysis can also add new predicates (for example
1096  /// by versioning strides of pointers).
1097  PredicatedScalarEvolution &PSE;
1098  Loop *TheLoop;
1099  DominatorTree *DT;
1100  LoopInfo *LI;
1101  const LoopAccessInfo *LAI;
1102 
1103  /// True if the loop may contain non-reversed interleaved groups with
1104  /// out-of-bounds accesses. We ensure we don't speculatively access memory
1105  /// out-of-bounds by executing at least one scalar epilogue iteration.
1106  bool RequiresScalarEpilogue;
1107 
1108  /// Holds the relationships between the members and the interleave group.
1109  DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
1110 
1111  /// Holds dependences among the memory accesses in the loop. It maps a source
1112  /// access to a set of dependent sink accesses.
1113  DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;
1114 
1115  /// \brief The descriptor for a strided memory access.
1116  struct StrideDescriptor {
1117  StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
1118  unsigned Align)
1119  : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
1120 
1121  StrideDescriptor() = default;
1122 
1123  // The access's stride. It is negative for a reverse access.
1124  int64_t Stride = 0;
1125  const SCEV *Scev = nullptr; // The scalar expression of this access
1126  uint64_t Size = 0; // The size of the memory object.
1127  unsigned Align = 0; // The alignment of this access.
1128  };
1129 
1130  /// \brief A type for holding instructions and their stride descriptors.
1131  typedef std::pair<Instruction *, StrideDescriptor> StrideEntry;
1132 
1133  /// \brief Create a new interleave group with the given instruction \p Instr,
1134  /// stride \p Stride and alignment \p Align.
1135  ///
1136  /// \returns the newly created interleave group.
1137  InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,
1138  unsigned Align) {
1139  assert(!InterleaveGroupMap.count(Instr) &&
1140  "Already in an interleaved access group");
1141  InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align);
1142  return InterleaveGroupMap[Instr];
1143  }
1144 
1145  /// \brief Release the group and remove all the relationships.
1146  void releaseGroup(InterleaveGroup *Group) {
1147  for (unsigned i = 0; i < Group->getFactor(); i++)
1148  if (Instruction *Member = Group->getMember(i))
1149  InterleaveGroupMap.erase(Member);
1150 
1151  delete Group;
1152  }
1153 
1154  /// \brief Collect all the accesses with a constant stride in program order.
1155  void collectConstStrideAccesses(
1156  MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
1157  const ValueToValueMap &Strides);
1158 
1159  /// \brief Returns true if \p Stride is allowed in an interleaved group.
1160  static bool isStrided(int Stride) {
1161  unsigned Factor = std::abs(Stride);
1162  return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
1163  }
1164 
1165  /// \brief Returns true if \p BB is a predicated block.
1166  bool isPredicated(BasicBlock *BB) const {
1167  return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
1168  }
1169 
1170  /// \brief Returns true if LoopAccessInfo can be used for dependence queries.
1171  bool areDependencesValid() const {
1172  return LAI && LAI->getDepChecker().getDependences();
1173  }
1174 
1175  /// \brief Returns true if memory accesses \p A and \p B can be reordered, if
1176  /// necessary, when constructing interleaved groups.
1177  ///
1178  /// \p A must precede \p B in program order. We return false if reordering is
1179  /// not necessary or is prevented because \p A and \p B may be dependent.
1180  bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
1181  StrideEntry *B) const {
1182 
1183  // Code motion for interleaved accesses can potentially hoist strided loads
1184  // and sink strided stores. The code below checks the legality of the
1185  // following two conditions:
1186  //
1187  // 1. Potentially moving a strided load (B) before any store (A) that
1188  // precedes B, or
1189  //
1190  // 2. Potentially moving a strided store (A) after any load or store (B)
1191  // that A precedes.
1192  //
1193  // It's legal to reorder A and B if we know there isn't a dependence from A
1194  // to B. Note that this determination is conservative since some
1195  // dependences could potentially be reordered safely.
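  // Illustrative example (not from the original source): given
  // "store A[i]; %x = load B[3*i]", hoisting the strided load above the
  // store is only considered safe here if LoopAccessInfo reports no
  // dependence from the store (source) to the load (sink).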
1196 
1197  // A is potentially the source of a dependence.
1198  auto *Src = A->first;
1199  auto SrcDes = A->second;
1200 
1201  // B is potentially the sink of a dependence.
1202  auto *Sink = B->first;
1203  auto SinkDes = B->second;
1204 
1205  // Code motion for interleaved accesses can't violate WAR dependences.
1206  // Thus, reordering is legal if the source isn't a write.
1207  if (!Src->mayWriteToMemory())
1208  return true;
1209 
1210  // At least one of the accesses must be strided.
1211  if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
1212  return true;
1213 
1214  // If dependence information is not available from LoopAccessInfo,
1215  // conservatively assume the instructions can't be reordered.
1216  if (!areDependencesValid())
1217  return false;
1218 
1219  // If we know there is a dependence from source to sink, assume the
1220  // instructions can't be reordered. Otherwise, reordering is legal.
1221  return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);
1222  }
1223 
1224  /// \brief Collect the dependences from LoopAccessInfo.
1225  ///
1226  /// We process the dependences once during the interleaved access analysis to
1227  /// enable constant-time dependence queries.
1228  void collectDependences() {
1229  if (!areDependencesValid())
1230  return;
1231  auto *Deps = LAI->getDepChecker().getDependences();
1232  for (auto Dep : *Deps)
1233  Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
1234  }
1235 };
1236 
1237 /// Utility class for getting and setting loop vectorizer hints in the form
1238 /// of loop metadata.
1239 /// This class keeps a number of loop annotations locally (as member variables)
1240 /// and can, upon request, write them back as metadata on the loop. It will
1241 /// initially scan the loop for existing metadata, and will update the local
1242 /// values based on information in the loop.
1243 /// We cannot write all values to metadata, as the mere presence of some info,
1244 /// for example 'force', means a decision has been made. So, we need to be
1245 /// careful NOT to add them if the user hasn't specifically asked for them.
1246 class LoopVectorizeHints {
1247  enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE };
1248 
1249  /// Hint - associates name and validation with the hint value.
1250  struct Hint {
1251  const char *Name;
1252  unsigned Value; // This may have to change for non-numeric values.
1253  HintKind Kind;
1254 
1255  Hint(const char *Name, unsigned Value, HintKind Kind)
1256  : Name(Name), Value(Value), Kind(Kind) {}
1257 
1258  bool validate(unsigned Val) {
1259  switch (Kind) {
1260  case HK_WIDTH:
1261  return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
1262  case HK_UNROLL:
1263  return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
1264  case HK_FORCE:
1265  return (Val <= 1);
1266  }
1267  return false;
1268  }
1269  };
1270 
1271  /// Vectorization width.
1272  Hint Width;
1273  /// Vectorization interleave factor.
1274  Hint Interleave;
1275  /// Vectorization forced
1276  Hint Force;
1277 
1278  /// Return the loop metadata prefix.
1279  static StringRef Prefix() { return "llvm.loop."; }
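  // Illustrative note (derived from the hint names below): the Width hint is
  // kept in loop metadata named "llvm.loop.vectorize.width", i.e. Prefix()
  // followed by the hint's name; interleaving and force similarly use
  // "llvm.loop.interleave.count" and "llvm.loop.vectorize.enable".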
1280 
1281  /// True if there is any unsafe math in the loop.
1282  bool PotentiallyUnsafe;
1283 
1284 public:
1285  enum ForceKind {
1286  FK_Undefined = -1, ///< Not selected.
1287  FK_Disabled = 0, ///< Forcing disabled.
1288  FK_Enabled = 1, ///< Forcing enabled.
1289  };
1290 
1291  LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
1292  OptimizationRemarkEmitter &ORE)
1293  : Width("vectorize.width", VectorizerParams::VectorizationFactor,
1294  HK_WIDTH),
1295  Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
1296  Force("vectorize.enable", FK_Undefined, HK_FORCE),
1297  PotentiallyUnsafe(false), TheLoop(L), ORE(ORE) {
1298  // Populate values with existing loop metadata.
1299  getHintsFromMetadata();
1300 
1301  // force-vector-interleave overrides DisableInterleaving.
1302  if (VectorizerParams::isInterleaveForced())
1303  Interleave.Value = VectorizerParams::VectorizationInterleave;
1304 
1305  DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
1306  << "LV: Interleaving disabled by the pass manager\n");
1307  }
1308 
1309  /// Mark the loop L as already vectorized by setting the width to 1.
1310  void setAlreadyVectorized() {
1311  Width.Value = Interleave.Value = 1;
1312  Hint Hints[] = {Width, Interleave};
1313  writeHintsToMetadata(Hints);
1314  }
1315 
1316  bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
1317  if (getForce() == LoopVectorizeHints::FK_Disabled) {
1318  DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
1319  emitRemarkWithHints();
1320  return false;
1321  }
1322 
1323  if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
1324  DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
1325  emitRemarkWithHints();
1326  return false;
1327  }
1328 
1329  if (getWidth() == 1 && getInterleave() == 1) {
1330  // FIXME: Add a separate metadata to indicate when the loop has already
1331  // been vectorized instead of setting width and count to 1.
1332  DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
1333  // FIXME: Add interleave.disable metadata. This will allow
1334  // vectorize.disable to be used without disabling the pass and errors
1335  // to differentiate between disabled vectorization and a width of 1.
1336  ORE.emit(OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
1337  "AllDisabled", L->getStartLoc(),
1338  L->getHeader())
1339  << "loop not vectorized: vectorization and interleaving are "
1340  "explicitly disabled, or vectorize width and interleave "
1341  "count are both set to 1");
1342  return false;
1343  }
1344 
1345  return true;
1346  }
1347 
1348  /// Dumps all the hint information.
1349  void emitRemarkWithHints() const {
1350  using namespace ore;
1351  if (Force.Value == LoopVectorizeHints::FK_Disabled)
1352  ORE.emit(OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
1353  TheLoop->getStartLoc(),
1354  TheLoop->getHeader())
1355  << "loop not vectorized: vectorization is explicitly disabled");
1356  else {
1357  OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
1358  TheLoop->getStartLoc(), TheLoop->getHeader());
1359  R << "loop not vectorized";
1360  if (Force.Value == LoopVectorizeHints::FK_Enabled) {
1361  R << " (Force=" << NV("Force", true);
1362  if (Width.Value != 0)
1363  R << ", Vector Width=" << NV("VectorWidth", Width.Value);
1364  if (Interleave.Value != 0)
1365  R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
1366  R << ")";
1367  }
1368  ORE.emit(R);
1369  }
1370  }
1371 
1372  unsigned getWidth() const { return Width.Value; }
1373  unsigned getInterleave() const { return Interleave.Value; }
1374  enum ForceKind getForce() const { return (ForceKind)Force.Value; }
1375 
1376  /// \brief If hints are provided that force vectorization, use the AlwaysPrint
1377  /// pass name to force the frontend to print the diagnostic.
1378  const char *vectorizeAnalysisPassName() const {
1379  if (getWidth() == 1)
1380  return LV_NAME;
1381  if (getForce() == LoopVectorizeHints::FK_Disabled)
1382  return LV_NAME;
1383  if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
1384  return LV_NAME;
1385  return OptimizationRemarkAnalysis::AlwaysPrint;
1386  }
1387 
1388  bool allowReordering() const {
1389  // When hints that enable vectorization are provided, we allow the vectorizer
1390  // to change the order of operations given by the scalar loop. This is not
1391  // enabled by default because it can be unsafe or inefficient. For example,
1392  // reordering floating-point operations will change the way round-off
1393  // error accumulates in the loop.
1394  return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
1395  }
1396 
1397  bool isPotentiallyUnsafe() const {
1398  // Avoid FP vectorization if the target is unsure about proper support.
1399  // This may be related to the SIMD unit in the target not handling
1400  // IEEE 754 FP ops properly, or bad single-to-double promotions.
1401  // Otherwise, a sequence of vectorized loops, even without reduction,
1402  // could lead to different end results on the destination vectors.
1403  return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe;
1404  }
1405 
1406  void setPotentiallyUnsafe() { PotentiallyUnsafe = true; }
1407 
1408 private:
1409  /// Find hints specified in the loop metadata and update local values.
1410  void getHintsFromMetadata() {
1411  MDNode *LoopID = TheLoop->getLoopID();
1412  if (!LoopID)
1413  return;
1414 
1415  // First operand should refer to the loop id itself.
1416  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
1417  assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
1418 
1419  for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
1420  const MDString *S = nullptr;
1421  SmallVector<Metadata *, 4> Args;
1422 
1423  // The expected hint is either a MDString or a MDNode with the first
1424  // operand a MDString.
1425  if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
1426  if (!MD || MD->getNumOperands() == 0)
1427  continue;
1428  S = dyn_cast<MDString>(MD->getOperand(0));
1429  for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
1430  Args.push_back(MD->getOperand(i));
1431  } else {
1432  S = dyn_cast<MDString>(LoopID->getOperand(i));
1433  assert(Args.size() == 0 && "too many arguments for MDString");
1434  }
1435 
1436  if (!S)
1437  continue;
1438 
1439  // Check if the hint starts with the loop metadata prefix.
1440  StringRef Name = S->getString();
1441  if (Args.size() == 1)
1442  setHint(Name, Args[0]);
1443  }
1444  }
1445 
1446  /// Checks a string hint with one operand and sets the value if valid.
1447  void setHint(StringRef Name, Metadata *Arg) {
1448  if (!Name.startswith(Prefix()))
1449  return;
1450  Name = Name.substr(Prefix().size(), StringRef::npos);
1451 
1452  const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
1453  if (!C)
1454  return;
1455  unsigned Val = C->getZExtValue();
1456 
1457  Hint *Hints[] = {&Width, &Interleave, &Force};
1458  for (auto H : Hints) {
1459  if (Name == H->Name) {
1460  if (H->validate(Val))
1461  H->Value = Val;
1462  else
1463  DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
1464  break;
1465  }
1466  }
1467  }
1468 
1469  /// Create a new hint from name / value pair.
1470  MDNode *createHintMetadata(StringRef Name, unsigned V) const {
1471  LLVMContext &Context = TheLoop->getHeader()->getContext();
1472  Metadata *MDs[] = {MDString::get(Context, Name),
1473  ConstantAsMetadata::get(
1474  ConstantInt::get(Type::getInt32Ty(Context), V))};
1475  return MDNode::get(Context, MDs);
1476  }
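// For illustration, loop hint metadata of roughly this shape on the loop latch
// branch (the !0..!2 names are arbitrary) is what getHintsFromMetadata reads
// and writeHintsToMetadata emits:
//
//   br i1 %exitcond, label %exit, label %for.body, !llvm.loop !0
//   !0 = distinct !{!0, !1, !2}
//   !1 = !{!"llvm.loop.vectorize.width", i32 4}
//   !2 = !{!"llvm.loop.interleave.count", i32 2}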
1477 
1478  /// Matches metadata with hint name.
1479  bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
1480  MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
1481  if (!Name)
1482  return false;
1483 
1484  for (auto H : HintTypes)
1485  if (Name->getString().endswith(H.Name))
1486  return true;
1487  return false;
1488  }
1489 
1490  /// Sets current hints into loop metadata, keeping other values intact.
1491  void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
1492  if (HintTypes.size() == 0)
1493  return;
1494 
1495  // Reserve the first element for LoopID (see below).
1496  SmallVector<Metadata *, 4> MDs(1);
1497  // If the loop already has metadata, then ignore the existing operands.
1498  MDNode *LoopID = TheLoop->getLoopID();
1499  if (LoopID) {
1500  for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
1501  MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
1502  // If node in update list, ignore old value.
1503  if (!matchesHintMetadataName(Node, HintTypes))
1504  MDs.push_back(Node);
1505  }
1506  }
1507 
1508  // Now, add the missing hints.
1509  for (auto H : HintTypes)
1510  MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
1511 
1512  // Replace current metadata node with new one.
1513  LLVMContext &Context = TheLoop->getHeader()->getContext();
1514  MDNode *NewLoopID = MDNode::get(Context, MDs);
1515  // Set operand 0 to refer to the loop id itself.
1516  NewLoopID->replaceOperandWith(0, NewLoopID);
1517 
1518  TheLoop->setLoopID(NewLoopID);
1519  }
1520 
1521  /// The loop these hints belong to.
1522  const Loop *TheLoop;
1523 
1524  /// Interface to emit optimization remarks.
1525  OptimizationRemarkEmitter &ORE;
1526 };
1527 
1528 static void emitMissedWarning(Function *F, Loop *L,
1529  const LoopVectorizeHints &LH,
1530  OptimizationRemarkEmitter *ORE) {
1531  LH.emitRemarkWithHints();
1532 
1533  if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
1534  if (LH.getWidth() != 1)
1535  ORE->emit(DiagnosticInfoOptimizationFailure(
1536  DEBUG_TYPE, "FailedRequestedVectorization",
1537  L->getStartLoc(), L->getHeader())
1538  << "loop not vectorized: "
1539  << "failed explicitly specified loop vectorization");
1540  else if (LH.getInterleave() != 1)
1541  ORE->emit(DiagnosticInfoOptimizationFailure(
1542  DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(),
1543  L->getHeader())
1544  << "loop not interleaved: "
1545  << "failed explicitly specified loop interleaving");
1546  }
1547 }
1548 
1549 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
1550 /// to what vectorization factor.
1551 /// This class does not look at the profitability of vectorization, only the
1552 /// legality. This class has two main kinds of checks:
1553 /// * Memory checks - The code in canVectorizeMemory checks if vectorization
1554 /// will change the order of memory accesses in a way that will change the
1555 /// correctness of the program.
1556 /// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
1557 /// checks for a number of different conditions, such as the availability of a
1558 /// single induction variable, that all types are supported and vectorize-able,
1559 /// etc. This code reflects the capabilities of InnerLoopVectorizer.
1560 /// This class is also used by InnerLoopVectorizer for identifying
1561  /// induction variables and the different reduction variables.
1562 class LoopVectorizationLegality {
1563 public:
1564  LoopVectorizationLegality(
1565  Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
1566  TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
1567  const TargetTransformInfo *TTI,
1568  std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
1569  OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
1570  LoopVectorizeHints *H)
1571  : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT),
1572  GetLAA(GetLAA), LAI(nullptr), ORE(ORE), InterleaveInfo(PSE, L, DT, LI),
1573  PrimaryInduction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
1574  Requirements(R), Hints(H) {}
1575 
1576  /// ReductionList contains the reduction descriptors for all
1577  /// of the reductions that were found in the loop.
1578  typedef DenseMap<PHINode *, RecurrenceDescriptor> ReductionList;
1579 
1580  /// InductionList saves induction variables and maps them to the
1581  /// induction descriptor.
1582  typedef MapVector<PHINode *, InductionDescriptor> InductionList;
1583 
1584  /// RecurrenceSet contains the phi nodes that are recurrences other than
1585  /// inductions and reductions.
1586  typedef SmallPtrSet<const PHINode *, 8> RecurrenceSet;
1587 
1588  /// Returns true if it is legal to vectorize this loop.
1589  /// This does not mean that it is profitable to vectorize this
1590  /// loop, only that it is legal to do so.
1591  bool canVectorize();
1592 
1593  /// Returns the primary induction variable.
1594  PHINode *getPrimaryInduction() { return PrimaryInduction; }
1595 
1596  /// Returns the reduction variables found in the loop.
1597  ReductionList *getReductionVars() { return &Reductions; }
1598 
1599  /// Returns the induction variables found in the loop.
1600  InductionList *getInductionVars() { return &Inductions; }
1601 
1602  /// Return the first-order recurrences found in the loop.
1603  RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; }
1604 
1605  /// Return the set of instructions to sink to handle first-order recurrences.
1606  DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; }
1607 
1608  /// Returns the widest induction type.
1609  Type *getWidestInductionType() { return WidestIndTy; }
1610 
1611  /// Returns True if V is an induction variable in this loop.
1612  bool isInductionVariable(const Value *V);
1613 
1614  /// Returns True if PN is a reduction variable in this loop.
1615  bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }
1616 
1617  /// Returns True if Phi is a first-order recurrence in this loop.
1618  bool isFirstOrderRecurrence(const PHINode *Phi);
1619 
1620  /// Return true if the block BB needs to be predicated in order for the loop
1621  /// to be vectorized.
1622  bool blockNeedsPredication(BasicBlock *BB);
1623 
1624  /// Check if this pointer is consecutive when vectorizing. This happens
1625  /// when the last index of the GEP is the induction variable, or that the
1626  /// pointer itself is an induction variable.
1627  /// This check allows us to vectorize A[idx] into a wide load/store.
1628  /// Returns:
1629  /// 0 - Stride is unknown or non-consecutive.
1630  /// 1 - Address is consecutive.
1631  /// -1 - Address is consecutive, and decreasing.
1632  int isConsecutivePtr(Value *Ptr);
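// For example (illustrative C-style accesses, with i the loop induction
// variable): an access like A[i] yields 1, a reversed access like A[N - i]
// yields -1, and a strided access like A[2 * i] yields 0.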
1633 
1634  /// Returns true if the value V is uniform within the loop.
1635  bool isUniform(Value *V);
1636 
1637  /// Returns the information that we collected about runtime memory check.
1638  const RuntimePointerChecking *getRuntimePointerChecking() const {
1639  return LAI->getRuntimePointerChecking();
1640  }
1641 
1642  const LoopAccessInfo *getLAI() const { return LAI; }
1643 
1644  /// \brief Check if \p Instr belongs to any interleaved access group.
1645  bool isAccessInterleaved(Instruction *Instr) {
1646  return InterleaveInfo.isInterleaved(Instr);
1647  }
1648 
1649  /// \brief Return the maximum interleave factor of all interleaved groups.
1650  unsigned getMaxInterleaveFactor() const {
1651  return InterleaveInfo.getMaxInterleaveFactor();
1652  }
1653 
1654  /// \brief Get the interleaved access group that \p Instr belongs to.
1655  const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
1656  return InterleaveInfo.getInterleaveGroup(Instr);
1657  }
1658 
1659  /// \brief Returns true if an interleaved group requires a scalar iteration
1660  /// to handle accesses with gaps.
1661  bool requiresScalarEpilogue() const {
1662  return InterleaveInfo.requiresScalarEpilogue();
1663  }
1664 
1665  unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
1666 
1667  bool hasStride(Value *V) { return LAI->hasStride(V); }
1668 
1669  /// Returns true if the target machine supports masked store operation
1670  /// for the given \p DataType and kind of access to \p Ptr.
1671  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
1672  return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
1673  }
1674  /// Returns true if the target machine supports masked load operation
1675  /// for the given \p DataType and kind of access to \p Ptr.
1676  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
1677  return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
1678  }
1679  /// Returns true if the target machine supports masked scatter operation
1680  /// for the given \p DataType.
1681  bool isLegalMaskedScatter(Type *DataType) {
1682  return TTI->isLegalMaskedScatter(DataType);
1683  }
1684  /// Returns true if the target machine supports masked gather operation
1685  /// for the given \p DataType.
1686  bool isLegalMaskedGather(Type *DataType) {
1687  return TTI->isLegalMaskedGather(DataType);
1688  }
1689  /// Returns true if the target machine can represent \p V as a masked gather
1690  /// or scatter operation.
1691  bool isLegalGatherOrScatter(Value *V) {
1692  auto *LI = dyn_cast<LoadInst>(V);
1693  auto *SI = dyn_cast<StoreInst>(V);
1694  if (!LI && !SI)
1695  return false;
1696  auto *Ptr = getPointerOperand(V);
1697  auto *Ty = cast<PointerType>(Ptr->getType())->getElementType();
1698  return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1699  }
1700 
1701  /// Returns true if vector representation of the instruction \p I
1702  /// requires mask.
1703  bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
1704  unsigned getNumStores() const { return LAI->getNumStores(); }
1705  unsigned getNumLoads() const { return LAI->getNumLoads(); }
1706  unsigned getNumPredStores() const { return NumPredStores; }
1707 
1708  /// Returns true if \p I is an instruction that will be scalarized with
1709  /// predication. Such instructions include conditional stores and
1710  /// instructions that may divide by zero.
1711  bool isScalarWithPredication(Instruction *I);
1712 
1713  /// Returns true if \p I is a memory instruction with consecutive memory
1714  /// access that can be widened.
1715  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1716 
1717  // Returns true if the NoNaN attribute is set on the function.
1718  bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
1719 
1720 private:
1721  /// Check if a single basic block loop is vectorizable.
1722  /// At this point we know that this is a loop with a constant trip count
1723  /// and we only need to check individual instructions.
1724  bool canVectorizeInstrs();
1725 
1726  /// When we vectorize loops we may change the order in which
1727  /// we read and write from memory. This method checks if it is
1728  /// legal to vectorize the code, considering only memory constraints.
1729  /// Returns true if the loop is vectorizable.
1730  bool canVectorizeMemory();
1731 
1732  /// Return true if we can vectorize this loop using the IF-conversion
1733  /// transformation.
1734  bool canVectorizeWithIfConvert();
1735 
1736  /// Return true if all of the instructions in the block can be speculatively
1737  /// executed. \p SafePtrs is a list of addresses that are known to be legal
1738  /// and we know that we can read from them without segfault.
1739  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
1740 
1741  /// Updates the vectorization state by adding \p Phi to the inductions list.
1742  /// This can set \p Phi as the main induction of the loop if \p Phi is a
1743  /// better choice for the main induction than the existing one.
1744  void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
1745  SmallPtrSetImpl<Value *> &AllowedExit);
1746 
1747  /// Create an analysis remark that explains why vectorization failed
1748  ///
1749  /// \p RemarkName is the identifier for the remark. If \p I is passed it is
1750  /// an instruction that prevents vectorization. Otherwise the loop is used
1751  /// for the location of the remark. \return the remark object that can be
1752  /// streamed to.
1753  OptimizationRemarkAnalysis
1754  createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const {
1755  return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
1756  RemarkName, TheLoop, I);
1757  }
1758 
1759  /// \brief If an access has a symbolic stride, this maps the pointer value to
1760  /// the stride symbol.
1761  const ValueToValueMap *getSymbolicStrides() {
1762  // FIXME: Currently, the set of symbolic strides is sometimes queried before
1763  // it's collected. This happens from canVectorizeWithIfConvert, when the
1764  // pointer is checked to reference consecutive elements suitable for a
1765  // masked access.
1766  return LAI ? &LAI->getSymbolicStrides() : nullptr;
1767  }
1768 
1769  unsigned NumPredStores;
1770 
1771  /// The loop that we evaluate.
1772  Loop *TheLoop;
1773  /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
1774  /// Applies dynamic knowledge to simplify SCEV expressions in the context
1775  /// of existing SCEV assumptions. The analysis will also add a minimal set
1776  /// of new predicates if this is required to enable vectorization and
1777  /// unrolling.
1778  PredicatedScalarEvolution &PSE;
1779  /// Target Library Info.
1780  TargetLibraryInfo *TLI;
1781  /// Target Transform Info
1782  const TargetTransformInfo *TTI;
1783  /// Dominator Tree.
1784  DominatorTree *DT;
1785  // LoopAccess analysis.
1786  std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
1787  // And the loop-accesses info corresponding to this loop. This pointer is
1788  // null until canVectorizeMemory sets it up.
1789  const LoopAccessInfo *LAI;
1790  /// Interface to emit optimization remarks.
1791  OptimizationRemarkEmitter *ORE;
1792 
1793  /// The interleave access information contains groups of interleaved accesses
1794  /// with the same stride and close to each other.
1795  InterleavedAccessInfo InterleaveInfo;
1796 
1797  // --- vectorization state --- //
1798 
1799  /// Holds the primary induction variable. This is the counter of the
1800  /// loop.
1801  PHINode *PrimaryInduction;
1802  /// Holds the reduction variables.
1803  ReductionList Reductions;
1804  /// Holds all of the induction variables that we found in the loop.
1805  /// Notice that inductions don't need to start at zero and that induction
1806  /// variables can be pointers.
1807  InductionList Inductions;
1808  /// Holds the phi nodes that are first-order recurrences.
1809  RecurrenceSet FirstOrderRecurrences;
1810  /// Holds instructions that need to sink past other instructions to handle
1811  /// first-order recurrences.
1812  DenseMap<Instruction *, Instruction *> SinkAfter;
1813  /// Holds the widest induction type encountered.
1814  Type *WidestIndTy;
1815 
1816  /// Allowed outside users. This holds the induction and reduction
1817  /// vars which can be accessed from outside the loop.
1818  SmallPtrSet<Value *, 4> AllowedExit;
1819 
1820  /// Can we assume the absence of NaNs.
1821  bool HasFunNoNaNAttr;
1822 
1823  /// Vectorization requirements that will go through late-evaluation.
1824  LoopVectorizationRequirements *Requirements;
1825 
1826  /// Used to emit an analysis of any legality issues.
1827  LoopVectorizeHints *Hints;
1828 
1829  /// While vectorizing these instructions we have to generate a
1830  /// call to the appropriate masked intrinsic
1831  SmallPtrSet<const Instruction *, 8> MaskedOp;
1832 };
1833 
1834 /// LoopVectorizationCostModel - estimates the expected speedups due to
1835 /// vectorization.
1836 /// In many cases vectorization is not profitable. This can happen because of
1837 /// a number of reasons. In this class we mainly attempt to predict the
1838 /// expected speedup/slowdowns due to the supported instruction set. We use the
1839 /// TargetTransformInfo to query the different backends for the cost of
1840 /// different operations.
1841 class LoopVectorizationCostModel {
1842 public:
1843  LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
1844  LoopInfo *LI, LoopVectorizationLegality *Legal,
1845  const TargetTransformInfo &TTI,
1846  const TargetLibraryInfo *TLI, DemandedBits *DB,
1847  AssumptionCache *AC,
1848  OptimizationRemarkEmitter *ORE, const Function *F,
1849  const LoopVectorizeHints *Hints)
1850  : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
1851  AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}
1852 
1853  /// \return An upper bound for the vectorization factor, or None if
1854  /// vectorization should be avoided up front.
1855  Optional<unsigned> computeMaxVF(bool OptForSize);
1856 
1857  /// Information about vectorization costs
1858  struct VectorizationFactor {
1859  unsigned Width; // Vector width with best cost
1860  unsigned Cost; // Cost of the loop with that width
1861  };
1862  /// \return The most profitable vectorization factor and the cost of that VF.
1863  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1864  /// then this vectorization factor will be selected if vectorization is
1865  /// possible.
1866  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
1867 
1868  /// Setup cost-based decisions for user vectorization factor.
1869  void selectUserVectorizationFactor(unsigned UserVF) {
1870  collectUniformsAndScalars(UserVF);
1871  collectInstsToScalarize(UserVF);
1872  }
1873 
1874  /// \return The size (in bits) of the smallest and widest types in the code
1875  /// that needs to be vectorized. We ignore values that remain scalar such as
1876  /// 64 bit loop indices.
1877  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1878 
1879  /// \return The desired interleave count.
1880  /// If interleave count has been specified by metadata it will be returned.
1881  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1882  /// are the selected vectorization factor and the cost of the selected VF.
1883  unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
1884  unsigned LoopCost);
1885 
1886  /// A memory access instruction may be vectorized in more than one way.
1887  /// The form of the instruction after vectorization depends on its cost.
1888  /// This function makes cost-based decisions for Load/Store instructions
1889  /// and collects them in a map. This decision map is used for building
1890  /// the lists of loop-uniform and loop-scalar instructions.
1891  /// The calculated cost is saved with the widening decision in order to
1892  /// avoid redundant calculations.
1893  void setCostBasedWideningDecision(unsigned VF);
1894 
1895  /// \brief A struct that represents some properties of the register usage
1896  /// of a loop.
1897  struct RegisterUsage {
1898  /// Holds the number of loop invariant values that are used in the loop.
1899  unsigned LoopInvariantRegs;
1900  /// Holds the maximum number of concurrent live intervals in the loop.
1901  unsigned MaxLocalUsers;
1902  /// Holds the number of instructions in the loop.
1903  unsigned NumInstructions;
1904  };
1905 
1906  /// \return Returns information about the register usages of the loop for the
1907  /// given vectorization factors.
1908  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1909 
1910  /// Collect values we want to ignore in the cost model.
1911  void collectValuesToIgnore();
1912 
1913  /// \returns The smallest bitwidth each instruction can be represented with.
1914  /// The vector equivalents of these instructions should be truncated to this
1915  /// type.
1916  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1917  return MinBWs;
1918  }
1919 
1920  /// \returns True if it is more profitable to scalarize instruction \p I for
1921  /// vectorization factor \p VF.
1922  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1923  auto Scalars = InstsToScalarize.find(VF);
1924  assert(Scalars != InstsToScalarize.end() &&
1925  "VF not yet analyzed for scalarization profitability");
1926  return Scalars->second.count(I);
1927  }
1928 
1929  /// Returns true if \p I is known to be uniform after vectorization.
1930  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1931  if (VF == 1)
1932  return true;
1933  assert(Uniforms.count(VF) && "VF not yet analyzed for uniformity");
1934  auto UniformsPerVF = Uniforms.find(VF);
1935  return UniformsPerVF->second.count(I);
1936  }
1937 
1938  /// Returns true if \p I is known to be scalar after vectorization.
1939  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1940  if (VF == 1)
1941  return true;
1942  assert(Scalars.count(VF) && "Scalar values are not calculated for VF");
1943  auto ScalarsPerVF = Scalars.find(VF);
1944  return ScalarsPerVF->second.count(I);
1945  }
1946 
1947  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1948  /// for vectorization factor \p VF.
1949  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1950  return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) &&
1951  !isScalarAfterVectorization(I, VF);
1952  }
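// Illustrative example: if an i8 value is loaded, zero-extended to i32, and
// every use only needs the low 8 bits, MinBWs would record 8 for the defining
// instruction, allowing the vector code to operate on <VF x i8> instead of
// <VF x i32>.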
1953 
1954  /// Decision that was taken during cost calculation for memory instruction.
1955  enum InstWidening {
1956  CM_Unknown,
1957  CM_Widen,
1958  CM_Interleave,
1959  CM_GatherScatter,
1960  CM_Scalarize
1961  };
1962 
1963  /// Save vectorization decision \p W and \p Cost taken by the cost model for
1964  /// instruction \p I and vector width \p VF.
1965  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1966  unsigned Cost) {
1967  assert(VF >= 2 && "Expected VF >=2");
1968  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1969  }
1970 
1971  /// Save vectorization decision \p W and \p Cost taken by the cost model for
1972  /// interleaving group \p Grp and vector width \p VF.
1973  void setWideningDecision(const InterleaveGroup *Grp, unsigned VF,
1974  InstWidening W, unsigned Cost) {
1975  assert(VF >= 2 && "Expected VF >=2");
1976  /// Broadcast this decision to all instructions inside the group.
1977  /// But the cost will be assigned to one instruction only.
1978  for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1979  if (auto *I = Grp->getMember(i)) {
1980  if (Grp->getInsertPos() == I)
1981  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1982  else
1983  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1984  }
1985  }
1986  }
1987 
1988  /// Return the cost model decision for the given instruction \p I and vector
1989  /// width \p VF. Return CM_Unknown if this instruction did not pass
1990  /// through the cost modeling.
1991  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1992  assert(VF >= 2 && "Expected VF >=2");
1993  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1994  auto Itr = WideningDecisions.find(InstOnVF);
1995  if (Itr == WideningDecisions.end())
1996  return CM_Unknown;
1997  return Itr->second.first;
1998  }
1999 
2000  /// Return the vectorization cost for the given instruction \p I and vector
2001  /// width \p VF.
2002  unsigned getWideningCost(Instruction *I, unsigned VF) {
2003  assert(VF >= 2 && "Expected VF >=2");
2004  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
2005  assert(WideningDecisions.count(InstOnVF) && "The cost is not calculated");
2006  return WideningDecisions[InstOnVF].second;
2007  }
2008 
2009  /// Return True if instruction \p I is an optimizable truncate whose operand
2010  /// is an induction variable. Such a truncate will be removed by adding a new
2011  /// induction variable with the destination type.
2012  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
2013 
2014  // If the instruction is not a truncate, return false.
2015  auto *Trunc = dyn_cast<TruncInst>(I);
2016  if (!Trunc)
2017  return false;
2018 
2019  // Get the source and destination types of the truncate.
2020  Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
2021  Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
2022 
2023  // If the truncate is free for the given types, return false. Replacing a
2024  // free truncate with an induction variable would add an induction variable
2025  // update instruction to each iteration of the loop. We exclude from this
2026  // check the primary induction variable since it will need an update
2027  // instruction regardless.
2028  Value *Op = Trunc->getOperand(0);
2029  if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
2030  return false;
2031 
2032  // If the truncated value is not an induction variable, return false.
2033  return Legal->isInductionVariable(Op);
2034  }
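// Illustrative example: for a loop with an i64 induction variable %iv, a use
// such as
//   %t = trunc i64 %iv to i32
// can be optimized by introducing a new i32 induction variable and replacing
// the truncate, provided the truncate is not free for the target.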
2035 
2036 private:
2037  /// \return An upper bound for the vectorization factor, larger than zero.
2038  /// One is returned if vectorization should best be avoided due to cost.
2039  unsigned computeFeasibleMaxVF(bool OptForSize);
2040 
2041  /// The vectorization cost is a combination of the cost itself and a boolean
2042  /// indicating whether any of the contributing operations will actually
2043  /// operate on vector values after type legalization in the backend. If this
2044  /// latter value is false, then all operations will be scalarized (i.e. no
2045  /// vectorization has actually taken place).
2048  typedef std::pair<unsigned, bool> VectorizationCostTy;
2049 
2050  /// Returns the expected execution cost. The unit of the cost does
2051  /// not matter because we use the 'cost' units to compare different
2052  /// vector widths. The cost that is returned is *not* normalized by
2053  /// the factor width.
2054  VectorizationCostTy expectedCost(unsigned VF);
2055 
2056  /// Returns the execution time cost of an instruction for a given vector
2057  /// width. Vector width of one means scalar.
2058  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
2059 
2060  /// The cost-computation logic from getInstructionCost which provides
2061  /// the vector type as an output parameter.
2062  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
2063 
2064  /// Calculate vectorization cost of memory instruction \p I.
2065  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
2066 
2067  /// The cost computation for scalarized memory instruction.
2068  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
2069 
2070  /// The cost computation for interleaving group of memory instructions.
2071  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
2072 
2073  /// The cost computation for Gather/Scatter instruction.
2074  unsigned getGatherScatterCost(Instruction *I, unsigned VF);
2075 
2076  /// The cost computation for widening instruction \p I with consecutive
2077  /// memory access.
2078  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
2079 
2080  /// The cost calculation for Load instruction \p I with uniform pointer -
2081  /// scalar load + broadcast.
2082  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
2083 
2084  /// Returns whether the instruction is a load or store and will be emitted
2085  /// as a vector operation.
2086  bool isConsecutiveLoadOrStore(Instruction *I);
2087 
2088  /// Create an analysis remark that explains why vectorization failed
2089  ///
2090  /// \p RemarkName is the identifier for the remark. \return the remark object
2091  /// that can be streamed to.
2092  OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
2093  return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
2094  RemarkName, TheLoop);
2095  }
2096 
2097  /// Map of scalar integer values to the smallest bitwidth they can be legally
2098  /// represented as. The vector equivalents of these values should be truncated
2099  /// to this type.
2100  MapVector<Instruction *, uint64_t> MinBWs;
2101 
2102  /// A type representing the costs for instructions if they were to be
2103  /// scalarized rather than vectorized. The entries are Instruction-Cost
2104  /// pairs.
2105  typedef DenseMap<Instruction *, unsigned> ScalarCostsTy;
2106 
2107  /// A set containing all BasicBlocks that are known to be present after
2108  /// vectorization as predicated blocks.
2109  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
2110 
2111  /// A map holding scalar costs for different vectorization factors. The
2112  /// presence of a cost for an instruction in the mapping indicates that the
2113  /// instruction will be scalarized when vectorizing with the associated
2114  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
2115  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
2116 
2117  /// Holds the instructions known to be uniform after vectorization.
2118  /// The data is collected per VF.
2119  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
2120 
2121  /// Holds the instructions known to be scalar after vectorization.
2122  /// The data is collected per VF.
2123  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
2124 
2125  /// Holds the instructions (address computations) that are forced to be
2126  /// scalarized.
2127  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
2128 
2129  /// Returns the expected difference in cost from scalarizing the expression
2130  /// feeding a predicated instruction \p PredInst. The instructions to
2131  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
2132  /// non-negative return value implies the expression will be scalarized.
2133  /// Currently, only single-use chains are considered for scalarization.
2134  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
2135  unsigned VF);
2136 
2137  /// Collects the instructions to scalarize for each predicated instruction in
2138  /// the loop.
2139  void collectInstsToScalarize(unsigned VF);
2140 
2141  /// Collect the instructions that are uniform after vectorization. An
2142  /// instruction is uniform if we represent it with a single scalar value in
2143  /// the vectorized loop corresponding to each vector iteration. Examples of
2144  /// uniform instructions include pointer operands of consecutive or
2145  /// interleaved memory accesses. Note that although uniformity implies an
2146  /// instruction will be scalar, the reverse is not true. In general, a
2147  /// scalarized instruction will be represented by VF scalar values in the
2148  /// vectorized loop, each corresponding to an iteration of the original
2149  /// scalar loop.
2150  void collectLoopUniforms(unsigned VF);
2151 
2152  /// Collect the instructions that are scalar after vectorization. An
2153  /// instruction is scalar if it is known to be uniform or will be scalarized
2154  /// during vectorization. Non-uniform scalarized instructions will be
2155  /// represented by VF values in the vectorized loop, each corresponding to an
2156  /// iteration of the original scalar loop.
2157  void collectLoopScalars(unsigned VF);
2158 
2159  /// Collect Uniform and Scalar values for the given \p VF.
2160  /// The sets depend on CM decision for Load/Store instructions
2161  /// that may be vectorized as interleave, gather-scatter or scalarized.
2162  void collectUniformsAndScalars(unsigned VF) {
2163  // Do the analysis once.
2164  if (VF == 1 || Uniforms.count(VF))
2165  return;
2166  setCostBasedWideningDecision(VF);
2167  collectLoopUniforms(VF);
2168  collectLoopScalars(VF);
2169  }
2170 
2171  /// Keeps cost model vectorization decision and cost for instructions.
2172  /// Right now it is used for memory instructions only.
2173  typedef DenseMap<std::pair<Instruction *, unsigned>,
2174  std::pair<InstWidening, unsigned>>
2175  DecisionList;
2176 
2177  DecisionList WideningDecisions;
2178 
2179 public:
2180  /// The loop that we evaluate.
2181  Loop *TheLoop;
2182  /// Predicated scalar evolution analysis.
2183  PredicatedScalarEvolution &PSE;
2184  /// Loop Info analysis.
2185  LoopInfo *LI;
2186  /// Vectorization legality.
2187  LoopVectorizationLegality *Legal;
2188  /// Vector target information.
2189  const TargetTransformInfo &TTI;
2190  /// Target Library Info.
2191  const TargetLibraryInfo *TLI;
2192  /// Demanded bits analysis.
2193  DemandedBits *DB;
2194  /// Assumption cache.
2195  AssumptionCache *AC;
2196  /// Interface to emit optimization remarks.
2197  OptimizationRemarkEmitter *ORE;
2198 
2199  const Function *TheFunction;
2200  /// Loop Vectorize Hint.
2201  const LoopVectorizeHints *Hints;
2202  /// Values to ignore in the cost model.
2203  SmallPtrSet<const Value *, 16> ValuesToIgnore;
2204  /// Values to ignore in the cost model when VF > 1.
2205  SmallPtrSet<const Value *, 16> VecValuesToIgnore;
2206 };
2207 
2208 /// LoopVectorizationPlanner - drives the vectorization process after having
2209 /// passed Legality checks.
2210 class LoopVectorizationPlanner {
2211 public:
2212  LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI,
2213  LoopVectorizationLegality *Legal,
2214  LoopVectorizationCostModel &CM)
2215  : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {}
2216 
2217  ~LoopVectorizationPlanner() {}
2218 
2219  /// Plan how to best vectorize, return the best VF and its cost.
2220  LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
2221  unsigned UserVF);
2222 
2223  /// Generate the IR code for the vectorized loop.
2224  void executePlan(InnerLoopVectorizer &ILV);
2225 
2226 protected:
2227  /// Collect the instructions from the original loop that would be trivially
2228  /// dead in the vectorized loop if generated.
2229  void collectTriviallyDeadInstructions(
2230  SmallPtrSetImpl<Instruction *> &DeadInstructions);
2231 
2232 private:
2233  /// The loop that we evaluate.
2234  Loop *OrigLoop;
2235 
2236  /// Loop Info analysis.
2237  LoopInfo *LI;
2238 
2239  /// The legality analysis.
2240  LoopVectorizationLegality *Legal;
2241 
2242  /// The profitability analysis.
2243  LoopVectorizationCostModel &CM;
2244 };
2245 
2246 /// \brief This holds vectorization requirements that must be verified late in
2247 /// the process. The requirements are set by legalize and costmodel. Once
2248 /// vectorization has been determined to be possible and profitable the
2249 /// requirements can be verified by looking for metadata or compiler options.
2250 /// For example, some loops require FP commutativity which is only allowed if
2251 /// vectorization is explicitly specified or if the fast-math compiler option
2252 /// has been provided.
2253 /// Late evaluation of these requirements allows helpful diagnostics to be
2254  /// composed that tell the user what needs to be done to vectorize the loop. For
2255  /// example, by specifying #pragma clang loop vectorize or -ffast-math. Late
2256  /// evaluation should be used only when diagnostics can be generated that can be
2257 /// followed by a non-expert user.
2258 class LoopVectorizationRequirements {
2259 public:
2260  LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE)
2261  : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr), ORE(ORE) {}
2262 
2263  void addUnsafeAlgebraInst(Instruction *I) {
2264  // First unsafe algebra instruction.
2265  if (!UnsafeAlgebraInst)
2266  UnsafeAlgebraInst = I;
2267  }
2268 
2269  void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
2270 
2271  bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
2272  const char *PassName = Hints.vectorizeAnalysisPassName();
2273  bool Failed = false;
2274  if (UnsafeAlgebraInst && !Hints.allowReordering()) {
2275  ORE.emit(
2276  OptimizationRemarkAnalysisFPCommute(PassName, "CantReorderFPOps",
2277  UnsafeAlgebraInst->getDebugLoc(),
2278  UnsafeAlgebraInst->getParent())
2279  << "loop not vectorized: cannot prove it is safe to reorder "
2280  "floating-point operations");
2281  Failed = true;
2282  }
2283 
2284  // Test if runtime memcheck thresholds are exceeded.
2285  bool PragmaThresholdReached =
2286  NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
2287  bool ThresholdReached =
2288  NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
2289  if ((ThresholdReached && !Hints.allowReordering()) ||
2290  PragmaThresholdReached) {
2291  ORE.emit(OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
2292  L->getStartLoc(),
2293  L->getHeader())
2294  << "loop not vectorized: cannot prove it is safe to reorder "
2295  "memory operations");
2296  DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
2297  Failed = true;
2298  }
2299 
2300  return Failed;
2301  }
2302 
2303 private:
2304  unsigned NumRuntimePointerChecks;
2305  Instruction *UnsafeAlgebraInst;
2306 
2307  /// Interface to emit optimization remarks.
2308  OptimizationRemarkEmitter &ORE;
2309 };
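// For illustration, a user can satisfy these late requirements either with a
// per-loop pragma or a compile-time flag, e.g.:
//
//   #pragma clang loop vectorize(enable) interleave_count(2)
//   for (int i = 0; i < n; ++i)
//     sum += a[i] * b[i];
//
// or by compiling with -ffast-math to permit floating-point reassociation.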
2310 
2311 static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
2312  if (L.empty()) {
2313  if (!hasCyclesInLoopBody(L))
2314  V.push_back(&L);
2315  return;
2316  }
2317  for (Loop *InnerL : L)
2318  addAcyclicInnerLoop(*InnerL, V);
2319 }
2320 
2321 /// The LoopVectorize Pass.
2322 struct LoopVectorize : public FunctionPass {
2323  /// Pass identification, replacement for typeid
2324  static char ID;
2325 
2326  explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
2327  : FunctionPass(ID) {
2328  Impl.DisableUnrolling = NoUnrolling;
2329  Impl.AlwaysVectorize = AlwaysVectorize;
2330  initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2331  }
2332 
2333  LoopVectorizePass Impl;
2334 
2335  bool runOnFunction(Function &F) override {
2336  if (skipFunction(F))
2337  return false;
2338 
2339  auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2340  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2341  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2342  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2343  auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2344  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2345  auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
2346  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2347  auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2348  auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2349  auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2350  auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2351 
2352  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2353  [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2354 
2355  return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2356  GetLAA, *ORE);
2357  }
2358 
2359  void getAnalysisUsage(AnalysisUsage &AU) const override {
2360  AU.addRequired<AssumptionCacheTracker>();
2361  AU.addRequired<BlockFrequencyInfoWrapperPass>();
2362  AU.addRequired<DominatorTreeWrapperPass>();
2363  AU.addRequired<LoopInfoWrapperPass>();
2364  AU.addRequired<ScalarEvolutionWrapperPass>();
2365  AU.addRequired<TargetTransformInfoWrapperPass>();
2366  AU.addRequired<AAResultsWrapperPass>();
2367  AU.addRequired<LoopAccessLegacyAnalysis>();
2368  AU.addRequired<DemandedBitsWrapperPass>();
2369  AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2370  AU.addPreserved<LoopInfoWrapperPass>();
2371  AU.addPreserved<DominatorTreeWrapperPass>();
2372  AU.addPreserved<BasicAAWrapperPass>();
2373  AU.addPreserved<GlobalsAAWrapperPass>();
2374  }
2375 };
2376 
2377 } // end anonymous namespace
2378 
2379 //===----------------------------------------------------------------------===//
2380 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2381 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2382 //===----------------------------------------------------------------------===//
2383 
2384 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2385  // We need to place the broadcast of invariant variables outside the loop.
2386  Instruction *Instr = dyn_cast<Instruction>(V);
2387  bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
2388  bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
2389 
2390  // Place the code for broadcasting invariant variables in the new preheader.
2391  IRBuilder<>::InsertPointGuard Guard(Builder);
2392  if (Invariant)
2393  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2394 
2395  // Broadcast the scalar into all locations in the vector.
2396  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2397 
2398  return Shuf;
2399 }
2400 
2401 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2402  const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
2403  Value *Start = II.getStartValue();
2404 
2405  // Construct the initial value of the vector IV in the vector loop preheader
2406  auto CurrIP = Builder.saveIP();
2407  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2408  if (isa<TruncInst>(EntryVal)) {
2409  assert(Start->getType()->isIntegerTy() &&
2410  "Truncation requires an integer type");
2411  auto *TruncType = cast<IntegerType>(EntryVal->getType());
2412  Step = Builder.CreateTrunc(Step, TruncType);
2413  Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2414  }
2415  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2416  Value *SteppedStart =
2417  getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2418 
2419  // We create vector phi nodes for both integer and floating-point induction
2420  // variables. Here, we determine the kind of arithmetic we will perform.
2421  Instruction::BinaryOps AddOp;
2422  Instruction::BinaryOps MulOp;
2423  if (Step->getType()->isIntegerTy()) {
2424  AddOp = Instruction::Add;
2425  MulOp = Instruction::Mul;
2426  } else {
2427  AddOp = II.getInductionOpcode();
2428  MulOp = Instruction::FMul;
2429  }
2430 
2431  // Multiply the vectorization factor by the step using integer or
2432  // floating-point arithmetic as appropriate.
2433  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
2434  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
2435 
2436  // Create a vector splat to use in the induction update.
2437  //
2438  // FIXME: If the step is non-constant, we create the vector splat with
2439  // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2440  // handle a constant vector splat.
2441  Value *SplatVF = isa<Constant>(Mul)
2442  ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2443  : Builder.CreateVectorSplat(VF, Mul);
2444  Builder.restoreIP(CurrIP);
2445 
2446  // We may need to add the step a number of times, depending on the unroll
2447  // factor. The last of those goes into the PHI.
2448  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2449  &*LoopVectorBody->getFirstInsertionPt());
2450  Instruction *LastInduction = VecInd;
2451  for (unsigned Part = 0; Part < UF; ++Part) {
2452  VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
2453  if (isa<TruncInst>(EntryVal))
2454  addMetadata(LastInduction, EntryVal);
2455  LastInduction = cast<Instruction>(addFastMathFlag(
2456  Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
2457  }
2458 
2459  // Move the last step to the end of the latch block. This ensures consistent
2460  // placement of all induction updates.
2461  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2462  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2463  auto *ICmp = cast<Instruction>(Br->getCondition());
2464  LastInduction->moveBefore(ICmp);
2465  LastInduction->setName("vec.ind.next");
2466 
2467  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2468  VecInd->addIncoming(LastInduction, LoopVectorLatch);
2469 }
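// As a rough sketch of the result for VF = 4, UF = 1 and an integer IV with
// step 1 (value names are illustrative):
//
//   vector.ph:
//     ; SteppedStart = <0, 1, 2, 3>
//   vector.body:
//     %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
//                              [ %vec.ind.next, %vector.body ]
//     ...
//     %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>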
2470 
2471 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2472  return Cost->isScalarAfterVectorization(I, VF) ||
2473  Cost->isProfitableToScalarize(I, VF);
2474 }
2475 
2476 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2477  if (shouldScalarizeInstruction(IV))
2478  return true;
2479  auto isScalarInst = [&](User *U) -> bool {
2480  auto *I = cast<Instruction>(U);
2481  return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2482  };
2483  return any_of(IV->users(), isScalarInst);
2484 }
2485 
2486 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
2487 
2488  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2489  "Primary induction variable must have an integer type");
2490 
2491  auto II = Legal->getInductionVars()->find(IV);
2492  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
2493 
2494  auto ID = II->second;
2495  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2496 
2497  // The scalar value to broadcast. This will be derived from the canonical
2498  // induction variable.
2499  Value *ScalarIV = nullptr;
2500 
2501  // The value from the original loop to which we are mapping the new induction
2502  // variable.
2503  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2504 
2505  // True if we have vectorized the induction variable.
2506  auto VectorizedIV = false;
2507 
2508  // Determine if we want a scalar version of the induction variable. This is
2509  // true if the induction variable itself is not widened, or if it has at
2510  // least one user in the loop that is not widened.
2511  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
2512 
2513  // Generate code for the induction step. Note that induction steps are
2514  // required to be loop-invariant
2515  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
2516  "Induction step should be loop invariant");
2517  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2518  Value *Step = nullptr;
2519  if (PSE.getSE()->isSCEVable(IV->getType())) {
2520  SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2521  Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
2522  LoopVectorPreHeader->getTerminator());
2523  } else {
2524  Step = cast<SCEVUnknown>(ID.getStep())->getValue();
2525  }
2526 
2527  // Try to create a new independent vector induction variable. If we can't
2528  // create the phi node, we will splat the scalar induction variable in each
2529  // loop iteration.
2530  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
2531  createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2532  VectorizedIV = true;
2533  }
2534 
2535  // If we haven't yet vectorized the induction variable, or if we will create
2536  // a scalar one, we need to define the scalar induction variable and step
2537  // values. If we were given a truncation type, truncate the canonical
2538  // induction variable and step. Otherwise, derive these values from the
2539  // induction descriptor.
2540  if (!VectorizedIV || NeedsScalarIV) {
2541  ScalarIV = Induction;
2542  if (IV != OldInduction) {
2543  ScalarIV = IV->getType()->isIntegerTy()
2544  ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2545  : Builder.CreateCast(Instruction::SIToFP, Induction,
2546  IV->getType());
2547  ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
2548  ScalarIV->setName("offset.idx");
2549  }
2550  if (Trunc) {
2551  auto *TruncType = cast<IntegerType>(Trunc->getType());
2552  assert(Step->getType()->isIntegerTy() &&
2553  "Truncation requires an integer step");
2554  ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2555  Step = Builder.CreateTrunc(Step, TruncType);
2556  }
2557  }
2558 
2559  // If we haven't yet vectorized the induction variable, splat the scalar
2560  // induction variable, and build the necessary step vectors.
2561  if (!VectorizedIV) {
2562  Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2563  for (unsigned Part = 0; Part < UF; ++Part) {
2564  Value *EntryPart =
2565  getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
2566  VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
2567  if (Trunc)
2568  addMetadata(EntryPart, Trunc);
2569  }
2570  }
2571 
2572  // If an induction variable is only used for counting loop iterations or
2573  // calculating addresses, it doesn't need to be widened. Create scalar steps
2574  // that can be used by instructions we will later scalarize. Note that the
2575  // addition of the scalar steps will not increase the number of instructions
2576  // in the loop in the common case prior to InstCombine. We will be trading
2577  // one vector extract for each scalar step.
2578  if (NeedsScalarIV)
2579  buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2580 }
2581 
2582 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2583  Instruction::BinaryOps BinOp) {
2584  // Create and check the types.
2585  assert(Val->getType()->isVectorTy() && "Must be a vector");
2586  int VLen = Val->getType()->getVectorNumElements();
2587 
2588  Type *STy = Val->getType()->getScalarType();
2589  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2590  "Induction Step must be an integer or FP");
2591  assert(Step->getType() == STy && "Step has wrong type");
2592 
2593  SmallVector<Constant *, 8> Indices;
2594 
2595  if (STy->isIntegerTy()) {
2596  // Create a vector of consecutive numbers from zero to VF.
2597  for (int i = 0; i < VLen; ++i)
2598  Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2599 
2600  // Add the consecutive indices to the vector value.
2601  Constant *Cv = ConstantVector::get(Indices);
2602  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2603  Step = Builder.CreateVectorSplat(VLen, Step);
2604  assert(Step->getType() == Val->getType() && "Invalid step vec");
2605  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2606  // which can be found from the original scalar operations.
2607  Step = Builder.CreateMul(Cv, Step);
2608  return Builder.CreateAdd(Val, Step, "induction");
2609  }
2610 
2611  // Floating point induction.
2612  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2613  "Binary Opcode should be specified for FP induction");
2614  // Create a vector of consecutive numbers from zero to VF.
2615  for (int i = 0; i < VLen; ++i)
2616  Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2617 
2618  // Add the consecutive indices to the vector value.
2619  Constant *Cv = ConstantVector::get(Indices);
2620 
2621  Step = Builder.CreateVectorSplat(VLen, Step);
2622 
2623  // Floating point operations had to be 'fast' to enable the induction.
2624  FastMathFlags Flags;
2625  Flags.setUnsafeAlgebra();
2626 
2627  Value *MulOp = Builder.CreateFMul(Cv, Step);
2628  if (isa<Instruction>(MulOp))
2629  // Have to check, MulOp may be a constant
2630  cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2631 
2632  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2633  if (isa<Instruction>(BOp))
2634  cast<Instruction>(BOp)->setFastMathFlags(Flags);
2635  return BOp;
2636 }
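// Worked example: for an integer induction with Val = <v, v, v, v>,
// StartIdx = 0 and Step = S, the code above forms Cv = <0, 1, 2, 3>, splats
// S, and returns <v + 0*S, v + 1*S, v + 2*S, v + 3*S>; for unroll part P the
// caller passes StartIdx = VF * P so the parts do not overlap.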
2637 
2638 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2639  Value *EntryVal,
2640  const InductionDescriptor &ID) {
2641 
2642  // We shouldn't have to build scalar steps if we aren't vectorizing.
2643  assert(VF > 1 && "VF should be greater than one");
2644 
2645  // Get the value type and ensure it and the step have the same integer type.
2646  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2647  assert(ScalarIVTy == Step->getType() &&
2648  "Val and Step should have the same type");
2649 
2650  // We build scalar steps for both integer and floating-point induction
2651  // variables. Here, we determine the kind of arithmetic we will perform.
2652  Instruction::BinaryOps AddOp;
2653  Instruction::BinaryOps MulOp;
2654  if (ScalarIVTy->isIntegerTy()) {
2655  AddOp = Instruction::Add;
2656  MulOp = Instruction::Mul;
2657  } else {
2658  AddOp = ID.getInductionOpcode();
2659  MulOp = Instruction::FMul;
2660  }
2661 
2662  // Determine the number of scalars we need to generate for each unroll
2663  // iteration. If EntryVal is uniform, we only need to generate the first
2664  // lane. Otherwise, we generate all VF values.
2665  unsigned Lanes =
2666  Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 : VF;
2667 
2668  // Compute the scalar steps and save the results in VectorLoopValueMap.
2669  for (unsigned Part = 0; Part < UF; ++Part) {
2670  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2671  auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
2672  auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2673  auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2674  VectorLoopValueMap.setScalarValue(EntryVal, Part, Lane, Add);
2675  }
2676  }
2677 }
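// Worked example: with VF = 4 and UF = 2, the scalar step for unroll part P
// and lane L is ScalarIV + (4 * P + L) * Step, i.e. offsets 0..3 for part 0
// and 4..7 for part 1; only lane 0 of each part is emitted when EntryVal is
// uniform after vectorization.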
2678 
2679 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
2680 
2681  const ValueToValueMap &Strides = getSymbolicStrides() ? *getSymbolicStrides() :
2682  ValueToValueMap();
2683 
2684  int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false);
2685  if (Stride == 1 || Stride == -1)
2686  return Stride;
2687  return 0;
2688 }
2689 
2690 bool LoopVectorizationLegality::isUniform(Value *V) {
2691  return LAI->isUniform(V);
2692 }
2693 
2694 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2695  assert(V != Induction && "The new induction variable should not be used.");
2696  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2697  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2698 
2699  // If we have a stride that is replaced by one, do it here.
2700  if (Legal->hasStride(V))
2701  V = ConstantInt::get(V->getType(), 1);
2702 
2703  // If we have a vector mapped to this value, return it.
2704  if (VectorLoopValueMap.hasVectorValue(V, Part))
2705  return VectorLoopValueMap.getVectorValue(V, Part);
2706 
2707  // If the value has not been vectorized, check if it has been scalarized
2708  // instead. If it has been scalarized, and we actually need the value in
2709  // vector form, we will construct the vector values on demand.
2710  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2711 
2712  Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, Part, 0);
2713 
2714  // If we've scalarized a value, that value should be an instruction.
2715  auto *I = cast<Instruction>(V);
2716 
2717  // If we aren't vectorizing, we can just copy the scalar map values over to
2718  // the vector map.
2719  if (VF == 1) {
2720  VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2721  return ScalarValue;
2722  }
2723 
2724  // Get the last scalar instruction we generated for V and Part. If the value
2725  // is known to be uniform after vectorization, this corresponds to lane zero
2726  // of the Part unroll iteration. Otherwise, the last instruction is the one
2727  // we created for the last vector lane of the Part unroll iteration.
2728  unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2729  auto *LastInst =
2730  cast<Instruction>(VectorLoopValueMap.getScalarValue(V, Part, LastLane));
2731 
2732  // Set the insert point after the last scalarized instruction. This ensures
2733  // the insertelement sequence will directly follow the scalar definitions.
2734  auto OldIP = Builder.saveIP();
2735  auto NewIP = std::next(BasicBlock::iterator(LastInst));
2736  Builder.SetInsertPoint(&*NewIP);
2737 
2738  // However, if we are vectorizing, we need to construct the vector values.
2739  // If the value is known to be uniform after vectorization, we can just
2740  // broadcast the scalar value corresponding to lane zero for each unroll
2741  // iteration. Otherwise, we construct the vector values using insertelement
2742  // instructions. Since the resulting vectors are stored in
2743  // VectorLoopValueMap, we will only generate the insertelements once.
2744  Value *VectorValue = nullptr;
2745  if (Cost->isUniformAfterVectorization(I, VF)) {
2746  VectorValue = getBroadcastInstrs(ScalarValue);
2747  } else {
2748  VectorValue = UndefValue::get(VectorType::get(V->getType(), VF));
2749  for (unsigned Lane = 0; Lane < VF; ++Lane)
2750  VectorValue = Builder.CreateInsertElement(
2751  VectorValue, getOrCreateScalarValue(V, Part, Lane),
2752  Builder.getInt32(Lane));
2753  }
2754  VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2755  Builder.restoreIP(OldIP);
2756  return VectorValue;
2757  }
2758 
2759  // If this scalar is unknown, assume that it is a constant or that it is
2760  // loop invariant. Broadcast V and save the value for future uses.
2761  Value *B = getBroadcastInstrs(V);
2762  VectorLoopValueMap.setVectorValue(V, Part, B);
2763  return B;
2764 }
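// Editorial sketch (not part of the original source; names are illustrative):
// for VF = 4, a value that was scalarized as %s0..%s3 and is later needed in
// vector form is reassembled on demand with insertelement instructions:
//   %v.0 = insertelement <4 x i32> undef, i32 %s0, i32 0
//   %v.1 = insertelement <4 x i32> %v.0, i32 %s1, i32 1
//   %v.2 = insertelement <4 x i32> %v.1, i32 %s2, i32 2
//   %v.3 = insertelement <4 x i32> %v.2, i32 %s3, i32 3
// If the value is uniform after vectorization, lane zero is broadcast instead.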
2765 
2766 Value *InnerLoopVectorizer::getOrCreateScalarValue(Value *V, unsigned Part,
2767  unsigned Lane) {
2768 
2769  // If the value is not an instruction contained in the loop, it should
2770  // already be scalar.
2771  if (OrigLoop->isLoopInvariant(V))
2772  return V;
2773 
2774  assert(Lane > 0 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2775  : true && "Uniform values only have lane zero");
2776 
2777  // If the value from the original loop has not been vectorized, it is
2778  // represented by UF x VF scalar values in the new loop. Return the requested
2779  // scalar value.
2780  if (VectorLoopValueMap.hasScalarValue(V, Part, Lane))
2781  return VectorLoopValueMap.getScalarValue(V, Part, Lane);
2782 
2783  // If the value has not been scalarized, get its entry in VectorLoopValueMap
2784  // for the given unroll part. If this entry is not a vector type (i.e., the
2785  // vectorization factor is one), there is no need to generate an
2786  // extractelement instruction.
2787  auto *U = getOrCreateVectorValue(V, Part);
2788  if (!U->getType()->isVectorTy()) {
2789  assert(VF == 1 && "Value not scalarized has non-vector type");
2790  return U;
2791  }
2792 
2793  // Otherwise, the value from the original loop has been vectorized and is
2794  // represented by UF vector values. Extract and return the requested scalar
2795  // value from the appropriate vector lane.
2796  return Builder.CreateExtractElement(U, Builder.getInt32(Lane));
2797 }
2798 
2799 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2800  assert(Vec->getType()->isVectorTy() && "Invalid type");
2801  SmallVector<Constant *, 8> ShuffleMask;
2802  for (unsigned i = 0; i < VF; ++i)
2803  ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2804 
2805  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2806  ConstantVector::get(ShuffleMask),
2807  "reverse");
2808 }
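// Editorial example (not part of the original source): for VF = 4 the mask
// built above is <3, 2, 1, 0>, so the emitted instruction is roughly
//   %reverse = shufflevector <4 x i32> %vec, <4 x i32> undef,
//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>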
2809 
2810 // Try to vectorize the interleave group that \p Instr belongs to.
2811 //
2812 // E.g. Translate the following interleaved load group (factor = 3):
2813 // for (i = 0; i < N; i+=3) {
2814 // R = Pic[i]; // Member of index 0
2815 // G = Pic[i+1]; // Member of index 1
2816 // B = Pic[i+2]; // Member of index 2
2817 // ... // do something to R, G, B
2818 // }
2819 // To:
2820 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2821 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
2822 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
2823 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
2824 //
2825 // Or translate the following interleaved store group (factor = 3):
2826 // for (i = 0; i < N; i+=3) {
2827 // ... do something to R, G, B
2828 // Pic[i] = R; // Member of index 0
2829 // Pic[i+1] = G; // Member of index 1
2830 // Pic[i+2] = B; // Member of index 2
2831 // }
2832 // To:
2833 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2834 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2835 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2836 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2837 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2838 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
2839  const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
2840  assert(Group && "Fail to get an interleaved access group.");
2841 
2842  // Skip if current instruction is not the insert position.
2843  if (Instr != Group->getInsertPos())
2844  return;
2845 
2846  Value *Ptr = getPointerOperand(Instr);
2847 
2848  // Prepare for the vector type of the interleaved load/store.
2849  Type *ScalarTy = getMemInstValueType(Instr);
2850  unsigned InterleaveFactor = Group->getFactor();
2851  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2852  Type *PtrTy = VecTy->getPointerTo(getMemInstAddressSpace(Instr));
2853 
2854  // Prepare for the new pointers.
2855  setDebugLocFromInst(Builder, Ptr);
2856  SmallVector<Value *, 2> NewPtrs;
2857  unsigned Index = Group->getIndex(Instr);
2858 
2859  // If the group is reverse, adjust the index to refer to the last vector lane
2860  // instead of the first. We adjust the index from the first vector lane,
2861  // rather than directly getting the pointer for lane VF - 1, because the
2862  // pointer operand of the interleaved access is supposed to be uniform. For
2863  // uniform instructions, we're only required to generate a value for the
2864  // first vector lane in each unroll iteration.
2865  if (Group->isReverse())
2866  Index += (VF - 1) * Group->getFactor();
2867 
2868  for (unsigned Part = 0; Part < UF; Part++) {
2869  Value *NewPtr = getOrCreateScalarValue(Ptr, Part, 0);
2870 
2871  // Note that the current instruction could be at any member index. We need
2872  // to adjust the address to the member of index 0.
2873  //
2874  // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2875  // b = A[i]; // Member of index 0
2876  // The current pointer points to A[i+1]; adjust it to A[i].
2877  //
2878  // E.g. A[i+1] = a; // Member of index 1
2879  // A[i] = b; // Member of index 0
2880  // A[i+2] = c; // Member of index 2 (Current instruction)
2881  // The current pointer points to A[i+2]; adjust it to A[i].
2882  NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
2883 
2884  // Cast to the vector pointer type.
2885  NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2886  }
2887 
2888  setDebugLocFromInst(Builder, Instr);
2889  Value *UndefVec = UndefValue::get(VecTy);
2890 
2891  // Vectorize the interleaved load group.
2892  if (isa<LoadInst>(Instr)) {
2893 
2894  // For each unroll part, create a wide load for the group.
2895  SmallVector<Value *, 2> NewLoads;
2896  for (unsigned Part = 0; Part < UF; Part++) {
2897  auto *NewLoad = Builder.CreateAlignedLoad(
2898  NewPtrs[Part], Group->getAlignment(), "wide.vec");
2899  addMetadata(NewLoad, Instr);
2900  NewLoads.push_back(NewLoad);
2901  }
2902 
2903  // For each member in the group, shuffle out the appropriate data from the
2904  // wide loads.
2905  for (unsigned I = 0; I < InterleaveFactor; ++I) {
2906  Instruction *Member = Group->getMember(I);
2907 
2908  // Skip the gaps in the group.
2909  if (!Member)
2910  continue;
2911 
2912  Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2913  for (unsigned Part = 0; Part < UF; Part++) {
2914  Value *StridedVec = Builder.CreateShuffleVector(
2915  NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2916 
2917  // If this member has a different type, cast the result to that type.
2918  if (Member->getType() != ScalarTy) {
2919  VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2920  StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);
2921  }
2922 
2923  if (Group->isReverse())
2924  StridedVec = reverseVector(StridedVec);
2925 
2926  VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2927  }
2928  }
2929  return;
2930  }
2931 
2932  // The sub vector type for current instruction.
2933  VectorType *SubVT = VectorType::get(ScalarTy, VF);
2934 
2935  // Vectorize the interleaved store group.
2936  for (unsigned Part = 0; Part < UF; Part++) {
2937  // Collect the stored vector from each member.
2938  SmallVector<Value *, 4> StoredVecs;
2939  for (unsigned i = 0; i < InterleaveFactor; i++) {
2940  // An interleaved store group doesn't allow gaps, so each index has a member.
2941  Instruction *Member = Group->getMember(i);
2942  assert(Member && "Fail to get a member from an interleaved store group");
2943 
2944  Value *StoredVec = getOrCreateVectorValue(
2945  cast<StoreInst>(Member)->getValueOperand(), Part);
2946  if (Group->isReverse())
2947  StoredVec = reverseVector(StoredVec);
2948 
2949  // If this member has a different type, cast it to a unified type.
2950  if (StoredVec->getType() != SubVT)
2951  StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT);
2952 
2953  StoredVecs.push_back(StoredVec);
2954  }
2955 
2956  // Concatenate all vectors into a wide vector.
2957  Value *WideVec = concatenateVectors(Builder, StoredVecs);
2958 
2959  // Interleave the elements in the wide vector.
2960  Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2961  Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2962  "interleaved.vec");
2963 
2964  Instruction *NewStoreInstr =
2965  Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
2966  addMetadata(NewStoreInstr, Instr);
2967  }
2968 }
2969 
2970 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
2971  // Attempt to issue a wide load.
2972  LoadInst *LI = dyn_cast<LoadInst>(Instr);
2973  StoreInst *SI = dyn_cast<StoreInst>(Instr);
2974 
2975  assert((LI || SI) && "Invalid Load/Store instruction");
2976 
2977  LoopVectorizationCostModel::InstWidening Decision =
2978  Cost->getWideningDecision(Instr, VF);
2979  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2980  "CM decision should be taken at this point");
2981  if (Decision == LoopVectorizationCostModel::CM_Interleave)
2982  return vectorizeInterleaveGroup(Instr);
2983 
2984  Type *ScalarDataTy = getMemInstValueType(Instr);
2985  Type *DataTy = VectorType::get(ScalarDataTy, VF);
2986  Value *Ptr = getPointerOperand(Instr);
2987  unsigned Alignment = getMemInstAlignment(Instr);
2988  // An alignment of 0 means target ABI alignment. We need to use the scalar's
2989  // target ABI alignment in such a case.
2990  const DataLayout &DL = Instr->getModule()->getDataLayout();
2991  if (!Alignment)
2992  Alignment = DL.getABITypeAlignment(ScalarDataTy);
2993  unsigned AddressSpace = getMemInstAddressSpace(Instr);
2994 
2995  // Scalarize the memory instruction if necessary.
2996  if (Decision == LoopVectorizationCostModel::CM_Scalarize)
2997  return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr));
2998 
2999  // Determine if the pointer operand of the access is either consecutive or
3000  // reverse consecutive.
3001  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
3002  bool Reverse = ConsecutiveStride < 0;
3003  bool CreateGatherScatter =
3004  (Decision == LoopVectorizationCostModel::CM_GatherScatter);
3005 
3006  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
3007  // gather/scatter. Otherwise Decision should have been to Scalarize.
3008  assert((ConsecutiveStride || CreateGatherScatter) &&
3009  "The instruction should be scalarized");
3010 
3011  // Handle consecutive loads/stores.
3012  if (ConsecutiveStride)
3013  Ptr = getOrCreateScalarValue(Ptr, 0, 0);
3014 
3015  VectorParts Mask = createBlockInMask(Instr->getParent());
3016  // Handle Stores:
3017  if (SI) {
3018  assert(!Legal->isUniform(SI->getPointerOperand()) &&
3019  "We do not allow storing to uniform addresses");
3020  setDebugLocFromInst(Builder, SI);
3021 
3022  for (unsigned Part = 0; Part < UF; ++Part) {
3023  Instruction *NewSI = nullptr;
3024  Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
3025  if (CreateGatherScatter) {
3026  Value *MaskPart = Legal->isMaskRequired(SI) ? Mask[Part] : nullptr;
3027  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
3028  NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
3029  MaskPart);
3030  } else {
3031  // Calculate the pointer for the specific unroll-part.
3032  Value *PartPtr =
3033  Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
3034 
3035  if (Reverse) {
3036  // If we store to reverse consecutive memory locations, then we need
3037  // to reverse the order of elements in the stored value.
3038  StoredVal = reverseVector(StoredVal);
3039  // We don't want to update the value in the map as it might be used in
3040  // another expression. So don't call resetVectorValue(StoredVal).
3041 
3042  // If the address is consecutive but reversed, then the
3043  // wide store needs to start at the last vector element.
3044  PartPtr =
3045  Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
3046  PartPtr =
3047  Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
3048  Mask[Part] = reverseVector(Mask[Part]);
3049  }
3050 
3051  Value *VecPtr =
3052  Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
3053 
3054  if (Legal->isMaskRequired(SI))
3055  NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
3056  Mask[Part]);
3057  else
3058  NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
3059  }
3060  addMetadata(NewSI, SI);
3061  }
3062  return;
3063  }
3064 
3065  // Handle loads.
3066  assert(LI && "Must have a load instruction");
3067  setDebugLocFromInst(Builder, LI);
3068  for (unsigned Part = 0; Part < UF; ++Part) {
3069  Value *NewLI;
3070  if (CreateGatherScatter) {
3071  Value *MaskPart = Legal->isMaskRequired(LI) ? Mask[Part] : nullptr;
3072  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
3073  NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
3074  nullptr, "wide.masked.gather");
3075  addMetadata(NewLI, LI);
3076  } else {
3077  // Calculate the pointer for the specific unroll-part.
3078  Value *PartPtr =
3079  Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
3080 
3081  if (Reverse) {
3082  // If the address is consecutive but reversed, then the
3083  // wide load needs to start at the last vector element.
3084  PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
3085  PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
3086  Mask[Part] = reverseVector(Mask[Part]);
3087  }
3088 
3089  Value *VecPtr =
3090  Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
3091  if (Legal->isMaskRequired(LI))
3092  NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
3093  UndefValue::get(DataTy),
3094  "wide.masked.load");
3095  else
3096  NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
3097 
3098  // Add metadata to the load, but setVectorValue to the reverse shuffle.
3099  addMetadata(NewLI, LI);
3100  if (Reverse)
3101  NewLI = reverseVector(NewLI);
3102  }
3103  VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
3104  }
3105 }
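// Editorial sketch of the reverse-consecutive addressing above (illustrative
// numbers, not from the original source): with VF = 4 and UF = 2, unroll part
// Part starts its wide access at Ptr + (-Part * VF) + (1 - VF), so Part 0
// covers elements [Ptr - 3 .. Ptr] and Part 1 covers [Ptr - 7 .. Ptr - 4];
// the loaded or stored vector is then reversed to restore the original
// element order.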
3106 
3107 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
3108  bool IfPredicateInstr) {
3109  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
3110  DEBUG(dbgs() << "LV: Scalarizing"
3111  << (IfPredicateInstr ? " and predicating:" : ":") << *Instr
3112  << '\n');
3113  // Holds vector parameters or scalars, in case of uniform vals.
3114  SmallVector<VectorParts, 4> Params;
3115 
3116  setDebugLocFromInst(Builder, Instr);
3117 
3118  // Does this instruction return a value?
3119  bool IsVoidRetTy = Instr->getType()->isVoidTy();
3120 
3121  VectorParts Cond;
3122  if (IfPredicateInstr)
3123  Cond = createBlockInMask(Instr->getParent());
3124 
3125  // Determine the number of scalars we need to generate for each unroll
3126  // iteration. If the instruction is uniform, we only need to generate the
3127  // first lane. Otherwise, we generate all VF values.
3128  unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF;
3129 
3130  // For each vector unroll 'part':
3131  for (unsigned Part = 0; Part < UF; ++Part) {
3132  // For each scalar that we create:
3133  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
3134 
3135  // Start if-block.
3136  Value *Cmp = nullptr;
3137  if (IfPredicateInstr) {
3138  Cmp = Cond[Part];
3139  if (Cmp->getType()->isVectorTy())
3140  Cmp = Builder.CreateExtractElement(Cmp, Builder.getInt32(Lane));
3141  Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
3142  ConstantInt::get(Cmp->getType(), 1));
3143  }
3144 
3145  Instruction *Cloned = Instr->clone();
3146  if (!IsVoidRetTy)
3147  Cloned->setName(Instr->getName() + ".cloned");
3148 
3149  // Replace the operands of the cloned instructions with their scalar
3150  // equivalents in the new loop.
3151  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
3152  auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Part, Lane);
3153  Cloned->setOperand(op, NewOp);
3154  }
3155  addNewMetadata(Cloned, Instr);
3156 
3157  // Place the cloned scalar in the new loop.
3158  Builder.Insert(Cloned);
3159 
3160  // Add the cloned scalar to the scalar map entry.
3161  VectorLoopValueMap.setScalarValue(Instr, Part, Lane, Cloned);
3162 
3163  // If we just cloned a new assumption, add it to the assumption cache.
3164  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
3165  if (II->getIntrinsicID() == Intrinsic::assume)
3166  AC->registerAssumption(II);
3167 
3168  // End if-block.
3169  if (IfPredicateInstr)
3170  PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
3171  }
3172  }
3173 }
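// Editorial example (assumed values, not from the original source): with
// VF = 2 and UF = 1, scalarizing a predicated '%d = sdiv i32 %a, %b' clones
// the instruction once per lane and pairs each clone with its mask bit:
//   %c0 = extractelement <2 x i1> %mask, i32 0
//   %d0 = sdiv i32 %a0, %b0
//   %c1 = extractelement <2 x i1> %mask, i32 1
//   %d1 = sdiv i32 %a1, %b1
// The guarding if-blocks themselves are materialized later by
// predicateInstructions().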
3174 
3175 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
3176  Value *End, Value *Step,
3177  Instruction *DL) {
3178  BasicBlock *Header = L->getHeader();
3179  BasicBlock *Latch = L->getLoopLatch();
3180  // As we're just creating this loop, it's possible no latch exists
3181  // yet. If so, use the header as this will be a single block loop.
3182  if (!Latch)
3183  Latch = Header;
3184 
3185  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
3186  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
3187  setDebugLocFromInst(Builder, OldInst);
3188  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
3189 
3190  Builder.SetInsertPoint(Latch->getTerminator());
3191  setDebugLocFromInst(Builder, OldInst);
3192 
3193  // Create i+1 and fill the PHINode.
3194  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
3195  Induction->addIncoming(Start, L->getLoopPreheader());
3196  Induction->addIncoming(Next, Latch);
3197  // Create the compare.
3198  Value *ICmp = Builder.CreateICmpEQ(Next, End);
3199  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
3200 
3201  // Now we have two terminators. Remove the old one from the block.
3202  Latch->getTerminator()->eraseFromParent();
3203 
3204  return Induction;
3205 }
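// Editorial sketch of the induction skeleton built above (assumed names,
// Step = VF * UF):
//   %index      = phi i64 [ %start, %preheader ], [ %index.next, %latch ]
//   ...
//   %index.next = add i64 %index, Step
//   %cmp        = icmp eq i64 %index.next, %end
//   br i1 %cmp, label %exit, label %header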
3206 
3207 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3208  if (TripCount)
3209  return TripCount;
3210 
3211  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3212  // Find the loop boundaries.
3213  ScalarEvolution *SE = PSE.getSE();
3214  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3215  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
3216  "Invalid loop count");
3217 
3218  Type *IdxTy = Legal->getWidestInductionType();
3219 
3220  // The exit count might have the type of i64 while the phi is i32. This can
3221  // happen if we have an induction variable that is sign extended before the
3222  // compare. The only way that we get a backedge taken count is that the
3223  // induction variable was signed and as such will not overflow. In such a case
3224  // truncation is legal.
3225  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
3226  IdxTy->getPrimitiveSizeInBits())
3227  BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3228  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3229 
3230  // Get the total trip count from the count by adding 1.
3231  const SCEV *ExitCount = SE->getAddExpr(
3232  BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3233 
3234  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3235 
3236  // Expand the trip count and place the new instructions in the preheader.
3237  // Notice that the pre-header does not change, only the loop body.
3238  SCEVExpander Exp(*SE, DL, "induction");
3239 
3240  // Count holds the overall loop count (N).
3241  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3242  L->getLoopPreheader()->getTerminator());
3243 
3244  if (TripCount->getType()->isPointerTy())
3245  TripCount =
3246  CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3247  L->getLoopPreheader()->getTerminator());
3248 
3249  return TripCount;
3250 }
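// Editorial numeric example (not from the original source): if the
// backedge-taken count is 9 (the loop body runs 10 times), the expanded trip
// count is BackedgeTakenCount + 1 = 10.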
3251 
3252 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3253  if (VectorTripCount)
3254  return VectorTripCount;
3255 
3256  Value *TC = getOrCreateTripCount(L);
3257  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3258 
3259  // Now we need to generate the expression for the part of the loop that the
3260  // vectorized body will execute. This is equal to N - (N % Step) if scalar
3261  // iterations are not required for correctness, or N - Step, otherwise. Step
3262  // is equal to the vectorization factor (number of SIMD elements) times the
3263  // unroll factor (number of SIMD instructions).
3264  Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
3265  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3266 
3267  // If there is a non-reversed interleaved group that may speculatively access
3268  // memory out-of-bounds, we need to ensure that there will be at least one
3269  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
3270  // the trip count, we set the remainder to be equal to the step. If the step
3271  // does not evenly divide the trip count, no adjustment is necessary since
3272  // there will already be scalar iterations. Note that the minimum iterations
3273  // check ensures that N >= Step.
3274  if (VF > 1 && Legal->requiresScalarEpilogue()) {
3275  auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3276  R = Builder.CreateSelect(IsZero, Step, R);
3277  }
3278 
3279  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3280 
3281  return VectorTripCount;
3282 }
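// Editorial numeric example (assumed VF = 4, UF = 1, so Step = 4):
//   N = 10: n.mod.vf = 10 % 4 = 2 and n.vec = 10 - 2 = 8.
//   N = 8 with a required scalar epilogue: the zero remainder is bumped up to
//   Step, so n.vec = 8 - 4 = 4 and the scalar loop runs the last 4 iterations.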
3283 
3284 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3285  BasicBlock *Bypass) {
3286  Value *Count = getOrCreateTripCount(L);
3287  BasicBlock *BB = L->getLoopPreheader();
3288  IRBuilder<> Builder(BB->getTerminator());
3289 
3290  // Generate code to check if the loop's trip count is less than VF * UF, or
3291  // equal to it in case a scalar epilogue is required; this implies that the
3292  // vector trip count is zero. This check also covers the case where adding one
3293  // to the backedge-taken count overflowed leading to an incorrect trip count
3294  // of zero. In this case we will also jump to the scalar loop.
3295  auto P = Legal->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
3296  : ICmpInst::ICMP_ULT;
3297  Value *CheckMinIters = Builder.CreateICmp(
3298  P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
3299 
3300  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
3301  // Update dominator tree immediately if the generated block is a
3302  // LoopBypassBlock because SCEV expansions to generate loop bypass
3303  // checks may query it before the current function is finished.
3304  DT->addNewBlock(NewBB, BB);
3305  if (L->getParentLoop())
3306  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3307  ReplaceInstWithInst(BB->getTerminator(),
3308  BranchInst::Create(Bypass, NewBB, CheckMinIters));
3309  LoopBypassBlocks.push_back(BB);
3310 }
3311 
3312 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3313  BasicBlock *BB = L->getLoopPreheader();
3314 
3315  // Generate the code to check the SCEV assumptions that we made.
3316  // We want the new basic block to start at the first instruction in a
3317  // sequence of instructions that form a check.
3318  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
3319  "scev.check");
3320  Value *SCEVCheck =
3321  Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
3322 
3323  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
3324  if (C->isZero())
3325  return;
3326 
3327  // Create a new block containing the stride check.
3328  BB->setName("vector.scevcheck");
3329  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
3330  // Update dominator tree immediately if the generated block is a
3331  // LoopBypassBlock because SCEV expansions to generate loop bypass
3332  // checks may query it before the current function is finished.
3333  DT->addNewBlock(NewBB, BB);
3334  if (L->getParentLoop())
3335  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3336  ReplaceInstWithInst(BB->getTerminator(),
3337  BranchInst::Create(Bypass, NewBB, SCEVCheck));
3338  LoopBypassBlocks.push_back(BB);
3339  AddedSafetyChecks = true;
3340 }
3341 
3342 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
3343  BasicBlock *BB = L->getLoopPreheader();
3344 
3345  // Generate the code that checks in runtime if arrays overlap. We put the
3346  // checks into a separate block to make the more common case of few elements
3347  // faster.
3348  Instruction *FirstCheckInst;
3349  Instruction *MemRuntimeCheck;
3350  std::tie(FirstCheckInst, MemRuntimeCheck) =
3351  Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
3352  if (!MemRuntimeCheck)
3353  return;
3354 
3355  // Create a new block containing the memory check.
3356  BB->setName("vector.memcheck");
3357  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
3358  // Update dominator tree immediately if the generated block is a
3359  // LoopBypassBlock because SCEV expansions to generate loop bypass
3360  // checks may query it before the current function is finished.
3361  DT->addNewBlock(NewBB, BB);
3362  if (L->getParentLoop())
3363  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3364  ReplaceInstWithInst(BB->getTerminator(),
3365  BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
3366  LoopBypassBlocks.push_back(BB);
3367  AddedSafetyChecks = true;
3368 
3369  // We currently don't use LoopVersioning for the actual loop cloning but we
3370  // still use it to add the noalias metadata.
3371  LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
3372  PSE.getSE());
3373  LVer->prepareNoAliasMetadata();
3374 }
3375 
3376 void InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3377  /*
3378  In this function we generate a new loop. The new loop will contain
3379  the vectorized instructions while the old loop will continue to run the
3380  scalar remainder.
3381 
3382  [ ] <-- loop iteration number check.
3383  / |
3384  / v
3385  | [ ] <-- vector loop bypass (may consist of multiple blocks).
3386  | / |
3387  | / v
3388  || [ ] <-- vector pre header.
3389  |/ |
3390  | v
3391  | [ ] \
3392  | [ ]_| <-- vector loop.
3393  | |
3394  | v
3395  | -[ ] <--- middle-block.
3396  | / |
3397  | / v
3398  -|- >[ ] <--- new preheader.
3399  | |
3400  | v
3401  | [ ] \
3402  | [ ]_| <-- old scalar loop to handle remainder.
3403  \ |
3404  \ v
3405  >[ ] <-- exit block.
3406  ...
3407  */
3408 
3409  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
3410  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
3411  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
3412  assert(VectorPH && "Invalid loop structure");
3413  assert(ExitBlock && "Must have an exit block");
3414 
3415  // Some loops have a single integer induction variable, while other loops
3416  // don't. One example is C++ iterators that often have multiple pointer
3417  // induction variables. In the code below we also support a case where we
3418  // don't have a single induction variable.
3419  //
3420  // We try to obtain an induction variable from the original loop as hard
3421  // as possible. However if we don't find one that:
3422  // - is an integer
3423  // - counts from zero, stepping by one
3424  // - is the size of the widest induction variable type
3425  // then we create a new one.
3426  OldInduction = Legal->getPrimaryInduction();
3427  Type *IdxTy = Legal->getWidestInductionType();
3428 
3429  // Split the single block loop into the two loop structure described above.
3430  BasicBlock *VecBody =
3431  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
3432  BasicBlock *MiddleBlock =
3433  VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
3434  BasicBlock *ScalarPH =
3435  MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
3436 
3437  // Create and register the new vector loop.
3438  Loop *Lp = new Loop();
3439  Loop *ParentLoop = OrigLoop->getParentLoop();
3440 
3441  // Insert the new loop into the loop nest and register the new basic blocks
3442  // before calling any utilities such as SCEV that require valid LoopInfo.
3443  if (ParentLoop) {
3444  ParentLoop->addChildLoop(Lp);
3445  ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
3446  ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
3447  } else {
3448  LI->addTopLevelLoop(Lp);
3449  }
3450  Lp->addBasicBlockToLoop(VecBody, *LI);
3451 
3452  // Find the loop boundaries.
3453  Value *Count = getOrCreateTripCount(Lp);
3454 
3455  Value *StartIdx = ConstantInt::get(IdxTy, 0);
3456 
3457  // Now, compare the new count to zero. If it is zero skip the vector loop and
3458  // jump to the scalar loop. This check also covers the case where the
3459  // backedge-taken count is uint##_max: adding one to it will overflow leading
3460  // to an incorrect trip count of zero. In this (rare) case we will also jump
3461  // to the scalar loop.
3462  emitMinimumIterationCountCheck(Lp, ScalarPH);
3463 
3464  // Generate the code to check any assumptions that we've made for SCEV
3465  // expressions.
3466  emitSCEVChecks(Lp, ScalarPH);
3467 
3468  // Generate the code that checks in runtime if arrays overlap. We put the
3469  // checks into a separate block to make the more common case of few elements
3470  // faster.
3471  emitMemRuntimeChecks(Lp, ScalarPH);
3472 
3473  // Generate the induction variable.
3474  // The loop step is equal to the vectorization factor (num of SIMD elements)
3475  // times the unroll factor (num of SIMD instructions).
3476  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3477  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3478  Induction =
3479  createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3480  getDebugLocFromInstOrOperands(OldInduction));
3481 
3482  // We are going to resume the execution of the scalar loop.
3483  // Go over all of the induction variables that we found and fix the
3484  // PHIs that are left in the scalar version of the loop.
3485  // The starting values of PHI nodes depend on the counter of the last
3486  // iteration in the vectorized loop.
3487  // If we come from a bypass edge then we need to start from the original
3488  // start value.
3489 
3490  // This variable saves the new starting index for the scalar loop. It is used
3491  // to test if there are any tail iterations left once the vector loop has
3492  // completed.
3493  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
3494  for (auto &InductionEntry : *List) {
3495  PHINode *OrigPhi = InductionEntry.first;
3496  InductionDescriptor II = InductionEntry.second;
3497 
3498  // Create phi nodes to merge from the backedge-taken check block.
3499  PHINode *BCResumeVal = PHINode::Create(
3500  OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
3501  Value *&EndValue = IVEndValues[OrigPhi];
3502  if (OrigPhi == OldInduction) {
3503  // We know what the end value is.
3504  EndValue = CountRoundDown;
3505  } else {
3506  IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());
3507  Type *StepType = II.getStep()->getType();
3508  Instruction::CastOps CastOp =
3509  CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3510  Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3511  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3512  EndValue = II.transform(B, CRD, PSE.getSE(), DL);
3513  EndValue->setName("ind.end");
3514  }
3515 
3516  // The new PHI merges the original incoming value, in case of a bypass,
3517  // or the value at the end of the vectorized loop.
3518  BCResumeVal->addIncoming(EndValue, MiddleBlock);
3519 
3520  // Fix the scalar body counter (PHI node).
3521  unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
3522 
3523  // The old induction's phi node in the scalar body needs the truncated
3524  // value.
3525  for (BasicBlock *BB : LoopBypassBlocks)
3526  BCResumeVal->addIncoming(II.getStartValue(), BB);
3527  OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
3528  }
3529 
3530  // Add a check in the middle block to see if we have completed
3531  // all of the iterations in the first vector loop.
3532  // If (N - N%VF) == N, then we *don't* need to run the remainder.
3533  Value *CmpN =
3534  CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3535  CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3536  ReplaceInstWithInst(MiddleBlock->getTerminator(),
3537  BranchInst::Create(ExitBlock, ScalarPH, CmpN));
3538 
3539  // Get ready to start creating new instructions into the vectorized body.
3540  Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
3541 
3542  // Save the state.
3543  LoopVectorPreHeader = Lp->getLoopPreheader();
3544  LoopScalarPreHeader = ScalarPH;
3545  LoopMiddleBlock = MiddleBlock;
3546  LoopExitBlock = ExitBlock;
3547  LoopVectorBody = VecBody;
3548  LoopScalarBody = OldBasicBlock;
3549 
3550  // Keep all loop hints from the original loop on the vector loop (we'll
3551  // replace the vectorizer-specific hints below).
3552  if (MDNode *LID = OrigLoop->getLoopID())
3553  Lp->setLoopID(LID);
3554 
3555  LoopVectorizeHints Hints(Lp, true, *ORE);
3556  Hints.setAlreadyVectorized();
3557 }
3558 
3559 // Fix up external users of the induction variable. At this point, we are
3560 // in LCSSA form, with all external PHIs that use the IV having one input value,
3561 // coming from the remainder loop. We need those PHIs to also have a correct
3562 // value for the IV when arriving directly from the middle block.
3563 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3564  const InductionDescriptor &II,
3565  Value *CountRoundDown, Value *EndValue,
3566  BasicBlock *MiddleBlock) {
3567  // There are two kinds of external IV usages - those that use the value
3568  // computed in the last iteration (the PHI) and those that use the penultimate
3569  // value (the value that feeds into the phi from the loop latch).
3570  // We allow both, but they, obviously, have different values.
3571 
3572  assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3573 
3574  DenseMap<Value *, Value *> MissingVals;
3575 
3576  // An external user of the last iteration's value should see the value that
3577  // the remainder loop uses to initialize its own IV.
3578  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3579  for (User *U : PostInc->users()) {
3580  Instruction *UI = cast<Instruction>(U);
3581  if (!OrigLoop->contains(UI)) {
3582  assert(isa<PHINode>(UI) && "Expected LCSSA form");
3583  MissingVals[UI] = EndValue;
3584  }
3585  }
3586 
3587  // An external user of the penultimate value needs to see EndValue - Step.
3588  // The simplest way to get this is to recompute it from the constituent SCEVs,
3589  // that is Start + (Step * (CRD - 1)).
3590  for (User *U : OrigPhi->users()) {
3591  auto *UI = cast<Instruction>(U);
3592  if (!OrigLoop->contains(UI)) {
3593  const DataLayout &DL =
3594  OrigLoop->getHeader()->getModule()->getDataLayout();
3595  assert(isa<PHINode>(UI) && "Expected LCSSA form");
3596 
3597  IRBuilder<> B(MiddleBlock->getTerminator());
3598  Value *CountMinusOne = B.CreateSub(
3599  CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3600  Value *CMO =
3601  !II.getStep()->getType()->isIntegerTy()
3602  ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3603  II.getStep()->getType())
3604  : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3605  CMO->setName("cast.cmo");
3606  Value *Escape = II.transform(B, CMO, PSE.getSE(), DL);
3607  Escape->setName("ind.escape");
3608  MissingVals[UI] = Escape;
3609  }
3610  }
3611 
3612  for (auto &I : MissingVals) {
3613  PHINode *PHI = cast<PHINode>(I.first);
3614  // One corner case we have to handle is two IVs "chasing" each other,
3615  // that is %IV2 = phi [...], [ %IV1, %latch ]
3616  // In this case, if IV1 has an external use, we need to avoid adding both
3617  // "last value of IV1" and "penultimate value of IV2". So, verify that we
3618  // don't already have an incoming value for the middle block.
3619  if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3620  PHI->addIncoming(I.second, MiddleBlock);
3621  }
3622 }
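// Editorial example (assumed values, not from the original source): for an
// induction with Start = 0 and Step = 2, an external user of the penultimate
// value sees Start + Step * (CountRoundDown - 1); with CountRoundDown = 8 that
// is 0 + 2 * 7 = 14, materialized above as 'ind.escape'.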
3623 
3624 namespace {
3625 struct CSEDenseMapInfo {
3626  static bool canHandle(const Instruction *I) {
3627  return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3628  isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3629  }
3630  static inline Instruction *getEmptyKey() {
3631  return DenseMapInfo<Instruction *>::getEmptyKey();
3632  }
3633  static inline Instruction *getTombstoneKey() {
3634  return DenseMapInfo<Instruction *>::getTombstoneKey();
3635  }
3636  static unsigned getHashValue(const Instruction *I) {
3637  assert(canHandle(I) && "Unknown instruction!");
3638  return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3639  I->value_op_end()));
3640  }
3641  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3642  if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3643  LHS == getTombstoneKey() || RHS == getTombstoneKey())
3644  return LHS == RHS;
3645  return LHS->isIdenticalTo(RHS);
3646  }
3647 };
3648 }
3649 
3650 ///\brief Perform cse of induction variable instructions.
3651 static void cse(BasicBlock *BB) {
3652  // Perform simple cse.
3653  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3654  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3655  Instruction *In = &*I++;
3656 
3657  if (!CSEDenseMapInfo::canHandle(In))
3658  continue;
3659 
3660  // Check if we can replace this instruction with any of the
3661  // visited instructions.
3662  if (Instruction *V = CSEMap.lookup(In)) {
3663  In->replaceAllUsesWith(V);
3664  In->eraseFromParent();
3665  continue;
3666  }
3667 
3668  CSEMap[In] = In;
3669  }
3670 }
3671 
3672 /// \brief Estimate the overhead of scalarizing an instruction. This is a
3673 /// convenience wrapper for the type-based getScalarizationOverhead API.
3674 static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
3675  const TargetTransformInfo &TTI) {
3676  if (VF == 1)
3677  return 0;
3678 
3679  unsigned Cost = 0;
3680  Type *RetTy = ToVectorTy(I->getType(), VF);
3681  if (!RetTy->isVoidTy() &&
3682  (!isa<LoadInst>(I) ||
3683  !TTI.supportsEfficientVectorElementLoadStore()))
3684  Cost += TTI.getScalarizationOverhead(RetTy, true, false);
3685 
3686  if (CallInst *CI = dyn_cast<CallInst>(I)) {
3687  SmallVector<const Value *, 4> Operands(CI->arg_operands());
3688  Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
3689  }
3690  else if (!isa<StoreInst>(I) ||
3691  !TTI.supportsEfficientVectorElementLoadStore()) {
3692  SmallVector<const Value *, 4> Operands(I->operand_values());
3693  Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
3694  }
3695 
3696  return Cost;
3697 }
3698 
3699 // Estimate cost of a call instruction CI if it were vectorized with factor VF.
3700 // Return the cost of the instruction, including scalarization overhead if it's
3701 // needed. The flag NeedToScalarize shows if the call needs to be scalarized -
3702 // i.e. either vector version isn't available, or is too expensive.
3703 static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
3704  const TargetTransformInfo &TTI,
3705  const TargetLibraryInfo *TLI,
3706  bool &NeedToScalarize) {
3707  Function *F = CI->getCalledFunction();
3708  StringRef FnName = CI->getCalledFunction()->getName();
3709  Type *ScalarRetTy = CI->getType();
3710  SmallVector<Type *, 4> Tys, ScalarTys;
3711  for (auto &ArgOp : CI->arg_operands())
3712  ScalarTys.push_back(ArgOp->getType());
3713 
3714  // Estimate cost of scalarized vector call. The source operands are assumed
3715  // to be vectors, so we need to extract individual elements from there,
3716  // execute VF scalar calls, and then gather the result into the vector return
3717  // value.
3718  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3719  if (VF == 1)
3720  return ScalarCallCost;
3721 
3722  // Compute corresponding vector type for return value and arguments.
3723  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3724  for (Type *ScalarTy : ScalarTys)
3725  Tys.push_back(ToVectorTy(ScalarTy, VF));
3726 
3727  // Compute costs of unpacking argument values for the scalar calls and
3728  // packing the return values to a vector.
3729  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
3730 
3731  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3732 
3733  // If we can't emit a vector call for this function, then the currently found
3734  // cost is the cost we need to return.
3735  NeedToScalarize = true;
3736  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3737  return Cost;
3738 
3739  // If the corresponding vector cost is cheaper, return its cost.
3740  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3741  if (VectorCallCost < Cost) {
3742  NeedToScalarize = false;
3743  return VectorCallCost;
3744  }
3745  return Cost;
3746 }
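// Editorial cost example (assumed numbers, not from the original source): with
// VF = 4, a scalar call cost of 10 and a scalarization overhead of 6, the
// scalarized estimate is 10 * 4 + 6 = 46. If a vectorized library version of
// the callee costs 20, it is cheaper, so NeedToScalarize is cleared and 20 is
// returned.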
3747 
3748 // Estimate cost of an intrinsic call instruction CI if it were vectorized with
3749 // factor VF. Return the cost of the instruction, including scalarization
3750 // overhead if it's needed.
3751 static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
3752  const TargetTransformInfo &TTI,
3753  const TargetLibraryInfo *TLI) {
3754  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3755  assert(ID && "Expected intrinsic call!");
3756 
3757  FastMathFlags FMF;
3758  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3759  FMF = FPMO->getFastMathFlags();
3760 
3761  SmallVector<Value *, 4> Operands(CI->arg_operands());
3762  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3763 }
3764 
3765 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3766  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3767  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3768  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3769 }
3770 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3771  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3772  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3773  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3774 }
3775 
3776 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3777  // For every instruction `I` in MinBWs, truncate the operands, create a
3778  // truncated version of `I` and reextend its result. InstCombine runs
3779  // later and will remove any ext/trunc pairs.
3780  //
3781  SmallPtrSet<Value *, 4> Erased;
3782  for (const auto &KV : Cost->getMinimalBitwidths()) {
3783  // If the value wasn't vectorized, we must maintain the original scalar
3784  // type. The absence of the value from VectorLoopValueMap indicates that it
3785  // wasn't vectorized.
3786  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3787  continue;
3788  for (unsigned Part = 0; Part < UF; ++Part) {
3789  Value *I = getOrCreateVectorValue(KV.first, Part);
3790  if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3791  continue;
3792  Type *OriginalTy = I->getType();
3793  Type *ScalarTruncatedTy =
3794  IntegerType::get(OriginalTy->getContext(), KV.second);
3795  Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3796  OriginalTy->getVectorNumElements());
3797  if (TruncatedTy == OriginalTy)
3798  continue;
3799 
3800  IRBuilder<> B(cast<Instruction>(I));
3801  auto ShrinkOperand = [&](Value *V) -> Value * {
3802  if (auto *ZI = dyn_cast<ZExtInst>(V))
3803  if (ZI->getSrcTy() == TruncatedTy)
3804  return ZI->getOperand(0);
3805  return B.CreateZExtOrTrunc(V, TruncatedTy);
3806  };
3807 
3808  // The actual instruction modification depends on the instruction type,
3809  // unfortunately.
3810  Value *NewI = nullptr;
3811  if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3812  NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3813  ShrinkOperand(BO->getOperand(1)));
3814 
3815  // Any wrapping introduced by shrinking this operation shouldn't be
3816  // considered undefined behavior. So, we can't unconditionally copy
3817  // arithmetic wrapping flags to NewI.
3818  cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3819  } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3820  NewI =
3821  B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3822  ShrinkOperand(CI->getOperand(1)));
3823  } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3824  NewI = B.CreateSelect(SI->getCondition(),
3825  ShrinkOperand(SI->getTrueValue()),
3826  ShrinkOperand(SI->getFalseValue()));
3827  } else if (auto *CI = dyn_cast<CastInst>(I)) {
3828  switch (CI->getOpcode()) {
3829  default:
3830  llvm_unreachable("Unhandled cast!");
3831  case Instruction::Trunc:
3832  NewI = ShrinkOperand(CI->getOperand(0));
3833  break;
3834  case Instruction::SExt:
3835  NewI = B.CreateSExtOrTrunc(
3836  CI->getOperand(0),
3837  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3838  break;
3839  case Instruction::ZExt:
3840  NewI = B.CreateZExtOrTrunc(
3841  CI->getOperand(0),
3842  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3843  break;
3844  }
3845  } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3846  auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3847  auto *O0 = B.CreateZExtOrTrunc(
3848  SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3849  auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3850  auto *O1 = B.CreateZExtOrTrunc(
3851  SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3852 
3853  NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3854  } else if (isa<LoadInst>(I)) {
3855  // Don't do anything with the operands, just extend the result.
3856  continue;
3857  } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3858  auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3859  auto *O0 = B.CreateZExtOrTrunc(
3860  IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3861  auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3862  NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3863  } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3864  auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3865  auto *O0 = B.CreateZExtOrTrunc(
3866  EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3867  NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3868  } else {
3869  llvm_unreachable("Unhandled instruction type!");
3870  }
3871 
3872  // Lastly, extend the result.
3873  NewI->takeName(cast<Instruction>(I));
3874  Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3875  I->replaceAllUsesWith(Res);
3876  cast<Instruction>(I)->eraseFromParent();
3877  Erased.insert(I);
3878  VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3879  }
3880  }
3881 
3882  // We'll have created a bunch of ZExts that are now parentless. Clean up.
3883  for (const auto &KV : Cost->getMinimalBitwidths()) {
3884  // If the value wasn't vectorized, we must maintain the original scalar
3885  // type. The absence of the value from VectorLoopValueMap indicates that it
3886  // wasn't vectorized.
3887  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3888  continue;
3889  for (unsigned Part = 0; Part < UF; ++Part) {
3890  Value *I = getOrCreateVectorValue(KV.first, Part);
3891  ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3892  if (Inst && Inst->use_empty()) {
3893  Value *NewI = Inst->getOperand(0);
3894  Inst->eraseFromParent();
3895  VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3896  }
3897  }
3898  }
3899 }
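// Editorial sketch (assumed example, not from the original source): if the
// cost model proved an i32 add only ever needs 8 bits, a widened
// '%a = add <4 x i32> %x, %y' becomes
//   %x.tr = trunc <4 x i32> %x to <4 x i8>
//   %y.tr = trunc <4 x i32> %y to <4 x i8>
//   %a.tr = add <4 x i8> %x.tr, %y.tr
//   %a.ext = zext <4 x i8> %a.tr to <4 x i32>
// and InstCombine later removes any redundant ext/trunc pairs.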
3900 
3901 void InnerLoopVectorizer::fixVectorizedLoop() {
3902  // Insert truncates and extends for any truncated instructions as hints to
3903  // InstCombine.
3904  if (VF > 1)
3905  truncateToMinimalBitwidths();
3906 
3907  // At this point every instruction in the original loop is widened to a
3908  // vector form. Now we need to fix the recurrences in the loop. These PHI
3909  // nodes are currently empty because we did not want to introduce cycles.
3910  // This is the second stage of vectorizing recurrences.
3911  fixCrossIterationPHIs();
3912 
3913  // Update the dominator tree.
3914  //
3915  // FIXME: After creating the structure of the new loop, the dominator tree is
3916  // no longer up-to-date, and it remains that way until we update it
3917  // here. An out-of-date dominator tree is problematic for SCEV,
3918  // because SCEVExpander uses it to guide code generation. The
3919  // vectorizer uses SCEVExpander in several places. Instead, we should
3920  // keep the dominator tree up-to-date as we go.
3921  updateAnalysis();
3922 
3923  // Fix-up external users of the induction variables.
3924  for (auto &Entry : *Legal->getInductionVars())
3925  fixupIVUsers(Entry.first, Entry.second,
3926  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3927  IVEndValues[Entry.first], LoopMiddleBlock);
3928 
3929  fixLCSSAPHIs();
3930  predicateInstructions();
3931 
3932  // Remove redundant induction instructions.
3933  cse(LoopVectorBody);
3934 }
3935 
3936 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3937  // In order to support recurrences we need to be able to vectorize Phi nodes.
3938  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3939  // stage #2: We now need to fix the recurrences by adding incoming edges to
3940  // the currently empty PHI nodes. At this point every instruction in the
3941  // original loop is widened to a vector form so we can use them to construct
3942  // the incoming edges.
3943  for (Instruction &I : *OrigLoop->getHeader()) {
3944  PHINode *Phi = dyn_cast<PHINode>(&I);
3945  if (!Phi)
3946  break;
3947  // Handle first-order recurrences and reductions that need to be fixed.
3948  if (Legal->isFirstOrderRecurrence(Phi))
3949  fixFirstOrderRecurrence(Phi);
3950  else if (Legal->isReductionVariable(Phi))
3951  fixReduction(Phi);
3952  }
3953 }
3954 
3955 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3956 
3957  // This is the second phase of vectorizing first-order recurrences. An
3958  // overview of the transformation is described below. Suppose we have the
3959  // following loop.
3960  //
3961  // for (int i = 0; i < n; ++i)
3962  // b[i] = a[i] - a[i - 1];
3963  //
3964  // There is a first-order recurrence on "a". For this loop, the shorthand
3965  // scalar IR looks like:
3966  //
3967  // scalar.ph:
3968  // s_init = a[-1]
3969  // br scalar.body
3970  //
3971  // scalar.body:
3972  // i = phi [0, scalar.ph], [i+1, scalar.body]
3973  // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3974  // s2 = a[i]
3975  // b[i] = s2 - s1
3976  // br cond, scalar.body, ...
3977  //
3978  // In this example, s1 is a recurrence because its value depends on the
3979  // previous iteration. In the first phase of vectorization, we created a
3980  // temporary value for s1. We now complete the vectorization and produce the
3981  // shorthand vector IR shown below (for VF = 4, UF = 1).
3982  //
3983  // vector.ph:
3984  // v_init = vector(..., ..., ..., a[-1])
3985  // br vector.body
3986  //
3987  // vector.body
3988  // i = phi [0, vector.ph], [i+4, vector.body]
3989  // v1 = phi [v_init, vector.ph], [v2, vector.body]
3990  // v2 = a[i, i+1, i+2, i+3];
3991  // v3 = vector(v1(3), v2(0, 1, 2))
3992  // b[i, i+1, i+2, i+3] = v2 - v3
3993  // br cond, vector.body, middle.block
3994  //
3995  // middle.block:
3996  // x = v2(3)
3997  // br scalar.ph
3998  //
3999  // scalar.ph:
4000  // s_init = phi [x, middle.block], [a[-1], otherwise]
4001  // br scalar.body
4002  //
4003  // After execution completes the vector loop, we extract the next value of
4004  // the recurrence (x) to use as the initial value in the scalar loop.
4005 
4006  // Get the original loop preheader and single loop latch.
4007  auto *Preheader = OrigLoop->getLoopPreheader();
4008  auto *Latch = OrigLoop->getLoopLatch();
4009 
4010  // Get the initial and previous values of the scalar recurrence.
4011  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4012  auto *Previous = Phi->getIncomingValueForBlock(Latch);
4013 
4014  // Create a vector from the initial value.
4015  auto *VectorInit = ScalarInit;
4016  if (VF > 1) {
4017  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4018  VectorInit = Builder.CreateInsertElement(
4019  UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
4020  Builder.getInt32(VF - 1), "vector.recur.init");
4021  }
4022 
4023  // We constructed a temporary phi node in the first phase of vectorization.
4024  // This phi node will eventually be deleted.
4025  Builder.SetInsertPoint(
4026  cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
4027 
4028  // Create a phi node for the new recurrence. The current value will either be
4029  // the initial value inserted into a vector or loop-varying vector value.
4030  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4031  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4032 
4033  // Get the vectorized previous value of the last part UF - 1. It appears last
4034  // among all unrolled iterations, due to the order of their construction.
4035  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
4036 
4037  // Set the insertion point after the previous value if it is an instruction.
4038  // Note that the previous value may have been constant-folded so it is not
4039  // guaranteed to be an instruction in the vector loop. Also, if the previous
4040  // value is a phi node, we should insert after all the phi nodes to avoid
4041  // breaking basic block verification.
4042  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
4043  isa<PHINode>(PreviousLastPart))
4044  Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
4045  else
4046  Builder.SetInsertPoint(
4047  &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
4048 
4049  // We will construct a vector for the recurrence by combining the values for
4050  // the current and previous iterations. This is the required shuffle mask.
4051  SmallVector<Constant *, 8> ShuffleMask(VF);
4052  ShuffleMask[0] = Builder.getInt32(VF - 1);
4053  for (unsigned I = 1; I < VF; ++I)
4054  ShuffleMask[I] = Builder.getInt32(I + VF - 1);
4055 
4056  // The vector from which to take the initial value for the current iteration
4057  // (actual or unrolled). Initially, this is the vector phi node.
4058  Value *Incoming = VecPhi;
4059 
4060  // Shuffle the current and previous vector and update the vector parts.
4061  for (unsigned Part = 0; Part < UF; ++Part) {
4062  Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
4063  Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
4064  auto *Shuffle =
4065  VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
4066  ConstantVector::get(ShuffleMask))
4067  : Incoming;
4068  PhiPart->replaceAllUsesWith(Shuffle);
4069  cast<Instruction>(PhiPart)->eraseFromParent();
4070  VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
4071  Incoming = PreviousPart;
4072  }
4073 
4074  // Fix the latch value of the new recurrence in the vector loop.
4075  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4076 
4077  // Extract the last vector element in the middle block. This will be the
4078  // initial value for the recurrence when jumping to the scalar loop.
4079  auto *ExtractForScalar = Incoming;
4080  if (VF > 1) {
4081  Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4082  ExtractForScalar = Builder.CreateExtractElement(
4083  ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
4084  }
4085  // Extract the second last element in the middle block if the
4086  // Phi is used outside the loop. We need to extract the phi itself
4087  // and not the last element (the phi update in the current iteration). This
4088  // will be the value when jumping to the exit block from the LoopMiddleBlock,
4089  // when the scalar loop is not run at all.
4090  Value *ExtractForPhiUsedOutsideLoop = nullptr;
4091  if (VF > 1)
4092  ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4093  Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
4094  // When the loop is unrolled without vectorizing, initialize
4095  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value of
4096  // `Incoming`. This is analogous to the vectorized case above: extracting the
4097  // second last element when VF > 1.
4098  else if (UF > 1)
4099  ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
4100 
4101  // Fix the initial value of the original recurrence in the scalar loop.
4102  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4103  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4104  for (auto *BB : predecessors(LoopScalarPreHeader)) {
4105  auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4106  Start->addIncoming(Incoming, BB);
4107  }
4108 
4109  Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start);
4110  Phi->setName("scalar.recur");
4111 
4112  // Finally, fix users of the recurrence outside the loop. The users will need
4113  // either the last value of the scalar recurrence or the last value of the
4114  // vector recurrence we extracted in the middle block. Since the loop is in
4115  // LCSSA form, we just need to find the phi node for the original scalar
4116  // recurrence in the exit block, and then add an edge for the middle block.
4117  for (auto &I : *LoopExitBlock) {
4118  auto *LCSSAPhi = dyn_cast<PHINode>(&I);
4119  if (!LCSSAPhi)
4120  break;
4121  if (LCSSAPhi->getIncomingValue(0) == Phi) {
4122  LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4123  break;
4124  }
4125  }
4126 }
4127 
4128 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
4129  Constant *Zero = Builder.getInt32(0);
4130 
4131  // Get its reduction variable descriptor.
4132  assert(Legal->isReductionVariable(Phi) &&
4133  "Unable to find the reduction variable");
4134  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
4135 
4136  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4137  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4138  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4139  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
4140  RdxDesc.getMinMaxRecurrenceKind();
4141  setDebugLocFromInst(Builder, ReductionStartValue);
4142 
4143  // We need to generate a reduction vector from the incoming scalar.
4144  // To do so, we need to generate the 'identity' vector and override
4145  // one of the elements with the incoming scalar reduction. We need
4146  // to do it in the vector-loop preheader.
4147  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4148 
4149  // This is the vector-clone of the value that leaves the loop.
4150  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
4151 
4152  // Find the reduction identity variable: zero for addition, or and xor;
4153  // one for multiplication; -1 (all ones) for and.
4154  Value *Identity;
4155  Value *VectorStart;
4156  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
4157  RK == RecurrenceDescriptor::RK_FloatMinMax) {
4158  // MinMax reductions have the start value as their identity.
4159  if (VF == 1) {
4160  VectorStart = Identity = ReductionStartValue;
4161  } else {
4162  VectorStart = Identity =
4163  Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
4164  }
4165  } else {
4166  // Handle other reduction kinds:
4167  Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
4168  RK, VecTy->getScalarType());
4169  if (VF == 1) {
4170  Identity = Iden;
4171  // This vector is the Identity vector where the first element is the
4172  // incoming scalar reduction.
4173  VectorStart = ReductionStartValue;
4174  } else {
4175  Identity = ConstantVector::getSplat(VF, Iden);
4176 
4177  // This vector is the Identity vector where the first element is the
4178  // incoming scalar reduction.
4179  VectorStart =
4180  Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
4181  }
4182  }
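  // For example, for an integer add reduction with VF = 4, Identity is
  // <0, 0, 0, 0> and VectorStart is <StartVal, 0, 0, 0>, so the scalar start
  // value enters the reduction exactly once.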
4183 
4184  // Fix the vector-loop phi.
4185 
4186  // Reductions do not have to start at zero. They can start with
4187  // any loop invariant values.
4188  BasicBlock *Latch = OrigLoop->getLoopLatch();
4189  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4190  for (unsigned Part = 0; Part < UF; ++Part) {
4191  Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
4192  Value *Val = getOrCreateVectorValue(LoopVal, Part);
4193  // Make sure to add the reduction start value only to the
4194  // first unroll part.
4195  Value *StartVal = (Part == 0) ? VectorStart : Identity;
4196  cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
4197  cast<PHINode>(VecRdxPhi)
4198  ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4199  }
4200 
4201  // Before each round, move the insertion point right between
4202  // the PHIs and the values we are going to write.
4203  // This allows us to write both PHINodes and the extractelement
4204  // instructions.
4205  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4206 
4207  setDebugLocFromInst(Builder, LoopExitInst);
4208 
4209  // If the vector reduction can be performed in a smaller type, we truncate
4210  // then extend the loop exit value to enable InstCombine to evaluate the
4211  // entire expression in the smaller type.
4212  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
4213  Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4214  Builder.SetInsertPoint(LoopVectorBody->getTerminator());
4215  VectorParts RdxParts(UF);
4216  for (unsigned Part = 0; Part < UF; ++Part) {
4217  RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4218  Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4219  Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4220  : Builder.CreateZExt(Trunc, VecTy);
4221  for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4222  UI != RdxParts[Part]->user_end();)
4223  if (*UI != Trunc) {
4224  (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4225  RdxParts[Part] = Extnd;
4226  } else {
4227  ++UI;
4228  }
4229  }
4230  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4231  for (unsigned Part = 0; Part < UF; ++Part) {
4232  RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4233  VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4234  }
4235  }
4236 
4237  // Reduce all of the unrolled parts into a single vector.
4238  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4239  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4240  setDebugLocFromInst(Builder, ReducedPartRdx);
4241  for (unsigned Part = 1; Part < UF; ++Part) {
4242  Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4243  if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4244  // Floating point operations had to be 'fast' to enable the reduction.
4245  ReducedPartRdx = addFastMathFlag(
4246  Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4247  ReducedPartRdx, "bin.rdx"));
4248  else
4249  ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
4250  Builder, MinMaxKind, ReducedPartRdx, RdxPart);
4251  }
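  // For instance, with UF = 2 and an add reduction, the two unrolled part
  // vectors are added element-wise here ('bin.rdx'); the horizontal reduction
  // of the single remaining vector happens below when VF > 1.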
4252 
4253  if (VF > 1) {
4254  bool NoNaN = Legal->hasFunNoNaNAttr();
4255  ReducedPartRdx =
4256  createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4257  // If the reduction can be performed in a smaller type, we need to extend
4258  // the reduction to the wider type before we branch to the original loop.
4259  if (Phi->getType() != RdxDesc.getRecurrenceType())
4260  ReducedPartRdx =
4261  RdxDesc.isSigned()
4262  ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4263  : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4264  }
4265 
4266  // Create a phi node that merges control-flow from the backedge-taken check
4267  // block and the middle block.
4268  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4269  LoopScalarPreHeader->getTerminator());
4270  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4271  BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4272  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4273 
4274  // Now, we need to fix the users of the reduction variable
4275  // inside and outside of the scalar remainder loop.
4276  // We know that the loop is in LCSSA form. We need to update the
4277  // PHI nodes in the exit blocks.
4278  for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
4279  LEE = LoopExitBlock->end();
4280  LEI != LEE; ++LEI) {
4281  PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
4282  if (!LCSSAPhi)
4283  break;
4284 
4285  // All PHINodes need to have a single entry edge, or two if
4286  // we already fixed them.
4287  assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4288 
4289  // We found a reduction value exit-PHI. Update it with the
4290  // incoming bypass edge.
4291  if (LCSSAPhi->getIncomingValue(0) == LoopExitInst)
4292  LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4293  } // end of the LCSSA phi scan.
4294 
4295  // Fix the scalar loop reduction variable with the incoming reduction sum
4296  // from the vector body and from the backedge value.
4297  int IncomingEdgeBlockIdx =
4298  Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4299  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4300  // Pick the other block.
4301  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4302  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4303  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4304 }
4305 
4306 void InnerLoopVectorizer::fixLCSSAPHIs() {
4307  for (Instruction &LEI : *LoopExitBlock) {
4308  auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
4309  if (!LCSSAPhi)
4310  break;
4311  if (LCSSAPhi->getNumIncomingValues() == 1) {
4312  assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) &&
4313  "Incoming value isn't loop invariant");
4314  LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock);
4315  }
4316  }
4317 }
4318 
4319 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4320 
4321  // The basic block and loop containing the predicated instruction.
4322  auto *PredBB = PredInst->getParent();
4323  auto *VectorLoop = LI->getLoopFor(PredBB);
4324 
4325  // Initialize a worklist with the operands of the predicated instruction.
4326  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4327 
4328  // Holds instructions that we need to analyze again. An instruction may be
4329  // reanalyzed if we don't yet know if we can sink it or not.
4330  SmallVector<Instruction *, 8> InstsToReanalyze;
4331 
4332  // Returns true if a given use occurs in the predicated block. Phi nodes use
4333  // their operands in their corresponding predecessor blocks.
4334  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4335  auto *I = cast<Instruction>(U.getUser());
4336  BasicBlock *BB = I->getParent();
4337  if (auto *Phi = dyn_cast<PHINode>(I))
4338  BB = Phi->getIncomingBlock(
4339  PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4340  return BB == PredBB;
4341  };
4342 
4343  // Iteratively sink the scalarized operands of the predicated instruction
4344  // into the block we created for it. When an instruction is sunk, its
4345  // operands are then added to the worklist. The algorithm ends once a full
4346  // pass over the worklist fails to sink any instruction.
4347  bool Changed;
4348  do {
4349 
4350  // Add the instructions that need to be reanalyzed to the worklist, and
4351  // reset the changed indicator.
4352  Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4353  InstsToReanalyze.clear();
4354  Changed = false;
4355 
4356  while (!Worklist.empty()) {
4357  auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4358 
4359  // We can't sink an instruction if it is a phi node, is already in the
4360  // predicated block, is not in the loop, or may have side effects.
4361  if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4362  !VectorLoop->contains(I) || I->mayHaveSideEffects())
4363  continue;
4364 
4365  // It's legal to sink the instruction if all its uses occur in the
4366  // predicated block. Otherwise, there's nothing to do yet, and we may
4367  // need to reanalyze the instruction.
4368  if (!all_of(I->uses(), isBlockOfUsePredicated)) {
4369  InstsToReanalyze.push_back(I);
4370  continue;
4371  }
4372 
4373  // Move the instruction to the beginning of the predicated block, and add
4374  // its operands to the worklist.
4375  I->moveBefore(&*PredBB->getFirstInsertionPt());
4376  Worklist.insert(I->op_begin(), I->op_end());
4377 
4378  // The sinking may have enabled other instructions to be sunk, so we will
4379  // need to iterate.
4380  Changed = true;
4381  }
4382  } while (Changed);
4383 }
4384 
4385 void InnerLoopVectorizer::predicateInstructions() {
4386 
4387  // For each instruction I marked for predication on value C, split I into its
4388  // own basic block to form an if-then construct over C. Since I may be fed by
4389  // an extractelement instruction or other scalar operand, we try to
4390  // iteratively sink its scalar operands into the predicated block. If I feeds
4391  // an insertelement instruction, we try to move this instruction into the
4392  // predicated block as well. For non-void types, a phi node will be created
4393  // for the resulting value (either vector or scalar).
4394  //
4395  // So for some predicated instruction, e.g. the conditional sdiv in:
4396  //
4397  // for.body:
4398  // ...
4399  // %add = add nsw i32 %mul, %0
4400  // %cmp5 = icmp sgt i32 %2, 7
4401  // br i1 %cmp5, label %if.then, label %if.end
4402  //
4403  // if.then:
4404  // %div = sdiv i32 %0, %1
4405  // br label %if.end
4406  //
4407  // if.end:
4408  // %x.0 = phi i32 [ %div, %if.then ], [ %add, %for.body ]
4409  //
4410  // the sdiv at this point is scalarized and if-converted using a select.
4411  // The inactive elements in the vector are not used, but the predicated
4412  // instruction is still executed for all vector elements, essentially:
4413  //
4414  // vector.body:
4415  // ...
4416  // %17 = add nsw <2 x i32> %16, %wide.load
4417  // %29 = extractelement <2 x i32> %wide.load, i32 0
4418  // %30 = extractelement <2 x i32> %wide.load51, i32 0
4419  // %31 = sdiv i32 %29, %30
4420  // %32 = insertelement <2 x i32> undef, i32 %31, i32 0
4421  // %35 = extractelement <2 x i32> %wide.load, i32 1
4422  // %36 = extractelement <2 x i32> %wide.load51, i32 1
4423  // %37 = sdiv i32 %35, %36
4424  // %38 = insertelement <2 x i32> %32, i32 %37, i32 1
4425  // %predphi = select <2 x i1> %26, <2 x i32> %38, <2 x i32> %17
4426  //
4427  // Predication will now re-introduce the original control flow to avoid false
4428  // side-effects by the sdiv instructions on the inactive elements, yielding
4429  // (after cleanup):
4430  //
4431  // vector.body:
4432  // ...
4433  // %5 = add nsw <2 x i32> %4, %wide.load
4434  // %8 = icmp sgt <2 x i32> %wide.load52, <i32 7, i32 7>
4435  // %9 = extractelement <2 x i1> %8, i32 0
4436  // br i1 %9, label %pred.sdiv.if, label %pred.sdiv.continue
4437  //
4438  // pred.sdiv.if:
4439  // %10 = extractelement <2 x i32> %wide.load, i32 0
4440  // %11 = extractelement <2 x i32> %wide.load51, i32 0
4441  // %12 = sdiv i32 %10, %11
4442  // %13 = insertelement <2 x i32> undef, i32 %12, i32 0
4443  // br label %pred.sdiv.continue
4444  //
4445  // pred.sdiv.continue:
4446  // %14 = phi <2 x i32> [ undef, %vector.body ], [ %13, %pred.sdiv.if ]
4447  // %15 = extractelement <2 x i1> %8, i32 1
4448  // br i1 %15, label %pred.sdiv.if54, label %pred.sdiv.continue55
4449  //
4450  // pred.sdiv.if54:
4451  // %16 = extractelement <2 x i32> %wide.load, i32 1
4452  // %17 = extractelement <2 x i32> %wide.load51, i32 1
4453  // %18 = sdiv i32 %16, %17
4454  // %19 = insertelement <2 x i32> %14, i32 %18, i32 1
4455  // br label %pred.sdiv.continue55
4456  //
4457  // pred.sdiv.continue55:
4458  // %20 = phi <2 x i32> [ %14, %pred.sdiv.continue ], [ %19, %pred.sdiv.if54 ]
4459  // %predphi = select <2 x i1> %8, <2 x i32> %20, <2 x i32> %5
4460 
4461  for (auto KV : PredicatedInstructions) {
4462  BasicBlock::iterator I(KV.first);
4463  BasicBlock *Head = I->getParent();
4464  auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
4465  /*BranchWeights=*/nullptr, DT, LI);
4466  I->moveBefore(T);
4467  sinkScalarOperands(&*I);
4468 
4469  BasicBlock *PredicatedBlock = I->getParent();
4470  Twine BBNamePrefix = Twine("pred.") + I->getOpcodeName();
4471  PredicatedBlock->setName(BBNamePrefix + ".if");
4472  PredicatedBlock->getSingleSuccessor()->setName(BBNamePrefix + ".continue");
4473 
4474  // If the instruction is non-void create a Phi node at reconvergence point.
4475  if (!I->getType()->isVoidTy()) {
4476  Value *IncomingTrue = nullptr;
4477  Value *IncomingFalse = nullptr;
4478 
4479  if (I->hasOneUse() && isa<InsertElementInst>(*I->user_begin())) {
4480  // If the predicated instruction is feeding an insert-element, move it
4481  // into the Then block; Phi node will be created for the vector.
4482  InsertElementInst *IEI = cast<InsertElementInst>(*I->user_begin());
4483  IEI->moveBefore(T);
4484  IncomingTrue = IEI; // the new vector with the inserted element.
4485  IncomingFalse = IEI->getOperand(0); // the unmodified vector
4486  } else {
4487  // Phi node will be created for the scalar predicated instruction.
4488  IncomingTrue = &*I;
4489  IncomingFalse = UndefValue::get(I->getType());
4490  }
4491 
4492  BasicBlock *PostDom = I->getParent()->getSingleSuccessor();
4493  assert(PostDom && "Then block has multiple successors");
4494  PHINode *Phi =
4495  PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front());
4496  IncomingTrue->replaceAllUsesWith(Phi);
4497  Phi->addIncoming(IncomingFalse, Head);
4498  Phi->addIncoming(IncomingTrue, I->getParent());
4499  }
4500  }
4501 
4502  DEBUG(DT->verifyDomTree());
4503 }
4504 
4505 InnerLoopVectorizer::VectorParts
4506 InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
4507  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
4508 
4509  // Look for cached value.
4510  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
4511  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
4512  if (ECEntryIt != EdgeMaskCache.end())
4513  return ECEntryIt->second;
4514 
4515  VectorParts SrcMask = createBlockInMask(Src);
4516 
4517  // The terminator has to be a branch inst!
4518  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
4519  assert(BI && "Unexpected terminator found");
4520 
4521  if (BI->isConditional()) {
4522 
4523  VectorParts EdgeMask(UF);
4524  for (unsigned Part = 0; Part < UF; ++Part) {
4525  auto *EdgeMaskPart = getOrCreateVectorValue(BI->getCondition(), Part);
4526  if (BI->getSuccessor(0) != Dst)
4527  EdgeMaskPart = Builder.CreateNot(EdgeMaskPart);
4528 
4529  EdgeMaskPart = Builder.CreateAnd(EdgeMaskPart, SrcMask[Part]);
4530  EdgeMask[Part] = EdgeMaskPart;
4531  }
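  // In other words, per unroll part the mask of edge Src->Dst is
  // SrcMask & Cond when Dst is the 'true' successor of Src, and
  // SrcMask & ~Cond otherwise.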
4532 
4533  EdgeMaskCache[Edge] = EdgeMask;
4534  return EdgeMask;
4535  }
4536 
4537  EdgeMaskCache[Edge] = SrcMask;
4538  return SrcMask;
4539 }
4540 
4541 InnerLoopVectorizer::VectorParts
4542 InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
4543  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
4544 
4545  // Look for cached value.
4546  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
4547  if (BCEntryIt != BlockMaskCache.end())
4548  return BCEntryIt->second;
4549 
4550  VectorParts BlockMask(UF);
4551 
4552  // Loop incoming mask is all-one.
4553  if (OrigLoop->getHeader() == BB) {
4554  Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
4555  for (unsigned Part = 0; Part < UF; ++Part)
4556  BlockMask[Part] = getOrCreateVectorValue(C, Part);
4557  BlockMaskCache[BB] = BlockMask;
4558  return BlockMask;
4559  }
4560 
4561  // This is the block mask. We OR all incoming edges together, starting with zero.
4562  Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
4563  for (unsigned Part = 0; Part < UF; ++Part)
4564  BlockMask[Part] = getOrCreateVectorValue(Zero, Part);
4565 
4566  // For each pred:
4567  for (pred_iterator It = pred_begin(BB), E = pred_end(BB); It != E; ++It) {
4568  VectorParts EM = createEdgeMask(*It, BB);
4569  for (unsigned Part = 0; Part < UF; ++Part)
4570  BlockMask[Part] = Builder.CreateOr(BlockMask[Part], EM[Part]);
4571  }
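  // For example, a block with predecessors P1 and P2 ends up with the mask
  // createEdgeMask(P1, BB) | createEdgeMask(P2, BB) in each unroll part.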
4572 
4573  BlockMaskCache[BB] = BlockMask;
4574  return BlockMask;
4575 }
4576 
4577 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4578  unsigned VF) {
4579  PHINode *P = cast<PHINode>(PN);
4580  // In order to support recurrences we need to be able to vectorize Phi nodes.
4581  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4582  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4583  // this value when we vectorize all of the instructions that use the PHI.
4584  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4585  for (unsigned Part = 0; Part < UF; ++Part) {
4586  // This is phase one of vectorizing PHIs.
4587  Type *VecTy =
4588  (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4589  Value *EntryPart = PHINode::Create(
4590  VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4591  VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4592  }
4593  return;
4594  }
4595 
4596  setDebugLocFromInst(Builder, P);
4597  // Check for PHI nodes that are lowered to vector selects.
4598  if (P->getParent() != OrigLoop->getHeader()) {
4599  // We know that all PHIs in non-header blocks are converted into
4600  // selects, so we don't have to worry about the insertion order and we
4601  // can just use the builder.
4602  // At this point we generate the predication tree. There may be
4603  // duplications since this is a simple recursive scan, but future
4604  // optimizations will clean it up.
4605 
4606  unsigned NumIncoming = P->getNumIncomingValues();
4607 
4608  // Generate a sequence of selects of the form:
4609  // SELECT(Mask3, In3,
4610  // SELECT(Mask2, In2,
4611  // ( ...)))
4612  VectorParts Entry(UF);
4613  for (unsigned In = 0; In < NumIncoming; In++) {
4614  VectorParts Cond =
4615  createEdgeMask(P->getIncomingBlock(In), P->getParent());
4616 
4617  for (unsigned Part = 0; Part < UF; ++Part) {
4618  Value *In0 = getOrCreateVectorValue(P->getIncomingValue(In), Part);
4619  // We might have single edge PHIs (blocks) - use an identity
4620  // 'select' for the first PHI operand.
4621  if (In == 0)
4622  Entry[Part] = Builder.CreateSelect(Cond[Part], In0, In0);
4623  else
4624  // Select between the current value and the previous incoming edge
4625  // based on the incoming mask.
4626  Entry[Part] = Builder.CreateSelect(Cond[Part], In0, Entry[Part],
4627  "predphi");
4628  }
4629  }
4630  for (unsigned Part = 0; Part < UF; ++Part)
4631  VectorLoopValueMap.setVectorValue(P, Part, Entry[Part]);
4632  return;
4633  }
4634 
4635  // This PHINode must be an induction variable.
4636  // Make sure that we know about it.
4637  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
4638 
4639  InductionDescriptor II = Legal->getInductionVars()->lookup(P);
4640  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4641 
4642  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4643  // which can be found from the original scalar operations.
4644  switch (II.getKind()) {
4645  case InductionDescriptor::IK_NoInduction:
4646  llvm_unreachable("Unknown induction");
4647  case InductionDescriptor::IK_IntInduction:
4648  case InductionDescriptor::IK_FpInduction:
4649  return widenIntOrFpInduction(P);
4650  case InductionDescriptor::IK_PtrInduction: {
4651  // Handle the pointer induction variable case.
4652  assert(P->getType()->isPointerTy() && "Unexpected type.");
4653  // This is the normalized GEP that starts counting at zero.
4654  Value *PtrInd = Induction;
4655  PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4656  // Determine the number of scalars we need to generate for each unroll
4657  // iteration. If the instruction is uniform, we only need to generate the
4658  // first lane. Otherwise, we generate all VF values.
4659  unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4660  // These are the scalar results. Notice that we don't generate vector GEPs
4661  // because scalar GEPs result in better code.
4662  for (unsigned Part = 0; Part < UF; ++Part) {
4663  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4664  Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4665  Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4666  Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
4667  SclrGep->setName("next.gep");
4668  VectorLoopValueMap.setScalarValue(P, Part, Lane, SclrGep);
4669  }
4670  }
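  // For example, with VF = 4, UF = 2 and a non-uniform pointer, lanes and
  // parts together yield the offsets PtrInd + 0 .. PtrInd + 7, each turned
  // into its own scalar "next.gep".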
4671  return;
4672  }
4673  }
4674 }
4675 
4676 /// A helper function for checking whether an integer division-related
4677 /// instruction may divide by zero (in which case it must be predicated if
4678 /// executed conditionally in the scalar code).
4679 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4680 /// Non-zero divisors that are not compile-time constants will not be
4681 /// converted into multiplications, so we will still end up scalarizing
4682 /// the division, but can do so without predication.
4683 static bool mayDivideByZero(Instruction &I) {
4684  assert((I.getOpcode() == Instruction::UDiv ||
4685  I.getOpcode() == Instruction::SDiv ||
4686  I.getOpcode() == Instruction::URem ||
4687  I.getOpcode() == Instruction::SRem) &&
4688  "Unexpected instruction");
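  // For example, 'udiv %x, 7' cannot trap and needs no predication, whereas
  // 'udiv %x, %y' (or a literal zero divisor) may trap and must be predicated
  // when executed conditionally.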
4689  Value *Divisor = I.getOperand(1);
4690  auto *CInt = dyn_cast<ConstantInt>(Divisor);
4691  return !CInt || CInt->isZero();
4692 }
4693 
4694 void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
4695  // Scalarize instructions that should remain scalar after vectorization.
4696  if (VF > 1 &&
4697  !(isa<BranchInst>(&I) || isa<PHINode>(&I) || isa<DbgInfoIntrinsic>(&I)) &&
4698  shouldScalarizeInstruction(&I)) {
4699  scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
4700  return;
4701  }
4702 
4703  switch (I.getOpcode()) {
4704  case Instruction::Br:
4705  // Nothing to do for PHIs and BR, since we already took care of the
4706  // loop control flow instructions.
4707  break;
4708  case Instruction::PHI: {
4709  // Vectorize PHINodes.
4710  widenPHIInstruction(&I, UF, VF);
4711  break;
4712  } // End of PHI.
4713  case Instruction::GetElementPtr: {
4714  // Construct a vector GEP by widening the operands of the scalar GEP as
4715  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4716  // results in a vector of pointers when at least one operand of the GEP
4717  // is vector-typed. Thus, to keep the representation compact, we only use
4718  // vector-typed operands for loop-varying values.
4719  auto *GEP = cast<GetElementPtrInst>(&I);
4720 
4721  if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
4722  // If we are vectorizing, but the GEP has only loop-invariant operands,
4723  // the GEP we build (by only using vector-typed operands for
4724  // loop-varying values) would be a scalar pointer. Thus, to ensure we
4725  // produce a vector of pointers, we need to either arbitrarily pick an
4726  // operand to broadcast, or broadcast a clone of the original GEP.
4727  // Here, we broadcast a clone of the original.
4728  //
4729  // TODO: If at some point we decide to scalarize instructions having
4730  // loop-invariant operands, this special case will no longer be
4731  // required. We would add the scalarization decision to
4732  // collectLoopScalars() and teach getVectorValue() to broadcast
4733  // the lane-zero scalar value.
4734  auto *Clone = Builder.Insert(GEP->clone());
4735  for (unsigned Part = 0; Part < UF; ++Part) {
4736  Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4737  VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
4738  addMetadata(EntryPart, GEP);
4739  }
4740  } else {
4741  // If the GEP has at least one loop-varying operand, we are sure to
4742  // produce a vector of pointers. But if we are only unrolling, we want
4743  // to produce a scalar GEP for each unroll part. Thus, the GEP we
4744  // produce with the code below will be scalar (if VF == 1) or vector
4745  // (otherwise). Note that for the unroll-only case, we still maintain
4746  // values in the vector mapping with initVector, as we do for other
4747  // instructions.
4748  for (unsigned Part = 0; Part < UF; ++Part) {
4749 
4750  // The pointer operand of the new GEP. If it's loop-invariant, we
4751  // won't broadcast it.
4752  auto *Ptr =
4753  OrigLoop->isLoopInvariant(GEP->getPointerOperand())
4754  ? GEP->getPointerOperand()
4755  : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4756 
4757  // Collect all the indices for the new GEP. If any index is
4758  // loop-invariant, we won't broadcast it.
4759  SmallVector<Value *, 4> Indices;
4760  for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
4761  if (OrigLoop->isLoopInvariant(U.get()))
4762  Indices.push_back(U.get());
4763  else
4764  Indices.push_back(getOrCreateVectorValue(U.get(), Part));
4765  }
4766 
4767  // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4768  // but it should be a vector, otherwise.
4769  auto *NewGEP = GEP->isInBounds()
4770  ? Builder.CreateInBoundsGEP(Ptr, Indices)
4771  : Builder.CreateGEP(Ptr, Indices);
4772  assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4773  "NewGEP is not a pointer vector");
4774  VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
4775  addMetadata(NewGEP, GEP);
4776  }
4777  }
4778 
4779  break;
4780  }
4781  case Instruction::UDiv:
4782  case Instruction::SDiv:
4783  case Instruction::SRem:
4784  case Instruction::URem:
4785  // Scalarize with predication if this instruction may divide by zero and
4786  // block execution is conditional, otherwise fallthrough.
4787  if (Legal->isScalarWithPredication(&I)) {
4788  scalarizeInstruction(&I, true);
4789  break;
4790  }
4791  LLVM_FALLTHROUGH;
4792  case Instruction::Add:
4793  case Instruction::FAdd:
4794  case Instruction::Sub:
4795  case Instruction::FSub:
4796  case Instruction::Mul:
4797  case Instruction::FMul:
4798  case Instruction::FDiv:
4799  case Instruction::FRem:
4800  case Instruction::Shl:
4801  case Instruction::LShr:
4802  case Instruction::AShr:
4803  case Instruction::And:
4804  case Instruction::Or:
4805  case Instruction::Xor: {
4806  // Just widen binops.
4807  auto *BinOp = cast<BinaryOperator>(&I);
4808  setDebugLocFromInst(Builder, BinOp);
4809 
4810  for (unsigned Part = 0; Part < UF; ++Part) {
4811  Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part);
4812  Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part);
4813  Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
4814 
4815  if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
4816  VecOp->copyIRFlags(BinOp);
4817 
4818  // Use this vector value for all users of the original instruction.
4819  VectorLoopValueMap.setVectorValue(&I, Part, V);
4820  addMetadata(V, BinOp);
4821  }
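  // For example, a scalar 'mul nsw i32 %a, %b' becomes one 'mul nsw <VF x i32>'
  // per unroll part, with the nsw/nuw and fast-math flags copied from the
  // original instruction above.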
4822 
4823  break;
4824  }
4825  case Instruction::Select: {
4826  // Widen selects.
4827  // If the selector is loop invariant we can create a select
4828  // instruction with a scalar condition. Otherwise, use vector-select.
4829  auto *SE = PSE.getSE();
4830  bool InvariantCond =
4831  SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4832  setDebugLocFromInst(Builder, &I);
4833 
4834  // The condition can be loop invariant but still defined inside the
4835  // loop. This means that we can't just use the original 'cond' value.
4836  // We have to take the 'vectorized' value and pick the first lane.
4837  // Instcombine will make this a no-op.
4838 
4839  auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), 0, 0);
4840 
4841  for (unsigned Part = 0; Part < UF; ++Part) {
4842  Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4843  Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4844  Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4845  Value *Sel =
4846  Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4847  VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4848  addMetadata(Sel, &I);
4849  }
4850 
4851  break;
4852  }
4853 
4854  case Instruction::ICmp:
4855  case Instruction::FCmp: {
4856  // Widen compares. Generate vector compares.
4857  bool FCmp = (I.getOpcode() == Instruction::FCmp);
4858  auto *Cmp = dyn_cast<CmpInst>(&I);
4859  setDebugLocFromInst(Builder, Cmp);
4860  for (unsigned Part = 0; Part < UF; ++Part) {
4861  Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4862  Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4863  Value *C = nullptr;
4864  if (FCmp) {
4865  C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4866  cast<FCmpInst>(C)->copyFastMathFlags(Cmp);
4867  } else {
4868  C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4869  }
4870  VectorLoopValueMap.setVectorValue(&I, Part, C);
4871  addMetadata(C, &I);
4872  }
4873 
4874  break;
4875  }
4876 
4877  case Instruction::Store:
4878  case Instruction::Load:
4879  vectorizeMemoryInstruction(&I);
4880  break;
4881  case Instruction::ZExt:
4882  case Instruction::SExt:
4883  case Instruction::FPToUI:
4884  case Instruction::FPToSI:
4885  case Instruction::FPExt:
4886  case Instruction::PtrToInt:
4887  case Instruction::IntToPtr:
4888  case Instruction::SIToFP:
4889  case Instruction::UIToFP:
4890  case Instruction::Trunc:
4891  case Instruction::FPTrunc:
4892  case Instruction::BitCast: {
4893  auto *CI = dyn_cast<CastInst>(&I);
4894  setDebugLocFromInst(Builder, CI);
4895 
4896  // Optimize the special case where the source is a constant integer
4897  // induction variable. Notice that we can only optimize the 'trunc' case
4898  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
4899  // (c) other casts depend on pointer size.
4900  if (Cost->isOptimizableIVTruncate(CI, VF)) {
4901  widenIntOrFpInduction(cast<PHINode>(CI->getOperand(0)),
4902  cast<TruncInst>(CI));
4903  break;
4904  }
4905 
4906  /// Vectorize casts.
4907  Type *DestTy =
4908  (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4909 
4910  for (unsigned Part = 0; Part < UF; ++Part) {
4911  Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4912  Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4913  VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4914  addMetadata(Cast, &I);
4915  }
4916  break;
4917  }
4918 
4919  case Instruction::Call: {
4920  // Ignore dbg intrinsics.
4921  if (isa<DbgInfoIntrinsic>(I))
4922  break;
4923  setDebugLocFromInst(Builder, &I);
4924 
4925  Module *M = I.getParent()->getParent()->getParent();
4926  auto *CI = cast<CallInst>(&I);
4927 
4928  StringRef FnName = CI->getCalledFunction()->getName();
4929  Function *F = CI->getCalledFunction();
4930  Type *RetTy = ToVectorTy(CI->getType(), VF);
4931  SmallVector<Type *, 4> Tys;
4932  for (Value *ArgOperand : CI->arg_operands())
4933  Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4934 
4935  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4936  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
4937  ID == Intrinsic::lifetime_start)) {
4938  scalarizeInstruction(&I);
4939  break;
4940  }
4941  // This flag shows whether we use an intrinsic or a plain call for the
4942  // vectorized version of the instruction, i.e. whether it is beneficial
4943  // to emit the intrinsic call rather than the library call.
4944  bool NeedToScalarize;
4945  unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
4946  bool UseVectorIntrinsic =
4947  ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
4948  if (!UseVectorIntrinsic && NeedToScalarize) {
4949  scalarizeInstruction(&I);
4950  break;
4951  }
4952 
4953  for (unsigned Part = 0; Part < UF; ++Part) {
4954  SmallVector<Value *, 4> Args;
4955  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4956  Value *Arg = CI->getArgOperand(i);
4957  // Some intrinsics have a scalar argument - don't replace it with a
4958  // vector.
4959  if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4960  Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4961  Args.push_back(Arg);
4962  }
4963 
4964  Function *VectorF;
4965  if (UseVectorIntrinsic) {
4966  // Use vector version of the intrinsic.
4967  Type *TysForDecl[] = {CI->getType()};
4968  if (VF > 1)
4969  TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4970  VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4971  } else {
4972  // Use vector version of the library call.
4973  StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4974  assert(!VFnName.empty() && "Vector function name is empty.");
4975  VectorF = M->getFunction(VFnName);
4976  if (!VectorF) {
4977  // Generate a declaration
4978  FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4979  VectorF =
4980  Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4981  VectorF->copyAttributesFrom(F);
4982  }
4983  }
4984  assert(VectorF && "Can't create vector function.");
4985 
4986  SmallVector<OperandBundleDef, 1> OpBundles;
4987  CI->getOperandBundlesAsDefs(OpBundles);
4988  CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4989 
4990  if (isa<FPMathOperator>(V))
4991  V->copyFastMathFlags(CI);
4992 
4993  VectorLoopValueMap.setVectorValue(&I, Part, V);
4994  addMetadata(V, &I);
4995  }
4996 
4997  break;
4998  }
4999 
5000  default:
5001  // All other instructions are unsupported. Scalarize them.
5002  scalarizeInstruction(&I);
5003  break;
5004  } // end of switch.
5005 }
5006 
5007 void InnerLoopVectorizer::updateAnalysis() {
5008  // Forget the original basic block.
5009  PSE.getSE()->forgetLoop(OrigLoop);
5010 
5011  // Update the dominator tree information.
5012  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
5013  "Entry does not dominate exit.");
5014 
5015  DT->addNewBlock(LI->getLoopFor(LoopVectorBody)->getHeader(),
5016  LoopVectorPreHeader);
5017  DT->addNewBlock(LoopMiddleBlock,
5018  LI->getLoopFor(LoopVectorBody)->getLoopLatch());
5019  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
5020  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
5021  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
5022 
5023  DEBUG(DT->verifyDomTree());
5024 }
5025 
5026 /// \brief Check whether it is safe to if-convert this phi node.
5027 ///
5028 /// Phi nodes with constant expressions that can trap are not safe to if
5029 /// convert.
5030 static bool canIfConvertPHINodes(BasicBlock *BB) {
5031  for (Instruction &I : *BB) {
5032  auto *Phi = dyn_cast<PHINode>(&I);
5033  if (!Phi)
5034  return true;
5035  for (Value *V : Phi->incoming_values())
5036  if (auto *C = dyn_cast<Constant>(V))
5037  if (C->canTrap())
5038  return false;
5039  }
5040  return true;
5041 }
5042 
5043 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
5044  if (!EnableIfConversion) {
5045  ORE->emit(createMissedAnalysis("IfConversionDisabled")
5046  << "if-conversion is disabled");
5047  return false;
5048  }
5049 
5050  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
5051 
5052  // A list of pointers that we can safely read and write to.
5053  SmallPtrSet<Value *, 8> SafePointes;
5054 
5055  // Collect safe addresses.
5056  for (BasicBlock *BB : TheLoop->blocks()) {
5057  if (blockNeedsPredication(BB))
5058  continue;
5059 
5060  for (Instruction &I : *BB)
5061  if (auto *Ptr = getPointerOperand(&I))
5062  SafePointes.insert(Ptr);
5063  }
5064 
5065  // Collect the blocks that need predication.
5066  BasicBlock *Header = TheLoop->getHeader();
5067  for (BasicBlock *BB : TheLoop->blocks()) {
5068  // We don't support switch statements inside loops.
5069  if (!isa<BranchInst>(BB->getTerminator())) {
5070  ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator())
5071  << "loop contains a switch statement");
5072  return false;
5073  }
5074 
5075  // We must be able to predicate all blocks that need to be predicated.
5076  if (blockNeedsPredication(BB)) {
5077  if (!blockCanBePredicated(BB, SafePointes)) {
5078  ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
5079  << "control flow cannot be substituted for a select");
5080  return false;
5081  }
5082  } else if (BB != Header && !canIfConvertPHINodes(BB)) {
5083  ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
5084  << "control flow cannot be substituted for a select");
5085  return false;
5086  }
5087  }
5088 
5089  // We can if-convert this loop.
5090  return true;
5091 }
5092 
5093 bool LoopVectorizationLegality::canVectorize() {
5094  // Store the result and return it at the end instead of exiting early, in case
5095  // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
5096  bool Result = true;
5097  // We must have a loop in canonical form. Loops with indirectbr in them cannot
5098  // be canonicalized.
5099  if (!TheLoop->getLoopPreheader()) {
5100  ORE->emit(createMissedAnalysis("CFGNotUnderstood")
5101  << "loop control flow is not understood by vectorizer");
5102  if (ORE->allowExtraAnalysis())
5103  Result = false;
5104  else
5105  return false;
5106  }
5107 
5108  // FIXME: This code is currently dead, since any loop that gets sent to
5109  // LoopVectorizationLegality is already an innermost loop.
5110  //
5111  // We can only vectorize innermost loops.
5112  if (!TheLoop->empty()) {
5113  ORE->emit(createMissedAnalysis("NotInnermostLoop")
5114  << "loop is not the innermost loop");
5115  if (ORE->allowExtraAnalysis())
5116  Result = false;
5117  else
5118  return false;
5119  }
5120 
5121  // We must have a single backedge.
5122  if (TheLoop->getNumBackEdges() != 1) {
5123  ORE->emit(createMissedAnalysis("CFGNotUnderstood")
5124  << "loop control flow is not understood by vectorizer");
5125  if (ORE->allowExtraAnalysis())
5126  Result = false;
5127  else
5128  return false;
5129  }
5130 
5131  // We must have a single exiting block.
5132  if (!TheLoop->getExitingBlock()) {
5133  ORE->emit(createMissedAnalysis("CFGNotUnderstood")
5134  << "loop control flow is not understood by vectorizer");
5135  if (ORE->allowExtraAnalysis())
5136  Result = false;
5137  else
5138  return false;
5139  }
5140 
5141  // We only handle bottom-tested loops, i.e. loops in which the condition is
5142  // checked at the end of each iteration. With that we can assume that all
5143  // instructions in the loop are executed the same number of times.
5144  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5145  ORE->emit(createMissedAnalysis("CFGNotUnderstood")
5146  << "loop control flow is not understood by vectorizer");
5147  if (ORE->allowExtraAnalysis())
5148  Result = false;
5149  else
5150  return false;
5151  }
5152 
5153  // We need to have a loop header.
5154  DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
5155  << '\n');
5156 
5157  // Check if we can if-convert non-single-bb loops.
5158  unsigned NumBlocks = TheLoop->getNumBlocks();
5159  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
5160  DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
5161  if (ORE->allowExtraAnalysis())
5162  Result = false;
5163  else
5164  return false;
5165  }
5166 
5167  // Check if we can vectorize the instructions and CFG in this loop.
5168  if (!canVectorizeInstrs()) {
5169  DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
5170  if (ORE->allowExtraAnalysis())
5171  Result = false;
5172  else
5173  return false;
5174  }
5175 
5176  // Go over each instruction and look at memory deps.
5177  if (!canVectorizeMemory()) {
5178  DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
5179  if (ORE->allowExtraAnalysis())
5180  Result = false;
5181  else
5182  return false;
5183  }
5184 
5185  DEBUG(dbgs() << "LV: We can vectorize this loop"
5186  << (LAI->getRuntimePointerChecking()->Need
5187  ? " (with a runtime bound check)"
5188  : "")
5189  << "!\n");
5190 
5191  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
5192 
5193  // If an override option has been passed in for interleaved accesses, use it.
5194  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
5195  UseInterleaved = EnableInterleavedMemAccesses;
5196 
5197  // Analyze interleaved memory accesses.
5198  if (UseInterleaved)
5199  InterleaveInfo.analyzeInterleaving(*getSymbolicStrides());
5200 
5201  unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
5202  if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
5203  SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
5204 
5205  if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
5206  ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks")
5207  << "Too many SCEV assumptions need to be made and checked "
5208  << "at runtime");
5209  DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
5210  if (ORE->allowExtraAnalysis())
5211  Result = false;
5212  else
5213  return false;
5214  }
5215 
5216  // Okay! We've done all the tests. If any have failed, return false. Otherwise
5217  // we can vectorize, and at this point we don't have any other mem analysis
5218  // which may limit our maximum vectorization factor, so just return true with
5219  // no restrictions.
5220  return Result;
5221 }
5222 
5223 static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
5224  if (Ty->isPointerTy())
5225  return DL.getIntPtrType(Ty);
5226 
5227  // It is possible that chars or shorts overflow when we ask for the loop's
5228  // trip count; work around this by changing the type size.
5229  if (Ty->getScalarSizeInBits() < 32)
5230  return Type::getInt32Ty(Ty->getContext());
5231 
5232  return Ty;
5233 }
5234 
5235 static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
5236  Ty0 = convertPointerToIntegerType(DL, Ty0);
5237  Ty1 = convertPointerToIntegerType(DL, Ty1);
5238  if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
5239  return Ty0;
5240  return Ty1;
5241 }
5242 
5243 /// \brief Check that the instruction has outside loop users and is not an
5244 /// identified reduction variable.
5245 static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
5246  SmallPtrSetImpl<Value *> &AllowedExit) {
5247  // Reduction and Induction instructions are allowed to have exit users. All
5248  // other instructions must not have external users.
5249  if (!AllowedExit.count(Inst))
5250  // Check that all of the users of the loop are inside the BB.
5251  for (User *U : Inst->users()) {
5252  Instruction *UI = cast<Instruction>(U);
5253  // This user may be a reduction exit value.
5254  if (!TheLoop->contains(UI)) {
5255  DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
5256  return true;
5257  }
5258  }
5259  return false;
5260 }
5261 
5262 void LoopVectorizationLegality::addInductionPhi(
5263  PHINode *Phi, const InductionDescriptor &ID,
5264  SmallPtrSetImpl<Value *> &AllowedExit) {
5265  Inductions[Phi] = ID;
5266  Type *PhiTy = Phi->getType();
5267  const DataLayout &DL = Phi->getModule()->getDataLayout();
5268 
5269  // Get the widest type.
5270  if (!PhiTy->isFloatingPointTy()) {
5271  if (!WidestIndTy)
5272  WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
5273  else
5274  WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
5275  }
5276 
5277  // Int inductions are special because we only allow one IV.
5278  if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
5279  ID.getConstIntStepValue() &&
5280  ID.getConstIntStepValue()->isOne() &&
5281  isa<Constant>(ID.getStartValue()) &&
5282  cast<Constant>(ID.getStartValue())->isNullValue()) {
5283 
5284  // Use the phi node with the widest type as induction. Use the last
5285  // one if there are multiple (no good reason for doing this other
5286  // than it is expedient). We've checked that it begins at zero and
5287  // steps by one, so this is a canonical induction variable.
5288  if (!PrimaryInduction || PhiTy == WidestIndTy)
5289  PrimaryInduction = Phi;
5290  }
5291 
5292  // Both the PHI node itself, and the "post-increment" value feeding
5293  // back into the PHI node may have external users.
5294  // We can allow those uses, except if the SCEVs we have for them rely
5295  // on predicates that only hold within the loop, since allowing the exit
5296  // currently means re-using this SCEV outside the loop.
5297  if (PSE.getUnionPredicate().isAlwaysTrue()) {
5298  AllowedExit.insert(Phi);
5299  AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
5300  }
5301 
5302  DEBUG(dbgs() << "LV: Found an induction variable.\n");
5303  return;
5304 }
5305 
5306 bool LoopVectorizationLegality::canVectorizeInstrs() {
5307  BasicBlock *Header = TheLoop->getHeader();
5308 
5309  // Look for the attribute signaling the absence of NaNs.
5310  Function &F = *Header->getParent();
5311  HasFunNoNaNAttr =
5312  F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
5313 
5314  // For each block in the loop.
5315  for (BasicBlock *BB : TheLoop->blocks()) {
5316  // Scan the instructions in the block and look for hazards.
5317  for (Instruction &I : *BB) {
5318  if (auto *Phi = dyn_cast<PHINode>(&I)) {
5319  Type *PhiTy = Phi->getType();
5320  // Check that this PHI type is allowed.
5321  if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
5322  !PhiTy->isPointerTy()) {
5323  ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
5324  << "loop control flow is not understood by vectorizer");
5325  DEBUG(dbgs() << "LV: Found a non-int non-pointer PHI.\n");
5326  return false;
5327  }
5328 
5329  // If this PHINode is not in the header block, then we know that we
5330  // can convert it to select during if-conversion. No need to check if
5331  // the PHIs in this block are induction or reduction variables.
5332  if (BB != Header) {
5333  // Check that this instruction has no outside users or is an
5334  // identified reduction value with an outside user.
5335  if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit))
5336  continue;
5337  ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi)
5338  << "value could not be identified as "
5339  "an induction or reduction variable");
5340  return false;
5341  }
5342 
5343  // We only allow if-converted PHIs with exactly two incoming values.
5344  if (Phi->getNumIncomingValues() != 2) {
5345  ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
5346  << "control flow not understood by vectorizer");
5347  DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
5348  return false;
5349  }
5350 
5351  RecurrenceDescriptor RedDes;
5352  if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) {
5353  if (RedDes.hasUnsafeAlgebra())
5354  Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
5355  AllowedExit.insert(RedDes.getLoopExitInstr());
5356  Reductions[Phi] = RedDes;
5357  continue;
5358  }
5359 
5360  InductionDescriptor ID;
5361  if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
5362  addInductionPhi(Phi, ID, AllowedExit);
5363  if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
5364  Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
5365  continue;
5366  }
5367 
5368  if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
5369  SinkAfter, DT)) {
5370  FirstOrderRecurrences.insert(Phi);
5371  continue;
5372  }
5373 
5374  // As a last resort, coerce the PHI to an AddRec expression
5375  // and re-try classifying it as an induction PHI.
5376  if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
5377  addInductionPhi(Phi, ID, AllowedExit);
5378  continue;
5379  }
5380 
5381  ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi)
5382  << "value that could not be identified as "
5383  "reduction is used outside the loop");
5384  DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n");
5385  return false;
5386  } // end of PHI handling
5387 
5388  // We handle calls that:
5389  // * Are debug info intrinsics.
5390  // * Have a mapping to an IR intrinsic.
5391  // * Have a vector version available.
5392  auto *CI = dyn_cast<CallInst>(&I);
5393  if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
5394  !isa<DbgInfoIntrinsic>(CI) &&
5395  !(CI->getCalledFunction() && TLI &&
5396  TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
5397  ORE->emit(createMissedAnalysis("CantVectorizeCall", CI)
5398  << "call instruction cannot be vectorized");
5399  DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
5400  return false;
5401  }
5402 
5403  // Intrinsics such as powi, cttz and ctlz are legal to vectorize if the
5404  // second argument is the same across iterations (i.e. loop invariant).
5405  if (CI && hasVectorInstrinsicScalarOpd(
5406  getVectorIntrinsicIDForCall(CI, TLI), 1)) {
5407  auto *SE = PSE.getSE();
5408  if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
5409  ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI)
5410  << "intrinsic instruction cannot be vectorized");
5411  DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
5412  return false;
5413  }
5414  }
5415 
5416  // Check that the instruction return type is vectorizable.
5417  // Also, we can't vectorize extractelement instructions.
5418  if ((!VectorType::isValidElementType(I.getType()) &&
5419  !I.getType()->isVoidTy()) ||
5420  isa<ExtractElementInst>(I)) {
5421  ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I)
5422  << "instruction return type cannot be vectorized");
5423  DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
5424  return false;
5425  }
5426 
5427  // Check that the stored type is vectorizable.
5428  if (auto *ST = dyn_cast<StoreInst>(&I)) {
5429  Type *T = ST->getValueOperand()->getType();
5430  if (!VectorType::isValidElementType(T)) {
5431  ORE->emit(createMissedAnalysis("CantVectorizeStore", ST)
5432  << "store instruction cannot be vectorized");
5433  return false;
5434  }
5435 
5436  // FP instructions can allow unsafe algebra, thus vectorizable by
5437  // non-IEEE-754 compliant SIMD units.
5438  // This applies to floating-point math operations and calls, not memory
5439  // operations, shuffles, or casts, as they don't change precision or
5440  // semantics.
5441  } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
5442  !I.hasUnsafeAlgebra()) {
5443  DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
5444  Hints->setPotentiallyUnsafe();
5445  }
5446 
5447  // Reduction instructions are allowed to have exit users.
5448  // All other instructions must not have external users.
5449  if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
5450  ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I)
5451  << "value cannot be used outside the loop");
5452  return false;
5453  }
5454 
5455  } // next instr.
5456  }
5457 
5458  if (!PrimaryInduction) {
5459  DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
5460  if (Inductions.empty()) {
5461  ORE->emit(createMissedAnalysis("NoInductionVariable")
5462  << "loop induction variable could not be identified");
5463  return false;
5464  }
5465  }
5466 
5467  // Now we know the widest induction type, check if our found induction
5468  // is the same size. If it's not, unset it here and InnerLoopVectorizer
5469  // will create another.
5470  if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
5471  PrimaryInduction = nullptr;
5472 
5473  return true;
5474 }
5475 
5476 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
5477 
5478  // We should not collect Scalars more than once per VF. Right now, this
5479  // function is called from collectUniformsAndScalars(), which already does
5480  // this check. Collecting Scalars for VF=1 does not make any sense.
5481  assert(VF >= 2 && !Scalars.count(VF) &&
5482  "This function should not be visited twice for the same VF");
5483 
5484  SmallSetVector<Instruction *, 8> Worklist;
5485 
5486  // These sets are used to seed the analysis with pointers used by memory
5487  // accesses that will remain scalar.
5488  SmallSetVector<Instruction *, 8> ScalarPtrs;
5489  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
5490 
5491  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
5492  // The pointer operands of loads and stores will be scalar as long as the
5493  // memory access is not a gather or scatter operation. The value operand of a
5494  // store will remain scalar if the store is scalarized.
5495  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5496  InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5497  assert(WideningDecision != CM_Unknown &&
5498  "Widening decision should be ready at this moment");
5499  if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5500  if (Ptr == Store->getValueOperand())
5501  return WideningDecision == CM_Scalarize;
5502  assert(Ptr == getPointerOperand(MemAccess) &&
5503  "Ptr is neither a value nor a pointer operand");
5504  return WideningDecision != CM_GatherScatter;
5505  };
5506 
5507  // A helper that returns true if the given value is a bitcast or
5508  // getelementptr instruction contained in the loop.
5509  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5510  return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5511  isa<GetElementPtrInst>(V)) &&
5512  !TheLoop->isLoopInvariant(V);
5513  };
5514 
5515  // A helper that evaluates a memory access's use of a pointer. If the use
5516  // will be a scalar use, and the pointer is only used by memory accesses, we
5517  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
5518  // PossibleNonScalarPtrs.
5519  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5520 
5521  // We only care about bitcast and getelementptr instructions contained in
5522  // the loop.
5523  if (!isLoopVaryingBitCastOrGEP(Ptr))
5524  return;
5525 
5526  // If the pointer has already been identified as scalar (e.g., if it was
5527  // also identified as uniform), there's nothing to do.
5528  auto *I = cast<Instruction>(Ptr);
5529  if (Worklist.count(I))
5530  return;
5531 
5532  // If the use of the pointer will be a scalar use, and all users of the
5533  // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5534  // place the pointer in PossibleNonScalarPtrs.
5535  if (isScalarUse(MemAccess, Ptr) && all_of(I->users(), [&](User *U) {
5536  return isa<LoadInst>(U) || isa<StoreInst>(U);
5537  }))
5538  ScalarPtrs.insert(I);
5539  else
5540  PossibleNonScalarPtrs.insert(I);
5541  };
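  // For example (an illustrative case), a getelementptr used only as the
  // address of loads and stores may be placed in ScalarPtrs, whereas one that
  // is also compared against another pointer has a non-memory user and is
  // placed in PossibleNonScalarPtrs.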
5542 
5543  // We seed the scalars analysis with three classes of instructions: (1)
5544  // instructions marked uniform-after-vectorization, (2) bitcast and
5545  // getelementptr instructions used by memory accesses requiring a scalar use,
5546  // and (3) pointer induction variables and their update instructions (we
5547  // currently only scalarize these).
5548  //
5549  // (1) Add to the worklist all instructions that have been identified as
5550  // uniform-after-vectorization.
5551  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5552 
5553  // (2) Add to the worklist all bitcast and getelementptr instructions used by
5554  // memory accesses requiring a scalar use. The pointer operands of loads and
5555  // stores will be scalar as long as the memory access is not a gather or
5556  // scatter operation. The value operand of a store will remain scalar if the
5557  // store is scalarized.
5558  for (auto *BB : TheLoop->blocks())
5559  for (auto &I : *BB) {
5560  if (auto *Load = dyn_cast<LoadInst>(&I)) {
5561  evaluatePtrUse(Load, Load->getPointerOperand());
5562  } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5563  evaluatePtrUse(Store, Store->getPointerOperand());
5564  evaluatePtrUse(Store, Store->getValueOperand());
5565  }
5566  }
5567  for (auto *I : ScalarPtrs)
5568  if (!PossibleNonScalarPtrs.count(I)) {
5569  DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5570  Worklist.insert(I);
5571  }
5572 
5573  // (3) Add to the worklist all pointer induction variables and their update
5574  // instructions.
5575  //
5576  // TODO: Once we are able to vectorize pointer induction variables we should
5577  // no longer insert them into the worklist here.
5578  auto *Latch = TheLoop->getLoopLatch();
5579  for (auto &Induction : *Legal->getInductionVars()) {
5580  auto *Ind = Induction.first;
5581  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5582  if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
5583  continue;
5584  Worklist.insert(Ind);
5585  Worklist.insert(IndUpdate);
5586  DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5587  DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
5588  }
5589 
5590  // Insert the forced scalars.
5591  // FIXME: Currently widenPHIInstruction() often creates a dead vector
5592  // induction variable when the PHI user is scalarized.
5593  if (ForcedScalars.count(VF))
5594  for (auto *I : ForcedScalars.find(VF)->second)
5595  Worklist.insert(I);
5596 
5597  // Expand the worklist by looking through any bitcasts and getelementptr
5598  // instructions we've already identified as scalar. This is similar to the
5599  // expansion step in collectLoopUniforms(); however, here we're only
5600  // expanding to include additional bitcasts and getelementptr instructions.
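  // For example (illustrative only), when a scalarized load uses a bitcast of
  // a getelementptr, the bitcast is identified as scalar first, and this
  // expansion then pulls in the getelementptr that feeds it as well.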
5601  unsigned Idx = 0;
5602  while (Idx != Worklist.size()) {
5603  Instruction *Dst = Worklist[Idx++];
5604  if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5605  continue;
5606  auto *Src = cast<Instruction>(Dst->getOperand(0));
5607  if (all_of(Src->users(), [&](User *U) -> bool {
5608  auto *J = cast<Instruction>(U);
5609  return !TheLoop->contains(J) || Worklist.count(J) ||
5610  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5611  isScalarUse(J, Src));
5612  })) {
5613  Worklist.insert(Src);
5614  DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5615  }
5616  }
5617 
5618  // An induction variable will remain scalar if all users of the induction
5619  // variable and induction variable update remain scalar.
5620  for (auto &Induction : *Legal->getInductionVars()) {
5621  auto *Ind = Induction.first;
5622  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5623 
5624  // We already considered pointer induction variables, so there's no reason
5625  // to look at their users again.
5626  //
5627  // TODO: Once we are able to vectorize pointer induction variables we
5628  // should no longer skip over them here.
5629  if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
5630  continue;
5631 
5632  // Determine if all users of the induction variable are scalar after
5633  // vectorization.
5634  auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
5635  auto *I = cast<Instruction>(U);
5636  return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5637  });
5638  if (!ScalarInd)
5639  continue;
5640 
5641  // Determine if all users of the induction variable update instruction are
5642  // scalar after vectorization.
5643  auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
5644  auto *I = cast<Instruction>(U);
5645  return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5646  });
5647  if (!ScalarIndUpdate)
5648  continue;
5649 
5650  // The induction variable and its update instruction will remain scalar.
5651  Worklist.insert(Ind);
5652  Worklist.insert(IndUpdate);
5653  DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5654  DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
5655  }
5656 
5657  Scalars[VF].insert(Worklist.begin(), Worklist.end());
5658 }
5659 
5660 bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
5661  if (!blockNeedsPredication(I->getParent()))
5662  return false;
5663  switch(I->getOpcode()) {
5664  default:
5665  break;
5666  case Instruction::Store:
5667  return !isMaskRequired(I);
5668  case Instruction::UDiv:
5669  case Instruction::SDiv:
5670  case Instruction::SRem:
5671  case Instruction::URem:
5672  return mayDivideByZero(*I);
5673  }
5674  return false;
5675 }
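// For example, a udiv whose divisor may be zero and that appears in a block
// executed only under a loop-varying condition must be scalarized and
// predicated: executing it unconditionally for every vector lane could divide
// by zero in a masked-off lane. Likewise, a store in such a block is
// scalarized and predicated unless it was recorded as maskable
// (isMaskRequired), in which case it can be emitted as a masked store.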
5676 
5677 bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
5678  unsigned VF) {
5679  // Get and ensure we have a valid memory instruction.
5680  LoadInst *LI = dyn_cast<LoadInst>(I);
5681  StoreInst *SI = dyn_cast<StoreInst>(I);
5682  assert((LI || SI) && "Invalid memory instruction");
5683 
5684  auto *Ptr = getPointerOperand(I);
5685 
5686  // First of all, in order to be widened the pointer must be consecutive.
5687  if (!isConsecutivePtr(Ptr))
5688  return false;
5689 
5690  // If the instruction is a store located in a predicated block, it will be
5691  // scalarized.
5692  if (isScalarWithPredication(I))
5693  return false;
5694 
5695  // If the instruction's allocated size doesn't equal its type size, it
5696  // requires padding and will be scalarized.
5697  auto &DL = I->getModule()->getDataLayout();
5698  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5699  if (hasIrregularType(ScalarTy, DL, VF))
5700  return false;
5701 
5702  return true;
5703 }
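// As an illustration (assuming a typical data layout), an x86_fp80 value has
// a type size of 80 bits but a larger allocation size, so an array of such
// values contains padding between elements; hasIrregularType() detects this
// and the access is scalarized rather than widened.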
5704 
5705 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
5706 
5707  // We should not collect Uniforms more than once per VF. Right now,
5708  // this function is called from collectUniformsAndScalars(), which
5709  // already does this check. Collecting Uniforms for VF=1 does not make any
5710  // sense.
5711 
5712  assert(VF >= 2 && !Uniforms.count(VF) &&
5713  "This function should not be visited twice for the same VF");
5714 
5715  // Visit the list of Uniforms. Even if we find no uniform value, we record
5716  // an entry so we do not analyze this VF again; Uniforms.count(VF) will return 1.
5717  Uniforms[VF].clear();
5718 
5719  // We now know that the loop is vectorizable!
5720  // Collect instructions inside the loop that will remain uniform after
5721  // vectorization.
5722 
5723  // Global values, parameters, and instructions outside of the current loop
5724  // are out of scope.
5725  auto isOutOfScope = [&](Value *V) -> bool {
5726  Instruction *I = dyn_cast<Instruction>(V);
5727  return (!I || !TheLoop->contains(I));
5728  };
5729 
5730  SetVector<Instruction *> Worklist;
5731  BasicBlock *Latch = TheLoop->getLoopLatch();
5732 
5733  // Start with the conditional branch. If the branch condition is an
5734  // instruction contained in the loop that is only used by the branch, it is
5735  // uniform.
5736  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5737  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
5738  Worklist.insert(Cmp);
5739  DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
5740  }
5741 
5742  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
5743  // are pointers that are treated like consecutive pointers during
5744  // vectorization. The pointer operands of interleaved accesses are an
5745  // example.
5746  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
5747 
5748  // Holds pointer operands of instructions that are possibly non-uniform.
5749  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
5750 
5751  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
5752  InstWidening WideningDecision = getWideningDecision(I, VF);
5753  assert(WideningDecision != CM_Unknown &&
5754  "Widening decision should be ready at this moment");
5755 
5756  return (WideningDecision == CM_Widen ||
5757  WideningDecision == CM_Interleave);
5758  };
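  // Intuitively, a widened consecutive access or an interleaved access needs
  // only a single scalar address per vector iteration, so its pointer operand
  // can remain uniform; a gathered/scattered or scalarized access needs a
  // distinct address per lane.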
5759  // Iterate over the instructions in the loop, and collect all
5760  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
5761  // that a consecutive-like pointer operand will be scalarized, we collect it
5762  // in PossibleNonUniformPtrs instead. We use two sets here because a single
5763  // getelementptr instruction can be used by both vectorized and scalarized
5764  // memory instructions. For example, if a loop loads and stores from the same
5765  // location, but the store is conditional, the store will be scalarized, and
5766  // the getelementptr won't remain uniform.
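  // A sketch of that situation (illustrative only):
  //
  //   for (i = 0; i < n; ++i) {
  //     x = a[i];        // the load is widened; &a[i] feeds a vector load
  //     if (c[i])
  //       a[i] = x + 1;  // the conditional store is scalarized; &a[i] is
  //   }                  // needed per lane, so the GEP is not uniform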
5767  for (auto *BB : TheLoop->blocks())
5768  for (auto &I : *BB) {
5769 
5770  // If there's no pointer operand, there's nothing to do.
5771  auto *Ptr = dyn_cast_or_null<Instruction>(getPointerOperand(&I));
5772  if (!Ptr)
5773  continue;
5774 
5775  // True if all users of Ptr are memory accesses that have Ptr as their
5776  // pointer operand.
5777  auto UsersAreMemAccesses = all_of(Ptr->users(), [&](User *U) -> bool {
5778  return getPointerOperand(U) == Ptr;
5779  });
5780 
5781  // Ensure the memory instruction will not be scalarized or used by
5782  // gather/scatter, making its pointer operand non-uniform. If the pointer
5783  // operand is used by any instruction other than a memory access, we
5784  // conservatively assume the pointer operand may be non-uniform.
5785  if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
5786  PossibleNonUniformPtrs.insert(Ptr);
5787 
5788  // If the memory instruction will be vectorized (widened or part of an
5789  // interleaved group) and its pointer operand is consecutive-like, the
5790  // pointer operand should remain uniform.
5791  else
5792  ConsecutiveLikePtrs.insert(Ptr);
5793  }
5794 
5795  // Add to the Worklist all consecutive and consecutive-like pointers that
5796  // aren't also identified as possibly non-uniform.
5797  for (auto *V : ConsecutiveLikePtrs)
5798  if (!PossibleNonUniformPtrs.count(V)) {
5799  DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
5800  Worklist.insert(V);
5801  }
5802 
5803  // Expand Worklist in topological order: whenever a new instruction
5804  // is added, its users should be either already inside Worklist, or
5805  // out of scope. This ensures a uniform instruction will only be used
5806  // by uniform instructions or out-of-scope instructions.
5807  unsigned idx = 0;
5808  while (idx != Worklist.size()) {
5809  Instruction *I = Worklist[idx++];
5810 
5811  for (auto OV : I->operand_values()) {
5812  if (isOutOfScope(OV))
5813  continue;
5814  auto *OI = cast<Instruction>(OV);
5815  if (all_of(OI->users(), [&](User *U) -> bool {
5816  auto *J = cast<Instruction>(U);
5817  return !TheLoop->contains(J) || Worklist.count(J) ||
5818  (OI == getPointerOperand(J) && isUniformDecision(J, VF));
5819  })) {
5820  Worklist.insert(OI);
5821  DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
5822  }
5823  }
5824  }
5825 
5826  // Returns true if Ptr is the pointer operand of a memory access instruction
5827  // I, and I is known to not require scalarization.
5828  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5829  return getPointerOperand(I) == Ptr && isUniformDecision(I, VF);
5830  };
5831 
5832  // For an instruction to be added into Worklist above, all its users inside
5833  // the loop should also be in Worklist. However, this condition cannot be
5834  // true for phi nodes that form a cyclic dependence. We must process phi
5835  // nodes separately. An induction variable will remain uniform if all users
5836  // of the induction variable and induction variable update remain uniform.
5837  // The code below handles both pointer and non-pointer induction variables.
5838  for (auto &Induction : *Legal->getInductionVars()) {
5839  auto *Ind = Induction.first;
5840  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5841 
5842  // Determine if all users of the induction variable are uniform after
5843  // vectorization.
5844  auto UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
5845  auto *I = cast<Instruction>(U);
5846  return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5847  isVectorizedMemAccessUse(I, Ind);
5848  });
5849  if (!UniformInd)
5850  continue;
5851 
5852  // Determine if all users of the induction variable update instruction are
5853  // uniform after vectorization.
5854  auto UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
5855  auto *I = cast<Instruction>(U);
5856  return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5857  isVectorizedMemAccessUse(I, IndUpdate);
5858  });
5859  if (!UniformIndUpdate)
5860  continue;
5861 
5862  // The induction variable and its update instruction will remain uniform.
5863  Worklist.insert(Ind);
5864  Worklist.insert(IndUpdate);
5865  DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
5866  DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n");
5867  }
5868 
5869  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5870 }
5871 
5872 bool LoopVectorizationLegality::canVectorizeMemory() {
5873  LAI = &(*GetLAA)(*TheLoop);
5874  InterleaveInfo.setLAI(LAI);
5875  const Optimizatio