1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
11 // and generates target-independent LLVM-IR.
12 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
13 // of instructions in order to estimate the profitability of vectorization.
14 //
15 // The loop vectorizer combines consecutive loop iterations into a single
16 // 'wide' iteration. After this transformation the index is incremented
17 // by the SIMD vector width, and not by one.
18 //
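// For illustration (a sketch, not part of the original header): with a
// vectorization factor of 4, a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     C[i] = A[i] + B[i];
//
// is widened so that each new iteration computes four elements at once and
// the induction variable is incremented by 4; a scalar epilogue loop handles
// the remaining n % 4 iterations.
//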
19 // This pass has four parts:
20 // 1. The main loop pass that drives the different parts.
21 // 2. LoopVectorizationLegality - A unit that checks for the legality
22 // of the vectorization.
23 // 3. InnerLoopVectorizer - A unit that performs the actual
24 // widening of instructions.
25 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
26 // of vectorization. It decides on the optimal vector width, which
27 // can be one, if vectorization is not profitable.
28 //
29 //===----------------------------------------------------------------------===//
30 //
31 // The reduction-variable vectorization is based on the paper:
32 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
33 //
34 // Variable uniformity checks are inspired by:
35 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
36 //
37 // The interleaved access vectorization is based on the paper:
38 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
39 // Data for SIMD
40 //
41 // Other ideas/concepts are from:
42 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
43 //
44 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
45 // Vectorizing Compilers.
46 //
47 //===----------------------------------------------------------------------===//
48 
50 #include "VPlan.h"
51 #include "VPlanBuilder.h"
52 #include "llvm/ADT/APInt.h"
53 #include "llvm/ADT/ArrayRef.h"
54 #include "llvm/ADT/DenseMap.h"
55 #include "llvm/ADT/DenseMapInfo.h"
56 #include "llvm/ADT/Hashing.h"
57 #include "llvm/ADT/MapVector.h"
58 #include "llvm/ADT/None.h"
59 #include "llvm/ADT/Optional.h"
60 #include "llvm/ADT/SCCIterator.h"
61 #include "llvm/ADT/STLExtras.h"
62 #include "llvm/ADT/SetVector.h"
63 #include "llvm/ADT/SmallPtrSet.h"
64 #include "llvm/ADT/SmallSet.h"
65 #include "llvm/ADT/SmallVector.h"
66 #include "llvm/ADT/Statistic.h"
67 #include "llvm/ADT/StringRef.h"
68 #include "llvm/ADT/Twine.h"
78 #include "llvm/Analysis/LoopInfo.h"
87 #include "llvm/IR/Attributes.h"
88 #include "llvm/IR/BasicBlock.h"
89 #include "llvm/IR/CFG.h"
90 #include "llvm/IR/Constant.h"
91 #include "llvm/IR/Constants.h"
92 #include "llvm/IR/DataLayout.h"
94 #include "llvm/IR/DebugLoc.h"
95 #include "llvm/IR/DerivedTypes.h"
96 #include "llvm/IR/DiagnosticInfo.h"
97 #include "llvm/IR/Dominators.h"
98 #include "llvm/IR/Function.h"
99 #include "llvm/IR/IRBuilder.h"
100 #include "llvm/IR/InstrTypes.h"
101 #include "llvm/IR/Instruction.h"
102 #include "llvm/IR/Instructions.h"
103 #include "llvm/IR/IntrinsicInst.h"
104 #include "llvm/IR/Intrinsics.h"
105 #include "llvm/IR/LLVMContext.h"
106 #include "llvm/IR/Metadata.h"
107 #include "llvm/IR/Module.h"
108 #include "llvm/IR/Operator.h"
109 #include "llvm/IR/Type.h"
110 #include "llvm/IR/Use.h"
111 #include "llvm/IR/User.h"
112 #include "llvm/IR/Value.h"
113 #include "llvm/IR/ValueHandle.h"
114 #include "llvm/IR/Verifier.h"
115 #include "llvm/Pass.h"
116 #include "llvm/Support/Casting.h"
118 #include "llvm/Support/Compiler.h"
119 #include "llvm/Support/Debug.h"
121 #include "llvm/Support/MathExtras.h"
127 #include <algorithm>
128 #include <cassert>
129 #include <cstdint>
130 #include <cstdlib>
131 #include <functional>
132 #include <iterator>
133 #include <limits>
134 #include <memory>
135 #include <string>
136 #include <tuple>
137 #include <utility>
138 #include <vector>
139 
140 using namespace llvm;
141 
142 #define LV_NAME "loop-vectorize"
143 #define DEBUG_TYPE LV_NAME
144 
145 STATISTIC(LoopsVectorized, "Number of loops vectorized");
146 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
147 
148 static cl::opt<bool>
149  EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
150  cl::desc("Enable if-conversion during vectorization."));
151 
152 /// Loops with a known constant trip count below this number are vectorized only
153 /// if no scalar iteration overheads are incurred.
154 static cl::opt<unsigned> TinyTripCountVectorThreshold(
155  "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
156  cl::desc("Loops with a constant trip count that is smaller than this "
157  "value are vectorized only if no scalar iteration overheads "
158  "are incurred."));
159 
160 static cl::opt<bool> MaximizeBandwidth(
161  "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
162  cl::desc("Maximize bandwidth when selecting vectorization factor which "
163  "will be determined by the smallest type in loop."));
164 
165 static cl::opt<bool> EnableInterleavedMemAccesses(
166  "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
167  cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
168 
169 /// Maximum factor for an interleaved memory access.
170 static cl::opt<unsigned> MaxInterleaveGroupFactor(
171  "max-interleave-group-factor", cl::Hidden,
172  cl::desc("Maximum factor for an interleaved access group (default = 8)"),
173  cl::init(8));
174 
175 /// We don't interleave loops with a known constant trip count below this
176 /// number.
177 static const unsigned TinyTripCountInterleaveThreshold = 128;
178 
179 static cl::opt<unsigned> ForceTargetNumScalarRegs(
180  "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
181  cl::desc("A flag that overrides the target's number of scalar registers."));
182 
183 static cl::opt<unsigned> ForceTargetNumVectorRegs(
184  "force-target-num-vector-regs", cl::init(0), cl::Hidden,
185  cl::desc("A flag that overrides the target's number of vector registers."));
186 
187 /// Maximum vectorization interleave count.
188 static const unsigned MaxInterleaveFactor = 16;
189 
190 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
191  "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
192  cl::desc("A flag that overrides the target's max interleave factor for "
193  "scalar loops."));
194 
195 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
196  "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
197  cl::desc("A flag that overrides the target's max interleave factor for "
198  "vectorized loops."));
199 
200 static cl::opt<unsigned> ForceTargetInstructionCost(
201  "force-target-instruction-cost", cl::init(0), cl::Hidden,
202  cl::desc("A flag that overrides the target's expected cost for "
203  "an instruction to a single constant value. Mostly "
204  "useful for getting consistent testing."));
205 
206 static cl::opt<unsigned> SmallLoopCost(
207  "small-loop-cost", cl::init(20), cl::Hidden,
208  cl::desc(
209  "The cost of a loop that is considered 'small' by the interleaver."));
210 
211 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
212  "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
213  cl::desc("Enable the use of the block frequency analysis to access PGO "
214  "heuristics minimizing code growth in cold regions and being more "
215  "aggressive in hot regions."));
216 
217 // Runtime interleave loops for load/store throughput.
218 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
219  "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
220  cl::desc(
221  "Enable runtime interleaving until load/store ports are saturated"));
222 
223 /// The number of stores in a loop that are allowed to need predication.
224 static cl::opt<unsigned> NumberOfStoresToPredicate(
225  "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
226  cl::desc("Max number of stores to be predicated behind an if."));
227 
228 static cl::opt<bool> EnableIndVarRegisterHeur(
229  "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
230  cl::desc("Count the induction variable only once when interleaving"));
231 
232 static cl::opt<bool> EnableCondStoresVectorization(
233  "enable-cond-stores-vec", cl::init(true), cl::Hidden,
234  cl::desc("Enable if predication of stores during vectorization."));
235 
236 static cl::opt<unsigned> MaxNestedScalarReductionIC(
237  "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
238  cl::desc("The maximum interleave count to use when interleaving a scalar "
239  "reduction in a nested loop."));
240 
241 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
242  "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
243  cl::desc("The maximum allowed number of runtime memory checks with a "
244  "vectorize(enable) pragma."));
245 
246 static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
247  "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
248  cl::desc("The maximum number of SCEV checks allowed."));
249 
250 static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
251  "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
252  cl::desc("The maximum number of SCEV checks allowed with a "
253  "vectorize(enable) pragma"));
254 
255 /// Create an analysis remark that explains why vectorization failed
256 ///
257 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
258 /// RemarkName is the identifier for the remark. If \p I is passed it is an
259 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for
260 /// the location of the remark. \return the remark object that can be
261 /// streamed to.
262 static OptimizationRemarkAnalysis
263 createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
264  Instruction *I = nullptr) {
265  Value *CodeRegion = TheLoop->getHeader();
266  DebugLoc DL = TheLoop->getStartLoc();
267 
268  if (I) {
269  CodeRegion = I->getParent();
270  // If there is no debug location attached to the instruction, revert back to
271  // using the loop's.
272  if (I->getDebugLoc())
273  DL = I->getDebugLoc();
274  }
275 
276  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
277  R << "loop not vectorized: ";
278  return R;
279 }
280 
281 namespace {
282 
283 class LoopVectorizationLegality;
284 class LoopVectorizationCostModel;
285 class LoopVectorizationRequirements;
286 class VPBlendRecipe;
287 class VPInterleaveRecipe;
288 class VPReplicateRecipe;
289 class VPWidenIntOrFpInductionRecipe;
290 class VPWidenRecipe;
291 class VPWidenMemoryInstructionRecipe;
292 
293 } // end anonymous namespace
294 
295 /// Returns true if the given loop body has a cycle, excluding the loop
296 /// itself.
297 static bool hasCyclesInLoopBody(const Loop &L) {
298  if (!L.empty())
299  return true;
300 
301  for (const auto &SCC :
302  make_range(scc_iterator<Loop, LoopBodyTraits>::begin(L),
303  scc_iterator<Loop, LoopBodyTraits>::end(L))) {
304  if (SCC.size() > 1) {
305  DEBUG(dbgs() << "LVL: Detected a cycle in the loop body:\n");
306  DEBUG(L.dump());
307  return true;
308  }
309  }
310  return false;
311 }
312 
313 /// A helper function for converting Scalar types to vector types.
314 /// If the incoming type is void, we return void. If the VF is 1, we return
315 /// the scalar type.
316 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
317  if (Scalar->isVoidTy() || VF == 1)
318  return Scalar;
319  return VectorType::get(Scalar, VF);
320 }
321 
322 // FIXME: The following helper functions have multiple implementations
323 // in the project. They can be effectively organized in a common Load/Store
324 // utilities unit.
325 
326 /// A helper function that returns the pointer operand of a load or store
327 /// instruction.
328 static Value *getPointerOperand(Value *I) {
329  if (auto *LI = dyn_cast<LoadInst>(I))
330  return LI->getPointerOperand();
331  if (auto *SI = dyn_cast<StoreInst>(I))
332  return SI->getPointerOperand();
333  return nullptr;
334 }
335 
336 /// A helper function that returns the type of loaded or stored value.
337 static Type *getMemInstValueType(Value *I) {
338  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
339  "Expected Load or Store instruction");
340  if (auto *LI = dyn_cast<LoadInst>(I))
341  return LI->getType();
342  return cast<StoreInst>(I)->getValueOperand()->getType();
343 }
344 
345 /// A helper function that returns the alignment of load or store instruction.
346 static unsigned getMemInstAlignment(Value *I) {
347  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
348  "Expected Load or Store instruction");
349  if (auto *LI = dyn_cast<LoadInst>(I))
350  return LI->getAlignment();
351  return cast<StoreInst>(I)->getAlignment();
352 }
353 
354 /// A helper function that returns the address space of the pointer operand of
355 /// load or store instruction.
356 static unsigned getMemInstAddressSpace(Value *I) {
357  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
358  "Expected Load or Store instruction");
359  if (auto *LI = dyn_cast<LoadInst>(I))
360  return LI->getPointerAddressSpace();
361  return cast<StoreInst>(I)->getPointerAddressSpace();
362 }
363 
364 /// A helper function that returns true if the given type is irregular. The
365 /// type is irregular if its allocated size doesn't equal the store size of an
366 /// element of the corresponding vector type at the given vectorization factor.
367 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
368  // Determine if an array of VF elements of type Ty is "bitcast compatible"
369  // with a <VF x Ty> vector.
370  if (VF > 1) {
371  auto *VectorTy = VectorType::get(Ty, VF);
372  return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
373  }
374 
375  // If the vectorization factor is one, we just check if an array of type Ty
376  // requires padding between elements.
377  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
378 }
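
// For illustration (not part of the original source, assuming a typical data
// layout): with Ty == i1 and VF == 8, DL.getTypeAllocSize(i1) is 1 byte, so
// VF * 1 == 8 bytes, while DL.getTypeStoreSize(<8 x i1>) is 1 byte; the sizes
// differ, so i1 is irregular at that factor. For Ty == i32 and VF == 4 both
// sides are 16 bytes, so the type is regular.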
379 
380 /// A helper function that returns the reciprocal of the block probability of
381 /// predicated blocks. If we return X, we are assuming the predicated block
382 /// will execute once for every X iterations of the loop header.
383 ///
384 /// TODO: We should use actual block probability here, if available. Currently,
385 /// we always assume predicated blocks have a 50% chance of executing.
386 static unsigned getReciprocalPredBlockProb() { return 2; }
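
// For example (a sketch of the intended use, not from the original source):
// an instruction that only executes in a predicated block has its scalar cost
// divided by this value, so a cost of 8 is counted as 8 / 2 == 4 per header
// iteration.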
387 
388 /// A helper function that adds a 'fast' flag to floating-point operations.
389 static Value *addFastMathFlag(Value *V) {
390  if (isa<FPMathOperator>(V)) {
391  FastMathFlags Flags;
392  Flags.setFast();
393  cast<Instruction>(V)->setFastMathFlags(Flags);
394  }
395  return V;
396 }
397 
398 /// A helper function that returns an integer or floating-point constant with
399 /// value C.
400 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
401  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
402  : ConstantFP::get(Ty, C);
403 }
404 
405 namespace llvm {
406 
407 /// InnerLoopVectorizer vectorizes loops which contain only one basic
408 /// block to a specified vectorization factor (VF).
409 /// This class performs the widening of scalars into vectors, or multiple
410 /// scalars. This class also implements the following features:
411 /// * It inserts an epilogue loop for handling loops that don't have iteration
412 /// counts that are known to be a multiple of the vectorization factor.
413 /// * It handles the code generation for reduction variables.
414 /// * Scalarization (implementation using scalars) of un-vectorizable
415 /// instructions.
416 /// InnerLoopVectorizer does not perform any vectorization-legality
417 /// checks, and relies on the caller to check for the different legality
418 /// aspects. The InnerLoopVectorizer relies on the
419 /// LoopVectorizationLegality class to provide information about the induction
420 /// and reduction variables that were found to a given vectorization factor.
421 class InnerLoopVectorizer {
422 public:
423  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
424  LoopInfo *LI, DominatorTree *DT,
425  const TargetLibraryInfo *TLI,
426  const TargetTransformInfo *TTI, AssumptionCache *AC,
427  OptimizationRemarkEmitter *ORE, unsigned VecWidth,
428  unsigned UnrollFactor, LoopVectorizationLegality *LVL,
429  LoopVectorizationCostModel *CM)
430  : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
431  AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
432  Builder(PSE.getSE()->getContext()),
433  VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
434  virtual ~InnerLoopVectorizer() = default;
435 
436  /// Create a new empty loop. Unlink the old loop and connect the new one.
437  /// Return the pre-header block of the new loop.
439 
440  /// Widen a single instruction within the innermost loop.
442 
443  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
444  void fixVectorizedLoop();
445 
446  // Return true if any runtime check is added.
448 
449  /// A type for vectorized values in the new loop. Each value from the
450  /// original loop, when vectorized, is represented by UF vector values in the
451  /// new unrolled loop, where UF is the unroll factor.
453 
454  /// Vectorize a single PHINode in a block. This method handles the induction
455  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
456  /// arbitrary length vectors.
457  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
458 
459  /// A helper function to scalarize a single Instruction in the innermost loop.
460  /// Generates a sequence of scalar instances for each lane between \p MinLane
461  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
462  /// inclusive.
463  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
464  bool IfPredicateInstr);
465 
466  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
467  /// is provided, the integer induction variable will first be truncated to
468  /// the corresponding type.
469  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
470 
471  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
472  /// vector or scalar value on-demand if one is not yet available. When
473  /// vectorizing a loop, we visit the definition of an instruction before its
474  /// uses. When visiting the definition, we either vectorize or scalarize the
475  /// instruction, creating an entry for it in the corresponding map. (In some
476  /// cases, such as induction variables, we will create both vector and scalar
477  /// entries.) Then, as we encounter uses of the definition, we derive values
478  /// for each scalar or vector use unless such a value is already available.
479  /// For example, if we scalarize a definition and one of its uses is vector,
480  /// we build the required vector on-demand with an insertelement sequence
481  /// when visiting the use. Otherwise, if the use is scalar, we can use the
482  /// existing scalar definition.
483  ///
484  /// Return a value in the new loop corresponding to \p V from the original
485  /// loop at unroll index \p Part. If the value has already been vectorized,
486  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
487  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
488  /// a new vector value on-demand by inserting the scalar values into a vector
489  /// with an insertelement sequence. If the value has been neither vectorized
490  /// nor scalarized, it must be loop invariant, so we simply broadcast the
491  /// value into a vector.
492  Value *getOrCreateVectorValue(Value *V, unsigned Part);
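  // For illustration (hypothetical IR, not from the original source): if an
  // address computation %p was scalarized into lanes %p.0 .. %p.3 and a later
  // user needs it as a vector, the vector value is assembled on demand with an
  // insertelement sequence, e.g.
  //   %v.0 = insertelement <4 x i32*> undef, i32* %p.0, i32 0
  //   ...
  //   %v.3 = insertelement <4 x i32*> %v.2, i32* %p.3, i32 3
  // and then cached in VectorLoopValueMap for subsequent uses.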
493 
494  /// Return a value in the new loop corresponding to \p V from the original
495  /// loop at unroll and vector indices \p Instance. If the value has been
496  /// vectorized but not scalarized, the necessary extractelement instruction
497  /// will be generated.
498  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
499 
500  /// Construct the vector value of a scalarized value \p V one lane at a time.
501  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
502 
503  /// Try to vectorize the interleaved access group that \p Instr belongs to.
505 
506  /// Vectorize Load and Store instructions, optionally masking the vector
507  /// operations if \p BlockInMask is non-null.
509  VectorParts *BlockInMask = nullptr);
510 
511  /// \brief Set the debug location in the builder using the debug location in
512  /// the instruction.
513  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
514 
515 protected:
517 
518  /// A small list of PHINodes.
520 
521  /// A type for scalarized values in the new loop. Each value from the
522  /// original loop, when scalarized, is represented by UF x VF scalar values
523  /// in the new unrolled loop, where UF is the unroll factor and VF is the
524  /// vectorization factor.
526 
527  /// Set up the values of the IVs correctly when exiting the vector loop.
528  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
529  Value *CountRoundDown, Value *EndValue,
530  BasicBlock *MiddleBlock);
531 
532  /// Create a new induction variable inside L.
534  Value *Step, Instruction *DL);
535 
536  /// Handle all cross-iteration phis in the header.
537  void fixCrossIterationPHIs();
538 
539  /// Fix a first-order recurrence. This is the second phase of vectorizing
540  /// this phi node.
541  void fixFirstOrderRecurrence(PHINode *Phi);
542 
543  /// Fix a reduction cross-iteration phi. This is the second phase of
544  /// vectorizing this phi node.
545  void fixReduction(PHINode *Phi);
546 
547  /// \brief The Loop exit block may have single value PHI nodes with some
548  /// incoming value. While vectorizing we only handled real values
549  /// that were defined inside the loop and we should have one value for
550  /// each predecessor of its parent basic block. See PR14725.
551  void fixLCSSAPHIs();
552 
553  /// Iteratively sink the scalarized operands of a predicated instruction into
554  /// the block that was created for it.
555  void sinkScalarOperands(Instruction *PredInst);
556 
557  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
558  /// represented as.
560 
561  /// Insert the new loop to the loop hierarchy and pass manager
562  /// and update the analysis passes.
563  void updateAnalysis();
564 
565  /// Create a broadcast instruction. This method generates a broadcast
566  /// instruction (shuffle) for loop invariant values and for the induction
567  /// value. If this is the induction variable then we extend it to N, N+1, ...
568  /// this is needed because each iteration in the loop corresponds to a SIMD
569  /// element.
570  virtual Value *getBroadcastInstrs(Value *V);
571 
572  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
573  /// to each vector element of Val. The sequence starts at StartIdx.
574  /// \p Opcode is relevant for FP induction variable.
575  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
576  Instruction::BinaryOps Opcode =
577  Instruction::BinaryOpsEnd);
578 
579  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
580  /// variable on which to base the steps, \p Step is the size of the step, and
581  /// \p EntryVal is the value from the original loop that maps to the steps.
582  /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
583  /// can be a truncate instruction).
584  void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal,
585  const InductionDescriptor &ID);
586 
587  /// Create a vector induction phi node based on an existing scalar one. \p
588  /// EntryVal is the value from the original loop that maps to the vector phi
589  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
590  /// truncate instruction, instead of widening the original IV, we widen a
591  /// version of the IV truncated to \p EntryVal's type.
593  Value *Step, Instruction *EntryVal);
594 
595  /// Returns true if an instruction \p I should be scalarized instead of
596  /// vectorized for the chosen vectorization factor.
598 
599  /// Returns true if we should generate a scalar version of \p IV.
600  bool needsScalarInduction(Instruction *IV) const;
601 
602  /// Generate a shuffle sequence that will reverse the vector Vec.
603  virtual Value *reverseVector(Value *Vec);
604 
605  /// Returns (and creates if needed) the original loop trip count.
606  Value *getOrCreateTripCount(Loop *NewLoop);
607 
608  /// Returns (and creates if needed) the trip count of the widened loop.
610 
611  /// Returns a bitcasted value to the requested vector type.
612  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
614  const DataLayout &DL);
615 
616  /// Emit a bypass check to see if the vector trip count is zero, including if
617  /// it overflows.
619 
620  /// Emit a bypass check to see if all of the SCEV assumptions we've
621  /// had to make are correct.
622  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
623 
624  /// Emit bypass checks to check any memory assumptions we may have made.
625  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
626 
627  /// Add additional metadata to \p To that was not present on \p Orig.
628  ///
629  /// Currently this is used to add the noalias annotations based on the
630  /// inserted memchecks. Use this for instructions that are *cloned* into the
631  /// vector loop.
632  void addNewMetadata(Instruction *To, const Instruction *Orig);
633 
634  /// Add metadata from one instruction to another.
635  ///
636  /// This includes both the original MDs from \p From and additional ones (\see
637  /// addNewMetadata). Use this for *newly created* instructions in the vector
638  /// loop.
639  void addMetadata(Instruction *To, Instruction *From);
640 
641  /// \brief Similar to the previous function but it adds the metadata to a
642  /// vector of instructions.
643  void addMetadata(ArrayRef<Value *> To, Instruction *From);
644 
645  /// The original loop.
647 
648  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
649  /// dynamic knowledge to simplify SCEV expressions and converts them to a
650  /// more usable form.
652 
653  /// Loop Info.
655 
656  /// Dominator Tree.
658 
659  /// Alias Analysis.
661 
662  /// Target Library Info.
664 
665  /// Target Transform Info.
667 
668  /// Assumption Cache.
670 
671  /// Interface to emit optimization remarks.
673 
674  /// \brief LoopVersioning. It's only set up (non-null) if memchecks were
675  /// used.
676  ///
677  /// This is currently only used to add no-alias metadata based on the
678  /// memchecks. The actual versioning is performed manually.
679  std::unique_ptr<LoopVersioning> LVer;
680 
681  /// The vectorization SIMD factor to use. Each vector will have this many
682  /// vector elements.
683  unsigned VF;
684 
685  /// The vectorization unroll factor to use. Each scalar is vectorized to this
686  /// many different vector instructions.
687  unsigned UF;
688 
689  /// The builder that we use
691 
692  // --- Vectorization state ---
693 
694  /// The vector-loop preheader.
696 
697  /// The scalar-loop preheader.
699 
700  /// Middle Block between the vector and the scalar.
702 
703  /// The ExitBlock of the scalar loop.
705 
706  /// The vector loop body.
708 
709  /// The scalar loop body.
711 
712  /// A list of all bypass blocks. The first block is the entry of the loop.
714 
715  /// The new Induction variable which was added to the new block.
716  PHINode *Induction = nullptr;
717 
718  /// The induction variable of the old basic block.
719  PHINode *OldInduction = nullptr;
720 
721  /// Maps values from the original loop to their corresponding values in the
722  /// vectorized loop. A key value can map to either vector values, scalar
723  /// values or both kinds of values, depending on whether the key was
724  /// vectorized and scalarized.
726 
727  /// Store instructions that were predicated.
729 
730  /// Trip count of the original loop.
731  Value *TripCount = nullptr;
732 
733  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
734  Value *VectorTripCount = nullptr;
735 
736  /// The legality analysis.
737  LoopVectorizationLegality *Legal;
738 
739  /// The profitability analysis.
740  LoopVectorizationCostModel *Cost;
741 
742  // Record whether runtime checks are added.
743  bool AddedSafetyChecks = false;
744 
745  // Holds the end values for each induction variable. We save the end values
746  // so we can later fix-up the external users of the induction variables.
748 };
749 
750 class InnerLoopUnroller : public InnerLoopVectorizer {
751 public:
752  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
753  LoopInfo *LI, DominatorTree *DT,
754  const TargetLibraryInfo *TLI,
755  const TargetTransformInfo *TTI, AssumptionCache *AC,
756  OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
757  LoopVectorizationLegality *LVL,
758  LoopVectorizationCostModel *CM)
759  : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
760  UnrollFactor, LVL, CM) {}
761 
762 private:
763  Value *getBroadcastInstrs(Value *V) override;
764  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
765  Instruction::BinaryOps Opcode =
766  Instruction::BinaryOpsEnd) override;
767  Value *reverseVector(Value *Vec) override;
768 };
769 
770 } // end namespace llvm
771 
772 /// \brief Look for a meaningful debug location on the instruction or its
773 /// operands.
774 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
775  if (!I)
776  return I;
777 
778  DebugLoc Empty;
779  if (I->getDebugLoc() != Empty)
780  return I;
781 
782  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
783  if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
784  if (OpInst->getDebugLoc() != Empty)
785  return OpInst;
786  }
787 
788  return I;
789 }
790 
791 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
792  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
793  const DILocation *DIL = Inst->getDebugLoc();
794  if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
795  !isa<DbgInfoIntrinsic>(Inst))
796  B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF));
797  else
798  B.SetCurrentDebugLocation(DIL);
799  } else
800  B.SetCurrentDebugLocation(DebugLoc());
801 }
802 
803 #ifndef NDEBUG
804 /// \return string containing a file name and a line # for the given loop.
805 static std::string getDebugLocString(const Loop *L) {
806  std::string Result;
807  if (L) {
808  raw_string_ostream OS(Result);
809  if (const DebugLoc LoopDbgLoc = L->getStartLoc())
810  LoopDbgLoc.print(OS);
811  else
812  // Just print the module name.
813  OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
814  OS.flush();
815  }
816  return Result;
817 }
818 #endif
819 
820 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
821  const Instruction *Orig) {
822  // If the loop was versioned with memchecks, add the corresponding no-alias
823  // metadata.
824  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
825  LVer->annotateInstWithNoAlias(To, Orig);
826 }
827 
828 void InnerLoopVectorizer::addMetadata(Instruction *To,
829  Instruction *From) {
830  propagateMetadata(To, From);
831  addNewMetadata(To, From);
832 }
833 
834 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
835  Instruction *From) {
836  for (Value *V : To) {
837  if (Instruction *I = dyn_cast<Instruction>(V))
838  addMetadata(I, From);
839  }
840 }
841 
842 namespace {
843 
844 /// \brief The group of interleaved loads/stores sharing the same stride and
845 /// close to each other.
846 ///
847 /// Each member in this group has an index starting from 0, and the largest
848 /// index should be less than the interleave factor, which is equal to the absolute
849 /// value of the access's stride.
850 ///
851 /// E.g. An interleaved load group of factor 4:
852 /// for (unsigned i = 0; i < 1024; i+=4) {
853 /// a = A[i]; // Member of index 0
854 /// b = A[i+1]; // Member of index 1
855 /// d = A[i+3]; // Member of index 3
856 /// ...
857 /// }
858 ///
859 /// An interleaved store group of factor 4:
860 /// for (unsigned i = 0; i < 1024; i+=4) {
861 /// ...
862 /// A[i] = a; // Member of index 0
863 /// A[i+1] = b; // Member of index 1
864 /// A[i+2] = c; // Member of index 2
865 /// A[i+3] = d; // Member of index 3
866 /// }
867 ///
868 /// Note: the interleaved load group could have gaps (missing members), but
869 /// the interleaved store group doesn't allow gaps.
870 class InterleaveGroup {
871 public:
872  InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
873  : Align(Align), InsertPos(Instr) {
874  assert(Align && "The alignment should be non-zero");
875 
876  Factor = std::abs(Stride);
877  assert(Factor > 1 && "Invalid interleave factor");
878 
879  Reverse = Stride < 0;
880  Members[0] = Instr;
881  }
882 
883  bool isReverse() const { return Reverse; }
884  unsigned getFactor() const { return Factor; }
885  unsigned getAlignment() const { return Align; }
886  unsigned getNumMembers() const { return Members.size(); }
887 
888  /// \brief Try to insert a new member \p Instr with index \p Index and
889  /// alignment \p NewAlign. The index is related to the leader and it could be
890  /// negative if it is the new leader.
891  ///
892  /// \returns false if the instruction doesn't belong to the group.
893  bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
894  assert(NewAlign && "The new member's alignment should be non-zero");
895 
896  int Key = Index + SmallestKey;
897 
898  // Skip if there is already a member with the same index.
899  if (Members.count(Key))
900  return false;
901 
902  if (Key > LargestKey) {
903  // The largest index is always less than the interleave factor.
904  if (Index >= static_cast<int>(Factor))
905  return false;
906 
907  LargestKey = Key;
908  } else if (Key < SmallestKey) {
909  // The largest index is always less than the interleave factor.
910  if (LargestKey - Key >= static_cast<int>(Factor))
911  return false;
912 
913  SmallestKey = Key;
914  }
915 
916  // It's always safe to select the minimum alignment.
917  Align = std::min(Align, NewAlign);
918  Members[Key] = Instr;
919  return true;
920  }
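
  // For illustration (a worked sketch, not in the original source): suppose a
  // group with Factor == 4 was created around leader %a, so SmallestKey ==
  // LargestKey == 0. Calling insertMember(%b, /*Index=*/-1, Align) computes
  // Key == -1, takes the Key < SmallestKey branch (LargestKey - Key == 1 < 4),
  // and records %b with SmallestKey == -1. Afterwards getIndex(%b) == 0 and
  // getIndex(%a) == 1, i.e. %b has become the new leader.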
921 
922  /// \brief Get the member with the given index \p Index
923  ///
924  /// \returns nullptr if the group contains no such member.
925  Instruction *getMember(unsigned Index) const {
926  int Key = SmallestKey + Index;
927  if (!Members.count(Key))
928  return nullptr;
929 
930  return Members.find(Key)->second;
931  }
932 
933  /// \brief Get the index for the given member. Unlike the key in the member
934  /// map, the index starts from 0.
935  unsigned getIndex(Instruction *Instr) const {
936  for (auto I : Members)
937  if (I.second == Instr)
938  return I.first - SmallestKey;
939 
940  llvm_unreachable("InterleaveGroup contains no such member");
941  }
942 
943  Instruction *getInsertPos() const { return InsertPos; }
944  void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
945 
946 private:
947  unsigned Factor; // Interleave Factor.
948  bool Reverse;
949  unsigned Align;
950  DenseMap<int, Instruction *> Members;
951  int SmallestKey = 0;
952  int LargestKey = 0;
953 
954  // To avoid breaking dependences, vectorized instructions of an interleave
955  // group should be inserted at either the first load or the last store in
956  // program order.
957  //
958  // E.g. %even = load i32 // Insert Position
959  // %add = add i32 %even // Use of %even
960  // %odd = load i32
961  //
962  // store i32 %even
963  // %odd = add i32 // Def of %odd
964  // store i32 %odd // Insert Position
965  Instruction *InsertPos;
966 };
967 
968 /// \brief Drive the analysis of interleaved memory accesses in the loop.
969 ///
970 /// Use this class to analyze interleaved accesses only when we can vectorize
971 /// a loop. Otherwise it's meaningless to do analysis as the vectorization
972 /// on interleaved accesses is unsafe.
973 ///
974 /// The analysis collects interleave groups and records the relationships
975 /// between the member and the group in a map.
976 class InterleavedAccessInfo {
977 public:
978  InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
979  DominatorTree *DT, LoopInfo *LI)
980  : PSE(PSE), TheLoop(L), DT(DT), LI(LI) {}
981 
982  ~InterleavedAccessInfo() {
983  SmallSet<InterleaveGroup *, 4> DelSet;
984  // Avoid releasing a pointer twice.
985  for (auto &I : InterleaveGroupMap)
986  DelSet.insert(I.second);
987  for (auto *Ptr : DelSet)
988  delete Ptr;
989  }
990 
991  /// \brief Analyze the interleaved accesses and collect them in interleave
992  /// groups. Substitute symbolic strides using \p Strides.
993  void analyzeInterleaving(const ValueToValueMap &Strides);
994 
995  /// \brief Check if \p Instr belongs to any interleave group.
996  bool isInterleaved(Instruction *Instr) const {
997  return InterleaveGroupMap.count(Instr);
998  }
999 
1000  /// \brief Get the interleave group that \p Instr belongs to.
1001  ///
1002  /// \returns nullptr if \p Instr doesn't belong to any interleave group.
1003  InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
1004  if (InterleaveGroupMap.count(Instr))
1005  return InterleaveGroupMap.find(Instr)->second;
1006  return nullptr;
1007  }
1008 
1009  /// \brief Returns true if an interleaved group that may access memory
1010  /// out-of-bounds requires a scalar epilogue iteration for correctness.
1011  bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
1012 
1013  /// \brief Initialize the LoopAccessInfo used for dependence checking.
1014  void setLAI(const LoopAccessInfo *Info) { LAI = Info; }
1015 
1016 private:
1017  /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
1018  /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
1019  /// The interleaved access analysis can also add new predicates (for example
1020  /// by versioning strides of pointers).
1021  PredicatedScalarEvolution &PSE;
1022 
1023  Loop *TheLoop;
1024  DominatorTree *DT;
1025  LoopInfo *LI;
1026  const LoopAccessInfo *LAI = nullptr;
1027 
1028  /// True if the loop may contain non-reversed interleaved groups with
1029  /// out-of-bounds accesses. We ensure we don't speculatively access memory
1030  /// out-of-bounds by executing at least one scalar epilogue iteration.
1031  bool RequiresScalarEpilogue = false;
1032 
1033  /// Holds the relationships between the members and the interleave group.
1034  DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
1035 
1036  /// Holds dependences among the memory accesses in the loop. It maps a source
1037  /// access to a set of dependent sink accesses.
1038  DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;
1039 
1040  /// \brief The descriptor for a strided memory access.
1041  struct StrideDescriptor {
1042  StrideDescriptor() = default;
1043  StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
1044  unsigned Align)
1045  : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
1046 
1047  // The access's stride. It is negative for a reverse access.
1048  int64_t Stride = 0;
1049 
1050  // The scalar expression of this access.
1051  const SCEV *Scev = nullptr;
1052 
1053  // The size of the memory object.
1054  uint64_t Size = 0;
1055 
1056  // The alignment of this access.
1057  unsigned Align = 0;
1058  };
1059 
1060  /// \brief A type for holding instructions and their stride descriptors.
1061  using StrideEntry = std::pair<Instruction *, StrideDescriptor>;
1062 
1063  /// \brief Create a new interleave group with the given instruction \p Instr,
1064  /// stride \p Stride and alignment \p Align.
1065  ///
1066  /// \returns the newly created interleave group.
1067  InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,
1068  unsigned Align) {
1069  assert(!InterleaveGroupMap.count(Instr) &&
1070  "Already in an interleaved access group");
1071  InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align);
1072  return InterleaveGroupMap[Instr];
1073  }
1074 
1075  /// \brief Release the group and remove all the relationships.
1076  void releaseGroup(InterleaveGroup *Group) {
1077  for (unsigned i = 0; i < Group->getFactor(); i++)
1078  if (Instruction *Member = Group->getMember(i))
1079  InterleaveGroupMap.erase(Member);
1080 
1081  delete Group;
1082  }
1083 
1084  /// \brief Collect all the accesses with a constant stride in program order.
1085  void collectConstStrideAccesses(
1086  MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
1087  const ValueToValueMap &Strides);
1088 
1089  /// \brief Returns true if \p Stride is allowed in an interleaved group.
1090  static bool isStrided(int Stride) {
1091  unsigned Factor = std::abs(Stride);
1092  return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
1093  }
1094 
1095  /// \brief Returns true if \p BB is a predicated block.
1096  bool isPredicated(BasicBlock *BB) const {
1097  return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
1098  }
1099 
1100  /// \brief Returns true if LoopAccessInfo can be used for dependence queries.
1101  bool areDependencesValid() const {
1102  return LAI && LAI->getDepChecker().getDependences();
1103  }
1104 
1105  /// \brief Returns true if memory accesses \p A and \p B can be reordered, if
1106  /// necessary, when constructing interleaved groups.
1107  ///
1108  /// \p A must precede \p B in program order. We return false if reordering is
1109  /// not necessary or is prevented because \p A and \p B may be dependent.
1110  bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
1111  StrideEntry *B) const {
1112  // Code motion for interleaved accesses can potentially hoist strided loads
1113  // and sink strided stores. The code below checks the legality of the
1114  // following two conditions:
1115  //
1116  // 1. Potentially moving a strided load (B) before any store (A) that
1117  // precedes B, or
1118  //
1119  // 2. Potentially moving a strided store (A) after any load or store (B)
1120  // that A precedes.
1121  //
1122  // It's legal to reorder A and B if we know there isn't a dependence from A
1123  // to B. Note that this determination is conservative since some
1124  // dependences could potentially be reordered safely.
1125 
1126  // A is potentially the source of a dependence.
1127  auto *Src = A->first;
1128  auto SrcDes = A->second;
1129 
1130  // B is potentially the sink of a dependence.
1131  auto *Sink = B->first;
1132  auto SinkDes = B->second;
1133 
1134  // Code motion for interleaved accesses can't violate WAR dependences.
1135  // Thus, reordering is legal if the source isn't a write.
1136  if (!Src->mayWriteToMemory())
1137  return true;
1138 
1139  // At least one of the accesses must be strided.
1140  if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
1141  return true;
1142 
1143  // If dependence information is not available from LoopAccessInfo,
1144  // conservatively assume the instructions can't be reordered.
1145  if (!areDependencesValid())
1146  return false;
1147 
1148  // If we know there is a dependence from source to sink, assume the
1149  // instructions can't be reordered. Otherwise, reordering is legal.
1150  return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);
1151  }
1152 
1153  /// \brief Collect the dependences from LoopAccessInfo.
1154  ///
1155  /// We process the dependences once during the interleaved access analysis to
1156  /// enable constant-time dependence queries.
1157  void collectDependences() {
1158  if (!areDependencesValid())
1159  return;
1160  auto *Deps = LAI->getDepChecker().getDependences();
1161  for (auto Dep : *Deps)
1162  Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
1163  }
1164 };
1165 
1166 /// Utility class for getting and setting loop vectorizer hints in the form
1167 /// of loop metadata.
1168 /// This class keeps a number of loop annotations locally (as member variables)
1169 /// and can, upon request, write them back as metadata on the loop. It will
1170 /// initially scan the loop for existing metadata, and will update the local
1171 /// values based on information in the loop.
1172 /// We cannot write all values to metadata, as the mere presence of some info,
1173 /// for example 'force', means a decision has been made. So, we need to be
1174 /// careful NOT to add them if the user hasn't specifically asked for it.
1175 class LoopVectorizeHints {
1176  enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED };
1177 
1178  /// Hint - associates name and validation with the hint value.
1179  struct Hint {
1180  const char *Name;
1181  unsigned Value; // This may have to change for non-numeric values.
1182  HintKind Kind;
1183 
1184  Hint(const char *Name, unsigned Value, HintKind Kind)
1185  : Name(Name), Value(Value), Kind(Kind) {}
1186 
1187  bool validate(unsigned Val) {
1188  switch (Kind) {
1189  case HK_WIDTH:
1190  return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
1191  case HK_UNROLL:
1192  return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
1193  case HK_FORCE:
1194  return (Val <= 1);
1195  case HK_ISVECTORIZED:
1196  return (Val==0 || Val==1);
1197  }
1198  return false;
1199  }
1200  };
1201 
1202  /// Vectorization width.
1203  Hint Width;
1204 
1205  /// Vectorization interleave factor.
1206  Hint Interleave;
1207 
1208  /// Vectorization forced
1209  Hint Force;
1210 
1211  /// Already Vectorized
1212  Hint IsVectorized;
1213 
1214  /// Return the loop metadata prefix.
1215  static StringRef Prefix() { return "llvm.loop."; }
1216 
1217  /// True if there is any unsafe math in the loop.
1218  bool PotentiallyUnsafe = false;
1219 
1220 public:
1221  enum ForceKind {
1222  FK_Undefined = -1, ///< Not selected.
1223  FK_Disabled = 0, ///< Forcing disabled.
1224  FK_Enabled = 1, ///< Forcing enabled.
1225  };
1226 
1227  LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
1228  OptimizationRemarkEmitter &ORE)
1229  : Width("vectorize.width", VectorizerParams::VectorizationFactor,
1230  HK_WIDTH),
1231  Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
1232  Force("vectorize.enable", FK_Undefined, HK_FORCE),
1233  IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) {
1234  // Populate values with existing loop metadata.
1235  getHintsFromMetadata();
1236 
1237  // force-vector-interleave overrides DisableInterleaving.
1238  if (VectorizerParams::isInterleaveForced())
1239  Interleave.Value = VectorizerParams::VectorizationInterleave;
1240 
1241  if (IsVectorized.Value != 1)
1242  // If the vectorization width and interleaving count are both 1 then
1243  // consider the loop to have been already vectorized because there's
1244  // nothing more that we can do.
1245  IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
1246  DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
1247  << "LV: Interleaving disabled by the pass manager\n");
1248  }
1249 
1250  /// Mark the loop L as already vectorized by writing the isvectorized hint.
1251  void setAlreadyVectorized() {
1252  IsVectorized.Value = 1;
1253  Hint Hints[] = {IsVectorized};
1254  writeHintsToMetadata(Hints);
1255  }
1256 
1257  bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
1258  if (getForce() == LoopVectorizeHints::FK_Disabled) {
1259  DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
1260  emitRemarkWithHints();
1261  return false;
1262  }
1263 
1264  if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
1265  DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
1266  emitRemarkWithHints();
1267  return false;
1268  }
1269 
1270  if (getIsVectorized() == 1) {
1271  DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
1272  // FIXME: Add interleave.disable metadata. This will allow
1273  // vectorize.disable to be used without disabling the pass and errors
1274  // to differentiate between disabled vectorization and a width of 1.
1275  ORE.emit([&]() {
1276  return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
1277  "AllDisabled", L->getStartLoc(),
1278  L->getHeader())
1279  << "loop not vectorized: vectorization and interleaving are "
1280  "explicitly disabled, or the loop has already been "
1281  "vectorized";
1282  });
1283  return false;
1284  }
1285 
1286  return true;
1287  }
1288 
1289  /// Dumps all the hint information.
1290  void emitRemarkWithHints() const {
1291  using namespace ore;
1292 
1293  ORE.emit([&]() {
1294  if (Force.Value == LoopVectorizeHints::FK_Disabled)
1295  return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
1296  TheLoop->getStartLoc(),
1297  TheLoop->getHeader())
1298  << "loop not vectorized: vectorization is explicitly disabled";
1299  else {
1300  OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
1301  TheLoop->getStartLoc(),
1302  TheLoop->getHeader());
1303  R << "loop not vectorized";
1304  if (Force.Value == LoopVectorizeHints::FK_Enabled) {
1305  R << " (Force=" << NV("Force", true);
1306  if (Width.Value != 0)
1307  R << ", Vector Width=" << NV("VectorWidth", Width.Value);
1308  if (Interleave.Value != 0)
1309  R << ", Interleave Count="
1310  << NV("InterleaveCount", Interleave.Value);
1311  R << ")";
1312  }
1313  return R;
1314  }
1315  });
1316  }
1317 
1318  unsigned getWidth() const { return Width.Value; }
1319  unsigned getInterleave() const { return Interleave.Value; }
1320  unsigned getIsVectorized() const { return IsVectorized.Value; }
1321  enum ForceKind getForce() const { return (ForceKind)Force.Value; }
1322 
1323  /// \brief If hints are provided that force vectorization, use the AlwaysPrint
1324  /// pass name to force the frontend to print the diagnostic.
1325  const char *vectorizeAnalysisPassName() const {
1326  if (getWidth() == 1)
1327  return LV_NAME;
1328  if (getForce() == LoopVectorizeHints::FK_Disabled)
1329  return LV_NAME;
1330  if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
1331  return LV_NAME;
1332  return OptimizationRemarkAnalysis::AlwaysPrint;
1333  }
1334 
1335  bool allowReordering() const {
1336  // When loop hints that enable vectorization are provided, we allow the
1337  // vectorizer to change the order of operations given by the scalar loop.
1338  // This is not enabled by default because it can be unsafe or inefficient. For example,
1339  // reordering floating-point operations will change the way round-off
1340  // error accumulates in the loop.
1341  return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
1342  }
1343 
1344  bool isPotentiallyUnsafe() const {
1345  // Avoid FP vectorization if the target is unsure about proper support.
1346  // This may be related to the SIMD unit in the target not handling
1347  // IEEE 754 FP ops properly, or bad single-to-double promotions.
1348  // Otherwise, a sequence of vectorized loops, even without reduction,
1349  // could lead to different end results on the destination vectors.
1350  return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe;
1351  }
1352 
1353  void setPotentiallyUnsafe() { PotentiallyUnsafe = true; }
1354 
1355 private:
1356  /// Find hints specified in the loop metadata and update local values.
1357  void getHintsFromMetadata() {
1358  MDNode *LoopID = TheLoop->getLoopID();
1359  if (!LoopID)
1360  return;
1361 
1362  // First operand should refer to the loop id itself.
1363  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
1364  assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
1365 
1366  for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
1367  const MDString *S = nullptr;
1368  SmallVector<Metadata *, 4> Args;
1369 
1370  // The expected hint is either a MDString or a MDNode with the first
1371  // operand a MDString.
1372  if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
1373  if (!MD || MD->getNumOperands() == 0)
1374  continue;
1375  S = dyn_cast<MDString>(MD->getOperand(0));
1376  for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
1377  Args.push_back(MD->getOperand(i));
1378  } else {
1379  S = dyn_cast<MDString>(LoopID->getOperand(i));
1380  assert(Args.size() == 0 && "too many arguments for MDString");
1381  }
1382 
1383  if (!S)
1384  continue;
1385 
1386  // Check if the hint starts with the loop metadata prefix.
1387  StringRef Name = S->getString();
1388  if (Args.size() == 1)
1389  setHint(Name, Args[0]);
1390  }
1391  }
1392 
1393  /// Checks string hint with one operand and set value if valid.
1394  void setHint(StringRef Name, Metadata *Arg) {
1395  if (!Name.startswith(Prefix()))
1396  return;
1397  Name = Name.substr(Prefix().size(), StringRef::npos);
1398 
1399  const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
1400  if (!C)
1401  return;
1402  unsigned Val = C->getZExtValue();
1403 
1404  Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized};
1405  for (auto H : Hints) {
1406  if (Name == H->Name) {
1407  if (H->validate(Val))
1408  H->Value = Val;
1409  else
1410  DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
1411  break;
1412  }
1413  }
1414  }
1415 
1416  /// Create a new hint from name / value pair.
1417  MDNode *createHintMetadata(StringRef Name, unsigned V) const {
1418  LLVMContext &Context = TheLoop->getHeader()->getContext();
1419  Metadata *MDs[] = {MDString::get(Context, Name),
1420  ConstantAsMetadata::get(
1421  ConstantInt::get(Type::getInt32Ty(Context), V))};
1422  return MDNode::get(Context, MDs);
1423  }
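
  // For example, createHintMetadata("llvm.loop.vectorize.width", 4) yields a
  // node of the form (illustrative):
  //   !{!"llvm.loop.vectorize.width", i32 4}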
1424 
1425  /// Matches metadata with hint name.
1426  bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
1427  MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
1428  if (!Name)
1429  return false;
1430 
1431  for (auto H : HintTypes)
1432  if (Name->getString().endswith(H.Name))
1433  return true;
1434  return false;
1435  }
1436 
1437  /// Sets current hints into loop metadata, keeping other values intact.
1438  void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
1439  if (HintTypes.empty())
1440  return;
1441 
1442  // Reserve the first element to LoopID (see below).
1443  SmallVector<Metadata *, 4> MDs(1);
1444  // If the loop already has metadata, then ignore the existing operands.
1445  MDNode *LoopID = TheLoop->getLoopID();
1446  if (LoopID) {
1447  for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
1448  MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
1449  // If node in update list, ignore old value.
1450  if (!matchesHintMetadataName(Node, HintTypes))
1451  MDs.push_back(Node);
1452  }
1453  }
1454 
1455  // Now, add the missing hints.
1456  for (auto H : HintTypes)
1457  MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
1458 
1459  // Replace current metadata node with new one.
1460  LLVMContext &Context = TheLoop->getHeader()->getContext();
1461  MDNode *NewLoopID = MDNode::get(Context, MDs);
1462  // Set operand 0 to refer to the loop id itself.
1463  NewLoopID->replaceOperandWith(0, NewLoopID);
1464 
1465  TheLoop->setLoopID(NewLoopID);
1466  }
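
  // For illustration (assumed IR shape, not from the original source): after
  // setAlreadyVectorized() the loop's !llvm.loop metadata looks like
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"llvm.loop.isvectorized", i32 1}
  // where operand 0 of !0 refers back to !0 itself, as arranged above.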
1467 
1468  /// The loop these hints belong to.
1469  const Loop *TheLoop;
1470 
1471  /// Interface to emit optimization remarks.
1472  OptimizationRemarkEmitter &ORE;
1473 };
1474 
1475 } // end anonymous namespace
1476 
1477 static void emitMissedWarning(Function *F, Loop *L,
1478  const LoopVectorizeHints &LH,
1479  OptimizationRemarkEmitter *ORE) {
1480  LH.emitRemarkWithHints();
1481 
1482  if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
1483  if (LH.getWidth() != 1)
1484  ORE->emit(DiagnosticInfoOptimizationFailure(
1485  DEBUG_TYPE, "FailedRequestedVectorization",
1486  L->getStartLoc(), L->getHeader())
1487  << "loop not vectorized: "
1488  << "failed explicitly specified loop vectorization");
1489  else if (LH.getInterleave() != 1)
1490  ORE->emit(DiagnosticInfoOptimizationFailure(
1491  DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(),
1492  L->getHeader())
1493  << "loop not interleaved: "
1494  << "failed explicitly specified loop interleaving");
1495  }
1496 }
1497 
1498 namespace {
1499 
1500 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
1501 /// to what vectorization factor.
1502 /// This class does not look at the profitability of vectorization, only the
1503 /// legality. This class has two main kinds of checks:
1504 /// * Memory checks - The code in canVectorizeMemory checks if vectorization
1505 /// will change the order of memory accesses in a way that will change the
1506 /// correctness of the program.
1507 /// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
1508 /// checks for a number of different conditions, such as the availability of a
1509 /// single induction variable, that all types are supported and vectorizable,
1510 /// etc. This code reflects the capabilities of InnerLoopVectorizer.
1511 /// This class is also used by InnerLoopVectorizer for identifying
1512 /// induction variable and the different reduction variables.
1513 class LoopVectorizationLegality {
1514 public:
1515  LoopVectorizationLegality(
1516  Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
1517  TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
1518  const TargetTransformInfo *TTI,
1519  std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
1520  OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
1521  LoopVectorizeHints *H)
1522  : TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), GetLAA(GetLAA),
1523  ORE(ORE), InterleaveInfo(PSE, L, DT, LI), Requirements(R), Hints(H) {}
1524 
1525  /// ReductionList contains the reduction descriptors for all
1526  /// of the reductions that were found in the loop.
1527  using ReductionList = DenseMap<PHINode *, RecurrenceDescriptor>;
1528 
1529  /// InductionList saves induction variables and maps them to the
1530  /// induction descriptor.
1531  using InductionList = MapVector<PHINode *, InductionDescriptor>;
1532 
1533  /// RecurrenceSet contains the phi nodes that are recurrences other than
1534  /// inductions and reductions.
1535  using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;
1536 
1537  /// Returns true if it is legal to vectorize this loop.
1538  /// This does not mean that it is profitable to vectorize this
1539  /// loop, only that it is legal to do so.
1540  bool canVectorize();
1541 
1542  /// Returns the primary induction variable.
1543  PHINode *getPrimaryInduction() { return PrimaryInduction; }
1544 
1545  /// Returns the reduction variables found in the loop.
1546  ReductionList *getReductionVars() { return &Reductions; }
1547 
1548  /// Returns the induction variables found in the loop.
1549  InductionList *getInductionVars() { return &Inductions; }
1550 
1551  /// Return the first-order recurrences found in the loop.
1552  RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; }
1553 
1554  /// Return the set of instructions to sink to handle first-order recurrences.
1555  DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; }
1556 
1557  /// Returns the widest induction type.
1558  Type *getWidestInductionType() { return WidestIndTy; }
1559 
1560  /// Returns True if V is an induction variable in this loop.
1561  bool isInductionVariable(const Value *V);
1562 
1563  /// Returns True if PN is a reduction variable in this loop.
1564  bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }
1565 
1566  /// Returns True if Phi is a first-order recurrence in this loop.
1567  bool isFirstOrderRecurrence(const PHINode *Phi);
1568 
1569  /// Return true if the block BB needs to be predicated in order for the loop
1570  /// to be vectorized.
1571  bool blockNeedsPredication(BasicBlock *BB);
1572 
1573  /// Check if this pointer is consecutive when vectorizing. This happens
1574  /// when the last index of the GEP is the induction variable, or when the
1575  /// pointer itself is an induction variable.
1576  /// This check allows us to vectorize A[idx] into a wide load/store.
1577  /// Returns:
1578  /// 0 - Stride is unknown or non-consecutive.
1579  /// 1 - Address is consecutive.
1580  /// -1 - Address is consecutive, and decreasing.
1581  int isConsecutivePtr(Value *Ptr);
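  // Illustrative sketch (not part of the original source): one way a client
  // could act on the tri-state result described above. The names below are
  // hypothetical and only assume a LoopVectorizationLegality object `Legal`
  // and a pointer operand `Ptr` in scope.
  //
  //   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
  //   bool IsConsecutive = ConsecutiveStride != 0;  // stride of 1 or -1
  //   bool IsReverse     = ConsecutiveStride == -1; // consecutive, decreasing
  //   if (!IsConsecutive) {
  //     // Unknown or non-unit stride: widen via gather/scatter or scalarize
  //     // instead of emitting a single wide load/store.
  //   }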
1582 
1583  /// Returns true if the value V is uniform within the loop.
1584  bool isUniform(Value *V);
1585 
1586  /// Returns the information that we collected about runtime memory check.
1587  const RuntimePointerChecking *getRuntimePointerChecking() const {
1588  return LAI->getRuntimePointerChecking();
1589  }
1590 
1591  const LoopAccessInfo *getLAI() const { return LAI; }
1592 
1593  /// \brief Check if \p Instr belongs to any interleaved access group.
1594  bool isAccessInterleaved(Instruction *Instr) {
1595  return InterleaveInfo.isInterleaved(Instr);
1596  }
1597 
1598  /// \brief Get the interleaved access group that \p Instr belongs to.
1599  const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
1600  return InterleaveInfo.getInterleaveGroup(Instr);
1601  }
1602 
1603  /// \brief Returns true if an interleaved group requires a scalar iteration
1604  /// to handle accesses with gaps.
1605  bool requiresScalarEpilogue() const {
1606  return InterleaveInfo.requiresScalarEpilogue();
1607  }
1608 
1609  unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
1610 
1611  uint64_t getMaxSafeRegisterWidth() const {
1612  return LAI->getDepChecker().getMaxSafeRegisterWidth();
1613  }
1614 
1615  bool hasStride(Value *V) { return LAI->hasStride(V); }
1616 
1617  /// Returns true if the target machine supports masked store operation
1618  /// for the given \p DataType and kind of access to \p Ptr.
1619  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
1620  return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
1621  }
1622 
1623  /// Returns true if the target machine supports masked load operation
1624  /// for the given \p DataType and kind of access to \p Ptr.
1625  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
1626  return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
1627  }
1628 
1629  /// Returns true if the target machine supports masked scatter operation
1630  /// for the given \p DataType.
1631  bool isLegalMaskedScatter(Type *DataType) {
1632  return TTI->isLegalMaskedScatter(DataType);
1633  }
1634 
1635  /// Returns true if the target machine supports masked gather operation
1636  /// for the given \p DataType.
1637  bool isLegalMaskedGather(Type *DataType) {
1638  return TTI->isLegalMaskedGather(DataType);
1639  }
1640 
1641  /// Returns true if the target machine can represent \p V as a masked gather
1642  /// or scatter operation.
1643  bool isLegalGatherOrScatter(Value *V) {
1644  auto *LI = dyn_cast<LoadInst>(V);
1645  auto *SI = dyn_cast<StoreInst>(V);
1646  if (!LI && !SI)
1647  return false;
1648  auto *Ptr = getPointerOperand(V);
1649  auto *Ty = cast<PointerType>(Ptr->getType())->getElementType();
1650  return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1651  }
1652 
1653  /// Returns true if the vector representation of the instruction \p I
1654  /// requires a mask.
1655  bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
1656 
1657  unsigned getNumStores() const { return LAI->getNumStores(); }
1658  unsigned getNumLoads() const { return LAI->getNumLoads(); }
1659  unsigned getNumPredStores() const { return NumPredStores; }
1660 
1661  /// Returns true if \p I is an instruction that will be scalarized with
1662  /// predication. Such instructions include conditional stores and
1663  /// instructions that may divide by zero.
1664  bool isScalarWithPredication(Instruction *I);
1665 
1666  /// Returns true if \p I is a memory instruction with consecutive memory
1667  /// access that can be widened.
1668  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1669 
1670  // Returns true if the NoNaN attribute is set on the function.
1671  bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
1672 
1673 private:
1674  /// Check if a single basic block loop is vectorizable.
1675  /// At this point we know that this is a loop with a constant trip count
1676  /// and we only need to check individual instructions.
1677  bool canVectorizeInstrs();
1678 
1679  /// When we vectorize loops we may change the order in which
1680  /// we read from and write to memory. This method checks if it is
1681  /// legal to vectorize the code, considering only memory constraints.
1682  /// Returns true if the loop is vectorizable.
1683  bool canVectorizeMemory();
1684 
1685  /// Return true if we can vectorize this loop using the IF-conversion
1686  /// transformation.
1687  bool canVectorizeWithIfConvert();
1688 
1689  /// Return true if all of the instructions in the block can be speculatively
1690  /// executed. \p SafePtrs is a list of addresses that are known to be legal
1691  /// and we know that we can read from them without segfault.
1692  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
1693 
1694  /// Updates the vectorization state by adding \p Phi to the inductions list.
1695  /// This can set \p Phi as the main induction of the loop if \p Phi is a
1696  /// better choice for the main induction than the existing one.
1697  void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
1698  SmallPtrSetImpl<Value *> &AllowedExit);
1699 
1700  /// Create an analysis remark that explains why vectorization failed
1701  ///
1702  /// \p RemarkName is the identifier for the remark. If \p I is passed it is
1703  /// an instruction that prevents vectorization. Otherwise the loop is used
1704  /// for the location of the remark. \return the remark object that can be
1705  /// streamed to.
1706  OptimizationRemarkAnalysis
1707  createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const {
1708  return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
1709  RemarkName, TheLoop, I);
1710  }
1711 
1712  /// \brief If an access has a symbolic stride, this maps the pointer value to
1713  /// the stride symbol.
1714  const ValueToValueMap *getSymbolicStrides() {
1715  // FIXME: Currently, the set of symbolic strides is sometimes queried before
1716  // it's collected. This happens from canVectorizeWithIfConvert, when the
1717  // pointer is checked to reference consecutive elements suitable for a
1718  // masked access.
1719  return LAI ? &LAI->getSymbolicStrides() : nullptr;
1720  }
1721 
1722  unsigned NumPredStores = 0;
1723 
1724  /// The loop that we evaluate.
1725  Loop *TheLoop;
1726 
1727  /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
1728  /// Applies dynamic knowledge to simplify SCEV expressions in the context
1729  /// of existing SCEV assumptions. The analysis will also add a minimal set
1730  /// of new predicates if this is required to enable vectorization and
1731  /// unrolling.
1732  PredicatedScalarEvolution &PSE;
1733 
1734  /// Target Library Info.
1735  TargetLibraryInfo *TLI;
1736 
1737  /// Target Transform Info
1738  const TargetTransformInfo *TTI;
1739 
1740  /// Dominator Tree.
1741  DominatorTree *DT;
1742 
1743  // LoopAccess analysis.
1744  std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
1745 
1746  // And the loop-accesses info corresponding to this loop. This pointer is
1747  // null until canVectorizeMemory sets it up.
1748  const LoopAccessInfo *LAI = nullptr;
1749 
1750  /// Interface to emit optimization remarks.
1751  OptimizationRemarkEmitter *ORE;
1752 
1753  /// The interleave access information contains groups of interleaved accesses
1754  /// with the same stride and close to each other.
1755  InterleavedAccessInfo InterleaveInfo;
1756 
1757  // --- vectorization state --- //
1758 
1759  /// Holds the primary induction variable. This is the counter of the
1760  /// loop.
1761  PHINode *PrimaryInduction = nullptr;
1762 
1763  /// Holds the reduction variables.
1764  ReductionList Reductions;
1765 
1766  /// Holds all of the induction variables that we found in the loop.
1767  /// Notice that inductions don't need to start at zero and that induction
1768  /// variables can be pointers.
1769  InductionList Inductions;
1770 
1771  /// Holds the phi nodes that are first-order recurrences.
1772  RecurrenceSet FirstOrderRecurrences;
1773 
1774  /// Holds instructions that need to sink past other instructions to handle
1775  /// first-order recurrences.
1776  DenseMap<Instruction *, Instruction *> SinkAfter;
1777 
1778  /// Holds the widest induction type encountered.
1779  Type *WidestIndTy = nullptr;
1780 
1781  /// Allowed outside users. This holds the induction and reduction
1782  /// vars which can be accessed from outside the loop.
1783  SmallPtrSet<Value *, 4> AllowedExit;
1784 
1785  /// Can we assume the absence of NaNs.
1786  bool HasFunNoNaNAttr = false;
1787 
1788  /// Vectorization requirements that will go through late-evaluation.
1789  LoopVectorizationRequirements *Requirements;
1790 
1791  /// Used to emit an analysis of any legality issues.
1792  LoopVectorizeHints *Hints;
1793 
1794  /// While vectorizing these instructions we have to generate a
1795  /// call to the appropriate masked intrinsic
1796  SmallPtrSet<const Instruction *, 8> MaskedOp;
1797 };
1798 
1799 /// LoopVectorizationCostModel - estimates the expected speedups due to
1800 /// vectorization.
1801 /// In many cases vectorization is not profitable. This can happen for a
1802 /// number of reasons. In this class we mainly attempt to predict the
1803 /// expected speedup/slowdowns due to the supported instruction set. We use the
1804 /// TargetTransformInfo to query the different backends for the cost of
1805 /// different operations.
1806 class LoopVectorizationCostModel {
1807 public:
1808  LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
1809  LoopInfo *LI, LoopVectorizationLegality *Legal,
1810  const TargetTransformInfo &TTI,
1811  const TargetLibraryInfo *TLI, DemandedBits *DB,
1812  AssumptionCache *AC,
1813  OptimizationRemarkEmitter *ORE, const Function *F,
1814  const LoopVectorizeHints *Hints)
1815  : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
1816  AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}
1817 
1818  /// \return An upper bound for the vectorization factor, or None if
1819  /// vectorization should be avoided up front.
1820  Optional<unsigned> computeMaxVF(bool OptForSize);
1821 
1822  /// Information about vectorization costs
1823  struct VectorizationFactor {
1824  // Vector width with best cost
1825  unsigned Width;
1826 
1827  // Cost of the loop with that width
1828  unsigned Cost;
1829  };
1830 
1831  /// \return The most profitable vectorization factor and the cost of that VF.
1832  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1833  /// then this vectorization factor will be selected if vectorization is
1834  /// possible.
1835  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
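  // Minimal sketch of the selection idea described above (illustrative only,
  // not the actual implementation): walk the powers of two up to MaxVF and
  // keep the width with the lowest cost per scalar iteration. `ExpectedCost`
  // stands in for the cost-model query and is hypothetical here.
  //
  //   VectorizationFactor pickVF(unsigned MaxVF,
  //                              function_ref<unsigned(unsigned)> ExpectedCost) {
  //     VectorizationFactor Best = {1, ExpectedCost(1)};
  //     for (unsigned VF = 2; VF <= MaxVF; VF *= 2) {
  //       unsigned C = ExpectedCost(VF);
  //       // Compare cost per lane: C / VF against Best.Cost / Best.Width.
  //       if (C * Best.Width < Best.Cost * VF)
  //         Best = {VF, C};
  //     }
  //     return Best;
  //   }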
1836 
1837  /// Setup cost-based decisions for user vectorization factor.
1838  void selectUserVectorizationFactor(unsigned UserVF) {
1839  collectUniformsAndScalars(UserVF);
1840  collectInstsToScalarize(UserVF);
1841  }
1842 
1843  /// \return The size (in bits) of the smallest and widest types in the code
1844  /// that needs to be vectorized. We ignore values that remain scalar such as
1845  /// 64 bit loop indices.
1846  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1847 
1848  /// \return The desired interleave count.
1849  /// If interleave count has been specified by metadata it will be returned.
1850  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1851  /// are the selected vectorization factor and the cost of the selected VF.
1852  unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
1853  unsigned LoopCost);
1854 
1855  /// A memory access instruction may be vectorized in more than one way.
1856  /// The form of the instruction after vectorization depends on its cost.
1857  /// This function takes cost-based decisions for Load/Store instructions
1858  /// and collects them in a map. This decision map is used for building
1859  /// the lists of loop-uniform and loop-scalar instructions.
1860  /// The calculated cost is saved with the widening decision in order to
1861  /// avoid redundant calculations.
1862  void setCostBasedWideningDecision(unsigned VF);
1863 
1864  /// \brief A struct that represents some properties of the register usage
1865  /// of a loop.
1866  struct RegisterUsage {
1867  /// Holds the number of loop invariant values that are used in the loop.
1868  unsigned LoopInvariantRegs;
1869 
1870  /// Holds the maximum number of concurrent live intervals in the loop.
1871  unsigned MaxLocalUsers;
1872 
1873  /// Holds the number of instructions in the loop.
1874  unsigned NumInstructions;
1875  };
1876 
1877  /// \return Returns information about the register usages of the loop for the
1878  /// given vectorization factors.
1879  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1880 
1881  /// Collect values we want to ignore in the cost model.
1882  void collectValuesToIgnore();
1883 
1884  /// \returns The smallest bitwidth each instruction can be represented with.
1885  /// The vector equivalents of these instructions should be truncated to this
1886  /// type.
1887  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1888  return MinBWs;
1889  }
1890 
1891  /// \returns True if it is more profitable to scalarize instruction \p I for
1892  /// vectorization factor \p VF.
1893  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1894  assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1895  auto Scalars = InstsToScalarize.find(VF);
1896  assert(Scalars != InstsToScalarize.end() &&
1897  "VF not yet analyzed for scalarization profitability");
1898  return Scalars->second.count(I);
1899  }
1900 
1901  /// Returns true if \p I is known to be uniform after vectorization.
1902  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1903  if (VF == 1)
1904  return true;
1905  assert(Uniforms.count(VF) && "VF not yet analyzed for uniformity");
1906  auto UniformsPerVF = Uniforms.find(VF);
1907  return UniformsPerVF->second.count(I);
1908  }
1909 
1910  /// Returns true if \p I is known to be scalar after vectorization.
1911  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1912  if (VF == 1)
1913  return true;
1914  assert(Scalars.count(VF) && "Scalar values are not calculated for VF");
1915  auto ScalarsPerVF = Scalars.find(VF);
1916  return ScalarsPerVF->second.count(I);
1917  }
1918 
1919  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1920  /// for vectorization factor \p VF.
1921  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1922  return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) &&
1923  !isScalarAfterVectorization(I, VF);
1924  }
1925 
1926  /// Decision that was taken during cost calculation for memory instruction.
1927  enum InstWidening {
1928  CM_Unknown,
1929  CM_Widen,
1930  CM_Interleave,
1931  CM_GatherScatter,
1932  CM_Scalarize
1933  };
1934 
1935  /// Save vectorization decision \p W and \p Cost taken by the cost model for
1936  /// instruction \p I and vector width \p VF.
1937  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1938  unsigned Cost) {
1939  assert(VF >= 2 && "Expected VF >=2");
1940  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1941  }
1942 
1943  /// Save vectorization decision \p W and \p Cost taken by the cost model for
1944  /// interleaving group \p Grp and vector width \p VF.
1945  void setWideningDecision(const InterleaveGroup *Grp, unsigned VF,
1946  InstWidening W, unsigned Cost) {
1947  assert(VF >= 2 && "Expected VF >=2");
1948  /// Broadcast this decision to all instructions inside the group.
1949  /// But the cost will be assigned to one instruction only.
1950  for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1951  if (auto *I = Grp->getMember(i)) {
1952  if (Grp->getInsertPos() == I)
1953  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1954  else
1955  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1956  }
1957  }
1958  }
1959 
1960  /// Return the cost model decision for the given instruction \p I and vector
1961  /// width \p VF. Return CM_Unknown if this instruction did not pass
1962  /// through the cost modeling.
1963  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1964  assert(VF >= 2 && "Expected VF >=2");
1965  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1966  auto Itr = WideningDecisions.find(InstOnVF);
1967  if (Itr == WideningDecisions.end())
1968  return CM_Unknown;
1969  return Itr->second.first;
1970  }
1971 
1972  /// Return the vectorization cost for the given instruction \p I and vector
1973  /// width \p VF.
1974  unsigned getWideningCost(Instruction *I, unsigned VF) {
1975  assert(VF >= 2 && "Expected VF >=2");
1976  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1977  assert(WideningDecisions.count(InstOnVF) && "The cost is not calculated");
1978  return WideningDecisions[InstOnVF].second;
1979  }
1980 
1981  /// Return True if instruction \p I is an optimizable truncate whose operand
1982  /// is an induction variable. Such a truncate will be removed by adding a new
1983  /// induction variable with the destination type.
1984  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1985  // If the instruction is not a truncate, return false.
1986  auto *Trunc = dyn_cast<TruncInst>(I);
1987  if (!Trunc)
1988  return false;
1989 
1990  // Get the source and destination types of the truncate.
1991  Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1992  Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1993 
1994  // If the truncate is free for the given types, return false. Replacing a
1995  // free truncate with an induction variable would add an induction variable
1996  // update instruction to each iteration of the loop. We exclude from this
1997  // check the primary induction variable since it will need an update
1998  // instruction regardless.
1999  Value *Op = Trunc->getOperand(0);
2000  if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
2001  return false;
2002 
2003  // If the truncated value is not an induction variable, return false.
2004  return Legal->isInductionVariable(Op);
2005  }
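  // Hypothetical example of the case handled above (not from the original
  // source): for a loop whose primary induction %iv is i64, a non-free use
  // such as
  //
  //   %t = trunc i64 %iv to i32
  //
  // can be removed by introducing a second, i32-typed induction variable with
  // the same start and step, so the truncate never appears in the vector body.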
2006 
2007  /// Collects the instructions to scalarize for each predicated instruction in
2008  /// the loop.
2009  void collectInstsToScalarize(unsigned VF);
2010 
2011  /// Collect Uniform and Scalar values for the given \p VF.
2012  /// The sets depend on CM decisions for Load/Store instructions
2013  /// that may be vectorized by interleaving, as gather/scatter, or scalarized.
2014  void collectUniformsAndScalars(unsigned VF) {
2015  // Do the analysis once.
2016  if (VF == 1 || Uniforms.count(VF))
2017  return;
2018  setCostBasedWideningDecision(VF);
2019  collectLoopUniforms(VF);
2020  collectLoopScalars(VF);
2021  }
2022 
2023 private:
2024  /// \return An upper bound for the vectorization factor, larger than zero.
2025  /// One is returned if vectorization should best be avoided due to cost.
2026  unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
2027 
2028  /// The vectorization cost is a combination of the cost itself and a boolean
2029  /// indicating whether any of the contributing operations will actually
2030  /// operate on vector values after type legalization in the backend. If this
2031  /// latter value is false, then all operations will be scalarized (i.e. no
2032  /// vectorization has actually taken place).
2035  using VectorizationCostTy = std::pair<unsigned, bool>;
2036 
2037  /// Returns the expected execution cost. The unit of the cost does
2038  /// not matter because we use the 'cost' units to compare different
2039  /// vector widths. The cost that is returned is *not* normalized by
2040  /// the factor width.
2041  VectorizationCostTy expectedCost(unsigned VF);
2042 
2043  /// Returns the execution time cost of an instruction for a given vector
2044  /// width. Vector width of one means scalar.
2045  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
2046 
2047  /// The cost-computation logic from getInstructionCost which provides
2048  /// the vector type as an output parameter.
2049  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
2050 
2051  /// Calculate vectorization cost of memory instruction \p I.
2052  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
2053 
2054  /// The cost computation for scalarized memory instruction.
2055  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
2056 
2057  /// The cost computation for interleaving group of memory instructions.
2058  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
2059 
2060  /// The cost computation for Gather/Scatter instruction.
2061  unsigned getGatherScatterCost(Instruction *I, unsigned VF);
2062 
2063  /// The cost computation for widening instruction \p I with consecutive
2064  /// memory access.
2065  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
2066 
2067  /// The cost calculation for Load instruction \p I with uniform pointer -
2068  /// scalar load + broadcast.
2069  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
2070 
2071  /// Returns whether the instruction is a load or store and will be emitted
2072  /// as a vector operation.
2073  bool isConsecutiveLoadOrStore(Instruction *I);
2074 
2075  /// Create an analysis remark that explains why vectorization failed
2076  ///
2077  /// \p RemarkName is the identifier for the remark. \return the remark object
2078  /// that can be streamed to.
2079  OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
2080  return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
2081  RemarkName, TheLoop);
2082  }
2083 
2084  /// Map of scalar integer values to the smallest bitwidth they can be legally
2085  /// represented as. The vector equivalents of these values should be truncated
2086  /// to this type.
2087  MapVector<Instruction *, uint64_t> MinBWs;
2088 
2089  /// A type representing the costs for instructions if they were to be
2090  /// scalarized rather than vectorized. The entries are Instruction-Cost
2091  /// pairs.
2092  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
2093 
2094  /// A set containing all BasicBlocks that are known to be present after
2095  /// vectorization as predicated blocks.
2096  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
2097 
2098  /// A map holding scalar costs for different vectorization factors. The
2099  /// presence of a cost for an instruction in the mapping indicates that the
2100  /// instruction will be scalarized when vectorizing with the associated
2101  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
2102  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
2103 
2104  /// Holds the instructions known to be uniform after vectorization.
2105  /// The data is collected per VF.
2106  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
2107 
2108  /// Holds the instructions known to be scalar after vectorization.
2109  /// The data is collected per VF.
2110  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
2111 
2112  /// Holds the instructions (address computations) that are forced to be
2113  /// scalarized.
2115 
2116  /// Returns the expected difference in cost from scalarizing the expression
2117  /// feeding a predicated instruction \p PredInst. The instructions to
2118  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
2119  /// non-negative return value implies the expression will be scalarized.
2120  /// Currently, only single-use chains are considered for scalarization.
2121  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
2122  unsigned VF);
2123 
2124  /// Collect the instructions that are uniform after vectorization. An
2125  /// instruction is uniform if we represent it with a single scalar value in
2126  /// the vectorized loop corresponding to each vector iteration. Examples of
2127  /// uniform instructions include pointer operands of consecutive or
2128  /// interleaved memory accesses. Note that although uniformity implies an
2129  /// instruction will be scalar, the reverse is not true. In general, a
2130  /// scalarized instruction will be represented by VF scalar values in the
2131  /// vectorized loop, each corresponding to an iteration of the original
2132  /// scalar loop.
2133  void collectLoopUniforms(unsigned VF);
2134 
2135  /// Collect the instructions that are scalar after vectorization. An
2136  /// instruction is scalar if it is known to be uniform or will be scalarized
2137  /// during vectorization. Non-uniform scalarized instructions will be
2138  /// represented by VF values in the vectorized loop, each corresponding to an
2139  /// iteration of the original scalar loop.
2140  void collectLoopScalars(unsigned VF);
2141 
2142  /// Keeps cost model vectorization decision and cost for instructions.
2143  /// Right now it is used for memory instructions only.
2144  using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
2145  std::pair<InstWidening, unsigned>>;
2146 
2147  DecisionList WideningDecisions;
2148 
2149 public:
2150  /// The loop that we evaluate.
2151  Loop *TheLoop;
2152 
2153  /// Predicated scalar evolution analysis.
2154  PredicatedScalarEvolution &PSE;
2155 
2156  /// Loop Info analysis.
2157  LoopInfo *LI;
2158 
2159  /// Vectorization legality.
2160  LoopVectorizationLegality *Legal;
2161 
2162  /// Vector target information.
2163  const TargetTransformInfo &TTI;
2164 
2165  /// Target Library Info.
2166  const TargetLibraryInfo *TLI;
2167 
2168  /// Demanded bits analysis.
2169  DemandedBits *DB;
2170 
2171  /// Assumption cache.
2172  AssumptionCache *AC;
2173 
2174  /// Interface to emit optimization remarks.
2175  OptimizationRemarkEmitter *ORE;
2176 
2177  const Function *TheFunction;
2178 
2179  /// Loop Vectorize Hint.
2180  const LoopVectorizeHints *Hints;
2181 
2182  /// Values to ignore in the cost model.
2183  SmallPtrSet<const Value *, 16> ValuesToIgnore;
2184 
2185  /// Values to ignore in the cost model when VF > 1.
2186  SmallPtrSet<const Value *, 16> VecValuesToIgnore;
2187 };
2188 
2189 } // end anonymous namespace
2190 
2191 namespace llvm {
2192 
2194 /// LoopVectorizationPlanner - drives the vectorization process after having
2195 /// passed Legality checks.
2196 /// The planner builds and optimizes the Vectorization Plans which record the
2197 /// decisions on how to vectorize the given loop. In particular, they represent
2198 /// the control-flow of the vectorized version, the replication of instructions
2199 /// that are to be scalarized, and the interleaved access groups.
2200 class LoopVectorizationPlanner {
2201  /// The loop that we evaluate.
2202  Loop *OrigLoop;
2203 
2204  /// Loop Info analysis.
2205  LoopInfo *LI;
2206 
2207  /// Target Library Info.
2208  const TargetLibraryInfo *TLI;
2209 
2210  /// Target Transform Info.
2211  const TargetTransformInfo *TTI;
2212 
2213  /// The legality analysis.
2214  LoopVectorizationLegality *Legal;
2215 
2216  /// The profitablity analysis.
2217  LoopVectorizationCostModel &CM;
2218 
2219  using VPlanPtr = std::unique_ptr<VPlan>;
2220 
2221  SmallVector<VPlanPtr, 4> VPlans;
2222 
2223  /// This class is used to enable the VPlan to invoke a method of ILV. This is
2224  /// needed until the method is refactored out of ILV and becomes reusable.
2225  struct VPCallbackILV : public VPCallback {
2226  InnerLoopVectorizer &ILV;
2227 
2228  VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
2229 
2230  Value *getOrCreateVectorValues(Value *V, unsigned Part) override {
2231  return ILV.getOrCreateVectorValue(V, Part);
2232  }
2233  };
2234 
2235  /// A builder used to construct the current plan.
2236  VPBuilder Builder;
2237 
2238  /// When we if-convert we need to create edge masks. We have to cache values
2239  /// so that we don't end up with exponential recursion/IR. Note that
2240  /// if-conversion currently takes place during VPlan-construction, so these
2241  /// caches are only used at that stage.
2242  using EdgeMaskCacheTy =
2243  DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>;
2244  using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>;
2245  EdgeMaskCacheTy EdgeMaskCache;
2246  BlockMaskCacheTy BlockMaskCache;
2247 
2248  unsigned BestVF = 0;
2249  unsigned BestUF = 0;
2250 
2251 public:
2252  LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
2253  const TargetTransformInfo *TTI,
2254  LoopVectorizationLegality *Legal,
2255  LoopVectorizationCostModel &CM)
2256  : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {}
2257 
2258  /// Plan how to best vectorize, return the best VF and its cost.
2259  LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
2260  unsigned UserVF);
2261 
2262  /// Finalize the best decision and dispose of all other VPlans.
2263  void setBestPlan(unsigned VF, unsigned UF);
2264 
2265  /// Generate the IR code for the body of the vectorized loop according to the
2266  /// best selected VPlan.
2267  void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
2268 
2269  void printPlans(raw_ostream &O) {
2270  for (const auto &Plan : VPlans)
2271  O << *Plan;
2272  }
2273 
2274 protected:
2275  /// Collect the instructions from the original loop that would be trivially
2276  /// dead in the vectorized loop if generated.
2277  void collectTriviallyDeadInstructions(
2278  SmallPtrSetImpl<Instruction *> &DeadInstructions);
2279 
2280  /// A range of powers-of-2 vectorization factors with fixed start and
2281  /// adjustable end. The range includes start and excludes end, e.g.,:
2282  /// [1, 9) = {1, 2, 4, 8}
2283  struct VFRange {
2284  // A power of 2.
2285  const unsigned Start;
2286 
2287  // Need not be a power of 2. If End <= Start range is empty.
2288  unsigned End;
2289  };
2290 
2291  /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
2292  /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
2293  /// returned value holds for the entire \p Range.
2294  bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
2295  VFRange &Range);
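  // Sketch of the clamping behaviour described above (illustrative only, not
  // the actual implementation): the decision taken for Range.Start is applied
  // to the whole range, and End is lowered to the first VF whose decision
  // differs.
  //
  //   bool DecisionAtStart = Predicate(Range.Start);
  //   for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
  //     if (Predicate(VF) != DecisionAtStart) {
  //       Range.End = VF; // [Start, VF) now shares a single decision.
  //       break;
  //     }
  //   return DecisionAtStart;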
2296 
2297  /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
2298  /// according to the information gathered by Legal when it checked if it is
2299  /// legal to vectorize the loop.
2300  void buildVPlans(unsigned MinVF, unsigned MaxVF);
2301 
2302 private:
2303  /// A helper function that computes the predicate of the block BB, assuming
2304  /// that the header block of the loop is set to True. It returns the *entry*
2305  /// mask for the block BB.
2306  VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan);
2307 
2308  /// A helper function that computes the predicate of the edge between SRC
2309  /// and DST.
2310  VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan);
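  // Illustrative example (not from the original source): for a conditional
  // branch `br i1 %c, label %Dst, label %Other` terminating Src, the edge
  // masks are conceptually
  //
  //   EdgeMask(Src -> Dst)   = BlockMask(Src) & %c     ; true successor
  //   EdgeMask(Src -> Other) = BlockMask(Src) & !%c    ; false successor
  //
  // and the entry mask of a block is the OR of the masks of its incoming
  // edges, with the loop header treated as all-true.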
2311 
2312  /// Check if \p I belongs to an Interleave Group within the given VF \p Range,
2313  /// \return true in the first returned value if so and false otherwise.
2314  /// Build a new VPInterleaveGroup Recipe if \p I is the primary member of an IG
2315  /// for \p Range.Start, and provide it as the second returned value.
2316  /// Note that if \p I is an adjunct member of an IG for \p Range.Start, the
2317  /// \return value is <true, nullptr>, as it is handled by another recipe.
2318  /// \p Range.End may be decreased to ensure same decision from \p Range.Start
2319  /// to \p Range.End.
2320  VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
2321 
2322  // Check if \p I is a memory instruction to be widened for \p Range.Start and
2323  // potentially masked. Such instructions are handled by a recipe that takes an
2324  // additional VPInstruction for the mask.
2325  VPWidenMemoryInstructionRecipe *tryToWidenMemory(Instruction *I,
2326  VFRange &Range,
2327  VPlanPtr &Plan);
2328 
2329  /// Check if an induction recipe should be constructed for \p I within the given
2330  /// VF \p Range. If so build and return it. If not, return null. \p Range.End
2331  /// may be decreased to ensure same decision from \p Range.Start to
2332  /// \p Range.End.
2333  VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I,
2334  VFRange &Range);
2335 
2336  /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
2337  /// a sequence of select instructions as the vectorizer currently performs
2338  /// full if-conversion.
2339  VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan);
2340 
2341  /// Check if \p I can be widened within the given VF \p Range. If \p I can be
2342  /// widened for \p Range.Start, check if the last recipe of \p VPBB can be
2343  /// extended to include \p I or else build a new VPWidenRecipe for it and
2344  /// append it to \p VPBB. Return true if \p I can be widened for Range.Start,
2345  /// false otherwise. Range.End may be decreased to ensure same decision from
2346  /// \p Range.Start to \p Range.End.
2347  bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range);
2348 
2349  /// Build a VPReplicationRecipe for \p I and enclose it within a Region if it
2350  /// is predicated. \return \p VPBB augmented with this new recipe if \p I is
2351  /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
2352  /// Region. Update the packing decision of predicated instructions if they
2353  /// feed \p I. Range.End may be decreased to ensure same recipe behavior from
2354  /// \p Range.Start to \p Range.End.
2355  VPBasicBlock *handleReplication(
2356  Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
2358  VPlanPtr &Plan);
2359 
2360  /// Create a replicating region for instruction \p I that requires
2361  /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
2362  VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe,
2363  VPlanPtr &Plan);
2364 
2365  /// Build a VPlan according to the information gathered by Legal. \return a
2366  /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
2367  /// exclusive, possibly decreasing \p Range.End.
2368  VPlanPtr buildVPlan(VFRange &Range,
2369  const SmallPtrSetImpl<Value *> &NeedDef);
2370 };
2371 
2372 } // end namespace llvm
2373 
2374 namespace {
2375 
2376 /// \brief This holds vectorization requirements that must be verified late in
2377 /// the process. The requirements are set by the legality and cost model. Once
2378 /// vectorization has been determined to be possible and profitable the
2379 /// requirements can be verified by looking for metadata or compiler options.
2380 /// For example, some loops require FP commutativity which is only allowed if
2381 /// vectorization is explicitly specified or if the fast-math compiler option
2382 /// has been provided.
2383 /// Late evaluation of these requirements allows helpful diagnostics to be
2384 /// composed that tell the user what needs to be done to vectorize the loop. For
2385 /// example, by specifying #pragma clang loop vectorize or -ffast-math. Late
2386 /// evaluation should be used only when diagnostics can be generated that can be
2387 /// followed by a non-expert user.
2388 class LoopVectorizationRequirements {
2389 public:
2390  LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) : ORE(ORE) {}
2391 
2392  void addUnsafeAlgebraInst(Instruction *I) {
2393  // First unsafe algebra instruction.
2394  if (!UnsafeAlgebraInst)
2395  UnsafeAlgebraInst = I;
2396  }
2397 
2398  void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
2399 
2400  bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
2401  const char *PassName = Hints.vectorizeAnalysisPassName();
2402  bool Failed = false;
2403  if (UnsafeAlgebraInst && !Hints.allowReordering()) {
2404  ORE.emit([&]() {
2405  return OptimizationRemarkAnalysisFPCommute(
2406  PassName, "CantReorderFPOps",
2407  UnsafeAlgebraInst->getDebugLoc(),
2408  UnsafeAlgebraInst->getParent())
2409  << "loop not vectorized: cannot prove it is safe to reorder "
2410  "floating-point operations";
2411  });
2412  Failed = true;
2413  }
2414 
2415  // Test if runtime memcheck thresholds are exceeded.
2416  bool PragmaThresholdReached =
2417  NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
2418  bool ThresholdReached =
2419  NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
2420  if ((ThresholdReached && !Hints.allowReordering()) ||
2421  PragmaThresholdReached) {
2422  ORE.emit([&]() {
2423  return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
2424  L->getStartLoc(),
2425  L->getHeader())
2426  << "loop not vectorized: cannot prove it is safe to reorder "
2427  "memory operations";
2428  });
2429  DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
2430  Failed = true;
2431  }
2432 
2433  return Failed;
2434  }
2435 
2436 private:
2437  unsigned NumRuntimePointerChecks = 0;
2438  Instruction *UnsafeAlgebraInst = nullptr;
2439 
2440  /// Interface to emit optimization remarks.
2441  OptimizationRemarkEmitter &ORE;
2442 };
2443 
2444 } // end anonymous namespace
2445 
2446 static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
2447  if (L.empty()) {
2448  if (!hasCyclesInLoopBody(L))
2449  V.push_back(&L);
2450  return;
2451  }
2452  for (Loop *InnerL : L)
2453  addAcyclicInnerLoop(*InnerL, V);
2454 }
2455 
2456 namespace {
2457 
2458 /// The LoopVectorize Pass.
2459 struct LoopVectorize : public FunctionPass {
2460  /// Pass identification, replacement for typeid
2461  static char ID;
2462 
2463  LoopVectorizePass Impl;
2464 
2465  explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
2466  : FunctionPass(ID) {
2467  Impl.DisableUnrolling = NoUnrolling;
2468  Impl.AlwaysVectorize = AlwaysVectorize;
2469  initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2470  }
2471 
2472  bool runOnFunction(Function &F) override {
2473  if (skipFunction(F))
2474  return false;
2475 
2476  auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2477  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2478  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2479  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2480  auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2481  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2482  auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
2483  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2484  auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2485  auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2486  auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2487  auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2488 
2489  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2490  [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2491 
2492  return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2493  GetLAA, *ORE);
2494  }
2495 
2496  void getAnalysisUsage(AnalysisUsage &AU) const override {
2511  }
2512 };
2513 
2514 } // end anonymous namespace
2515 
2516 //===----------------------------------------------------------------------===//
2517 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2518 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2519 //===----------------------------------------------------------------------===//
2520 
2521 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2522  // We need to place the broadcast of invariant variables outside the loop.
2523  Instruction *Instr = dyn_cast<Instruction>(V);
2524  bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
2525  bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
2526 
2527  // Place the code for broadcasting invariant variables in the new preheader.
2528  IRBuilder<>::InsertPointGuard Guard(Builder);
2529  if (Invariant)
2530  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2531 
2532  // Broadcast the scalar into all locations in the vector.
2533  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2534 
2535  return Shuf;
2536 }
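// For reference, a minimal sketch of what the splat above expands to in IR for
// VF = 4 and a scalar i32 %x (illustrative only; exact value names may differ):
//
//   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> undef, <4 x i32> zeroinitializer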
2537 
2538 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2539  const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
2540  Value *Start = II.getStartValue();
2541 
2542  // Construct the initial value of the vector IV in the vector loop preheader
2543  auto CurrIP = Builder.saveIP();
2544  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2545  if (isa<TruncInst>(EntryVal)) {
2546  assert(Start->getType()->isIntegerTy() &&
2547  "Truncation requires an integer type");
2548  auto *TruncType = cast<IntegerType>(EntryVal->getType());
2549  Step = Builder.CreateTrunc(Step, TruncType);
2550  Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2551  }
2552  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2553  Value *SteppedStart =
2554  getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2555 
2556  // We create vector phi nodes for both integer and floating-point induction
2557  // variables. Here, we determine the kind of arithmetic we will perform.
2558  Instruction::BinaryOps AddOp;
2559  Instruction::BinaryOps MulOp;
2560  if (Step->getType()->isIntegerTy()) {
2561  AddOp = Instruction::Add;
2562  MulOp = Instruction::Mul;
2563  } else {
2564  AddOp = II.getInductionOpcode();
2565  MulOp = Instruction::FMul;
2566  }
2567 
2568  // Multiply the vectorization factor by the step using integer or
2569  // floating-point arithmetic as appropriate.
2570  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
2571  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
2572 
2573  // Create a vector splat to use in the induction update.
2574  //
2575  // FIXME: If the step is non-constant, we create the vector splat with
2576  // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2577  // handle a constant vector splat.
2578  Value *SplatVF = isa<Constant>(Mul)
2579  ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2580  : Builder.CreateVectorSplat(VF, Mul);
2581  Builder.restoreIP(CurrIP);
2582 
2583  // We may need to add the step a number of times, depending on the unroll
2584  // factor. The last of those goes into the PHI.
2585  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2586  &*LoopVectorBody->getFirstInsertionPt());
2587  Instruction *LastInduction = VecInd;
2588  for (unsigned Part = 0; Part < UF; ++Part) {
2589  VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
2590  if (isa<TruncInst>(EntryVal))
2591  addMetadata(LastInduction, EntryVal);
2592  LastInduction = cast<Instruction>(addFastMathFlag(
2593  Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
2594  }
2595 
2596  // Move the last step to the end of the latch block. This ensures consistent
2597  // placement of all induction updates.
2598  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2599  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2600  auto *ICmp = cast<Instruction>(Br->getCondition());
2601  LastInduction->moveBefore(ICmp);
2602  LastInduction->setName("vec.ind.next");
2603 
2604  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2605  VecInd->addIncoming(LastInduction, LoopVectorLatch);
2606 }
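// Worked example (illustrative, not from the original source): for an integer
// IV starting at 0 with step 1, VF = 4 and UF = 2, the code above produces
// roughly
//
//   vec.ind      = phi <4 x i32> [ <0, 1, 2, 3>, %vector.ph ],
//                                [ %vec.ind.next, %vector.latch ]
//   step.add     = vec.ind + <4, 4, 4, 4>    ; values for unroll part 1
//   vec.ind.next = step.add + <4, 4, 4, 4>   ; feeds the phi of the next iteration
//
// so part 0 uses <0,1,2,3>, part 1 uses <4,5,6,7>, and the next vector
// iteration starts at <8,9,10,11>.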
2607 
2608 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2609  return Cost->isScalarAfterVectorization(I, VF) ||
2610  Cost->isProfitableToScalarize(I, VF);
2611 }
2612 
2613 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2614  if (shouldScalarizeInstruction(IV))
2615  return true;
2616  auto isScalarInst = [&](User *U) -> bool {
2617  auto *I = cast<Instruction>(U);
2618  return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2619  };
2620  return llvm::any_of(IV->users(), isScalarInst);
2621 }
2622 
2623 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
2624  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2625  "Primary induction variable must have an integer type");
2626 
2627  auto II = Legal->getInductionVars()->find(IV);
2628  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
2629 
2630  auto ID = II->second;
2631  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2632 
2633  // The scalar value to broadcast. This will be derived from the canonical
2634  // induction variable.
2635  Value *ScalarIV = nullptr;
2636 
2637  // The value from the original loop to which we are mapping the new induction
2638  // variable.
2639  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2640 
2641  // True if we have vectorized the induction variable.
2642  auto VectorizedIV = false;
2643 
2644  // Determine if we want a scalar version of the induction variable. This is
2645  // true if the induction variable itself is not widened, or if it has at
2646  // least one user in the loop that is not widened.
2647  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
2648 
2649  // Generate code for the induction step. Note that induction steps are
2650  // required to be loop-invariant
2651  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
2652  "Induction step should be loop invariant");
2653  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2654  Value *Step = nullptr;
2655  if (PSE.getSE()->isSCEVable(IV->getType())) {
2656  SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2657  Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
2658  &*Builder.GetInsertPoint());
2659  } else {
2660  Step = cast<SCEVUnknown>(ID.getStep())->getValue();
2661  }
2662 
2663  // Try to create a new independent vector induction variable. If we can't
2664  // create the phi node, we will splat the scalar induction variable in each
2665  // loop iteration.
2666  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
2667  createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2668  VectorizedIV = true;
2669  }
2670 
2671  // If we haven't yet vectorized the induction variable, or if we will create
2672  // a scalar one, we need to define the scalar induction variable and step
2673  // values. If we were given a truncation type, truncate the canonical
2674  // induction variable and step. Otherwise, derive these values from the
2675  // induction descriptor.
2676  if (!VectorizedIV || NeedsScalarIV) {
2677  ScalarIV = Induction;
2678  if (IV != OldInduction) {
2679  ScalarIV = IV->getType()->isIntegerTy()
2680  ? Induction
2681  : Builder.CreateCast(Instruction::SIToFP, Induction,
2682  IV->getType());
2683  ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
2684  ScalarIV->setName("offset.idx");
2685  }
2686  if (Trunc) {
2687  auto *TruncType = cast<IntegerType>(Trunc->getType());
2688  assert(Step->getType()->isIntegerTy() &&
2689  "Truncation requires an integer step");
2690  ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2691  Step = Builder.CreateTrunc(Step, TruncType);
2692  }
2693  }
2694 
2695  // If we haven't yet vectorized the induction variable, splat the scalar
2696  // induction variable, and build the necessary step vectors.
2697  if (!VectorizedIV) {
2698  Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2699  for (unsigned Part = 0; Part < UF; ++Part) {
2700  Value *EntryPart =
2701  getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
2702  VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
2703  if (Trunc)
2704  addMetadata(EntryPart, Trunc);
2705  }
2706  }
2707 
2708  // If an induction variable is only used for counting loop iterations or
2709  // calculating addresses, it doesn't need to be widened. Create scalar steps
2710  // that can be used by instructions we will later scalarize. Note that the
2711  // addition of the scalar steps will not increase the number of instructions
2712  // in the loop in the common case prior to InstCombine. We will be trading
2713  // one vector extract for each scalar step.
2714  if (NeedsScalarIV)
2715  buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2716 }
2717 
2718 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2719  Instruction::BinaryOps BinOp) {
2720  // Create and check the types.
2721  assert(Val->getType()->isVectorTy() && "Must be a vector");
2722  int VLen = Val->getType()->getVectorNumElements();
2723 
2724  Type *STy = Val->getType()->getScalarType();
2725  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2726  "Induction Step must be an integer or FP");
2727  assert(Step->getType() == STy && "Step has wrong type");
2728 
2729  SmallVector<Constant *, 8> Indices;
2730 
2731  if (STy->isIntegerTy()) {
2732  // Create a vector of consecutive numbers from zero to VF.
2733  for (int i = 0; i < VLen; ++i)
2734  Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2735 
2736  // Add the consecutive indices to the vector value.
2737  Constant *Cv = ConstantVector::get(Indices);
2738  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2739  Step = Builder.CreateVectorSplat(VLen, Step);
2740  assert(Step->getType() == Val->getType() && "Invalid step vec");
2741  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2742  // which can be found from the original scalar operations.
2743  Step = Builder.CreateMul(Cv, Step);
2744  return Builder.CreateAdd(Val, Step, "induction");
2745  }
2746 
2747  // Floating point induction.
2748  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2749  "Binary Opcode should be specified for FP induction");
2750  // Create a vector of consecutive numbers from zero to VF.
2751  for (int i = 0; i < VLen; ++i)
2752  Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2753 
2754  // Add the consecutive indices to the vector value.
2755  Constant *Cv = ConstantVector::get(Indices);
2756 
2757  Step = Builder.CreateVectorSplat(VLen, Step);
2758 
2759  // Floating point operations had to be 'fast' to enable the induction.
2760  FastMathFlags Flags;
2761  Flags.setFast();
2762 
2763  Value *MulOp = Builder.CreateFMul(Cv, Step);
2764  if (isa<Instruction>(MulOp))
2765  // Have to check, MulOp may be a constant
2766  cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2767 
2768  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2769  if (isa<Instruction>(BOp))
2770  cast<Instruction>(BOp)->setFastMathFlags(Flags);
2771  return BOp;
2772 }
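// Illustrative-only scalar model of the integer path above (not part of the
// original file): lane I of getStepVector(Val, StartIdx, Step) holds
// Val[I] + (StartIdx + I) * Step.
static inline long long stepVectorLaneModel(long long ValLane, long long StartIdx,
                                            long long Lane, long long Step) {
  // Mirrors Cv * Step + Val from the code above, one lane at a time.
  return ValLane + (StartIdx + Lane) * Step;
}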
2773 
2774 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2775  Value *EntryVal,
2776  const InductionDescriptor &ID) {
2777  // We shouldn't have to build scalar steps if we aren't vectorizing.
2778  assert(VF > 1 && "VF should be greater than one");
2779 
2780  // Get the value type and ensure it and the step have the same integer type.
2781  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2782  assert(ScalarIVTy == Step->getType() &&
2783  "Val and Step should have the same type");
2784 
2785  // We build scalar steps for both integer and floating-point induction
2786  // variables. Here, we determine the kind of arithmetic we will perform.
2787  Instruction::BinaryOps AddOp;
2788  Instruction::BinaryOps MulOp;
2789  if (ScalarIVTy->isIntegerTy()) {
2790  AddOp = Instruction::Add;
2791  MulOp = Instruction::Mul;
2792  } else {
2793  AddOp = ID.getInductionOpcode();
2794  MulOp = Instruction::FMul;
2795  }
2796 
2797  // Determine the number of scalars we need to generate for each unroll
2798  // iteration. If EntryVal is uniform, we only need to generate the first
2799  // lane. Otherwise, we generate all VF values.
2800  unsigned Lanes =
2801  Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
2802  : VF;
2803  // Compute the scalar steps and save the results in VectorLoopValueMap.
2804  for (unsigned Part = 0; Part < UF; ++Part) {
2805  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2806  auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
2807  auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2808  auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2809  VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2810  }
2811  }
2812 }
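// Worked example (illustrative): with VF = 4, UF = 2, an integer ScalarIV and
// Step, the loop above records, for each unroll part and lane,
//
//   value(Part, Lane) = ScalarIV + (VF * Part + Lane) * Step
//
// i.e. lanes 0..3 of part 0 get ScalarIV + {0,1,2,3} * Step and lanes 0..3 of
// part 1 get ScalarIV + {4,5,6,7} * Step (only lane 0 is emitted when the
// value is uniform after vectorization).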
2813 
2814 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
2815  const ValueToValueMap &Strides = getSymbolicStrides() ? *getSymbolicStrides() :
2816  ValueToValueMap();
2817 
2818  int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false);
2819  if (Stride == 1 || Stride == -1)
2820  return Stride;
2821  return 0;
2822 }
2823 
2824 bool LoopVectorizationLegality::isUniform(Value *V) {
2825  return LAI->isUniform(V);
2826 }
2827 
2828 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2829  assert(V != Induction && "The new induction variable should not be used.");
2830  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2831  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2832 
2833  // If we have a stride that is replaced by one, do it here.
2834  if (Legal->hasStride(V))
2835  V = ConstantInt::get(V->getType(), 1);
2836 
2837  // If we have a vector mapped to this value, return it.
2838  if (VectorLoopValueMap.hasVectorValue(V, Part))
2839  return VectorLoopValueMap.getVectorValue(V, Part);
2840 
2841  // If the value has not been vectorized, check if it has been scalarized
2842  // instead. If it has been scalarized, and we actually need the value in
2843  // vector form, we will construct the vector values on demand.
2844  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2845  Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2846 
2847  // If we've scalarized a value, that value should be an instruction.
2848  auto *I = cast<Instruction>(V);
2849 
2850  // If we aren't vectorizing, we can just copy the scalar map values over to
2851  // the vector map.
2852  if (VF == 1) {
2853  VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2854  return ScalarValue;
2855  }
2856 
2857  // Get the last scalar instruction we generated for V and Part. If the value
2858  // is known to be uniform after vectorization, this corresponds to lane zero
2859  // of the Part unroll iteration. Otherwise, the last instruction is the one
2860  // we created for the last vector lane of the Part unroll iteration.
2861  unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2862  auto *LastInst = cast<Instruction>(
2863  VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2864 
2865  // Set the insert point after the last scalarized instruction. This ensures
2866  // the insertelement sequence will directly follow the scalar definitions.
2867  auto OldIP = Builder.saveIP();
2868  auto NewIP = std::next(BasicBlock::iterator(LastInst));
2869  Builder.SetInsertPoint(&*NewIP);
2870 
2871  // However, if we are vectorizing, we need to construct the vector values.
2872  // If the value is known to be uniform after vectorization, we can just
2873  // broadcast the scalar value corresponding to lane zero for each unroll
2874  // iteration. Otherwise, we construct the vector values using insertelement
2875  // instructions. Since the resulting vectors are stored in
2876  // VectorLoopValueMap, we will only generate the insertelements once.
2877  Value *VectorValue = nullptr;
2878  if (Cost->isUniformAfterVectorization(I, VF)) {
2879  VectorValue = getBroadcastInstrs(ScalarValue);
2880  VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2881  } else {
2882  // Initialize packing with insertelements to start from undef.
2883  Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2884  VectorLoopValueMap.setVectorValue(V, Part, Undef);
2885  for (unsigned Lane = 0; Lane < VF; ++Lane)
2886  packScalarIntoVectorValue(V, {Part, Lane});
2887  VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2888  }
2889  Builder.restoreIP(OldIP);
2890  return VectorValue;
2891  }
2892 
2893  // If this scalar is unknown, assume that it is a constant or that it is
2894  // loop invariant. Broadcast V and save the value for future uses.
2895  Value *B = getBroadcastInstrs(V);
2896  VectorLoopValueMap.setVectorValue(V, Part, B);
2897  return B;
2898 }
2899 
2900 Value *
2901 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2902  const VPIteration &Instance) {
2903  // If the value is not an instruction contained in the loop, it should
2904  // already be scalar.
2905  if (OrigLoop->isLoopInvariant(V))
2906  return V;
2907 
2908  assert(Instance.Lane > 0
2909  ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2910  : true && "Uniform values only have lane zero");
2911 
2912  // If the value from the original loop has not been vectorized, it is
2913  // represented by UF x VF scalar values in the new loop. Return the requested
2914  // scalar value.
2915  if (VectorLoopValueMap.hasScalarValue(V, Instance))
2916  return VectorLoopValueMap.getScalarValue(V, Instance);
2917 
2918  // If the value has not been scalarized, get its entry in VectorLoopValueMap
2919  // for the given unroll part. If this entry is not a vector type (i.e., the
2920  // vectorization factor is one), there is no need to generate an
2921  // extractelement instruction.
2922  auto *U = getOrCreateVectorValue(V, Instance.Part);
2923  if (!U->getType()->isVectorTy()) {
2924  assert(VF == 1 && "Value not scalarized has non-vector type");
2925  return U;
2926  }
2927 
2928  // Otherwise, the value from the original loop has been vectorized and is
2929  // represented by UF vector values. Extract and return the requested scalar
2930  // value from the appropriate vector lane.
2931  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2932 }
2933 
2934 void InnerLoopVectorizer::packScalarIntoVectorValue(
2935  Value *V, const VPIteration &Instance) {
2936  assert(V != Induction && "The new induction variable should not be used.");
2937  assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2938  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2939 
2940  Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2941  Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2942  VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2943  Builder.getInt32(Instance.Lane));
2944  VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2945 }
2946 
2947 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2948  assert(Vec->getType()->isVectorTy() && "Invalid type");
2949  SmallVector<Constant *, 8> ShuffleMask;
2950  for (unsigned i = 0; i < VF; ++i)
2951  ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2952 
2953  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2954  ConstantVector::get(ShuffleMask),
2955  "reverse");
2956 }
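// Illustration (not part of the original source): with an assumed VF of 4 the
// mask built above is <3, 2, 1, 0>, so an input <a, b, c, d> is shuffled to
// <d, c, b, a>.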
2957 
2958 // Try to vectorize the interleave group that \p Instr belongs to.
2959 //
2960 // E.g. Translate following interleaved load group (factor = 3):
2961 // for (i = 0; i < N; i+=3) {
2962 // R = Pic[i]; // Member of index 0
2963 // G = Pic[i+1]; // Member of index 1
2964 // B = Pic[i+2]; // Member of index 2
2965 // ... // do something to R, G, B
2966 // }
2967 // To:
2968 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2969 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
2970 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
2971 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
2972 //
2973 // Or translate following interleaved store group (factor = 3):
2974 // for (i = 0; i < N; i+=3) {
2975 // ... do something to R, G, B
2976 // Pic[i] = R; // Member of index 0
2977 // Pic[i+1] = G; // Member of index 1
2978 // Pic[i+2] = B; // Member of index 2
2979 // }
2980 // To:
2981 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2982 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2983 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2984 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2985 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2986 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
2987  const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
2988  assert(Group && "Fail to get an interleaved access group.");
2989 
2990  // Skip if current instruction is not the insert position.
2991  if (Instr != Group->getInsertPos())
2992  return;
2993 
2994  const DataLayout &DL = Instr->getModule()->getDataLayout();
2995  Value *Ptr = getPointerOperand(Instr);
2996 
2997  // Prepare for the vector type of the interleaved load/store.
2998  Type *ScalarTy = getMemInstValueType(Instr);
2999  unsigned InterleaveFactor = Group->getFactor();
3000  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
3001  Type *PtrTy = VecTy->getPointerTo(getMemInstAddressSpace(Instr));
3002 
3003  // Prepare for the new pointers.
3005  SmallVector<Value *, 2> NewPtrs;
3006  unsigned Index = Group->getIndex(Instr);
3007 
3008  // If the group is reverse, adjust the index to refer to the last vector lane
3009  // instead of the first. We adjust the index from the first vector lane,
3010  // rather than directly getting the pointer for lane VF - 1, because the
3011  // pointer operand of the interleaved access is supposed to be uniform. For
3012  // uniform instructions, we're only required to generate a value for the
3013  // first vector lane in each unroll iteration.
3014  if (Group->isReverse())
3015  Index += (VF - 1) * Group->getFactor();
3016 
3017  for (unsigned Part = 0; Part < UF; Part++) {
3018  Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
3019 
3020  // Note that the current instruction could be at any index in the group, so we
3021  // need to adjust the address to the member of index 0.
3022  //
3023  // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
3024  // b = A[i]; // Member of index 0
3025  // The current pointer points to A[i+1]; adjust it to A[i].
3026  //
3027  // E.g. A[i+1] = a; // Member of index 1
3028  // A[i] = b; // Member of index 0
3029  // A[i+2] = c; // Member of index 2 (Current instruction)
3030  // The current pointer points to A[i+2]; adjust it to A[i].
3031  NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
3032 
3033  // Cast to the vector pointer type.
3034  NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
3035  }
3036 
3037  setDebugLocFromInst(Builder, Instr);
3038  Value *UndefVec = UndefValue::get(VecTy);
3039 
3040  // Vectorize the interleaved load group.
3041  if (isa<LoadInst>(Instr)) {
3042  // For each unroll part, create a wide load for the group.
3043  SmallVector<Value *, 2> NewLoads;
3044  for (unsigned Part = 0; Part < UF; Part++) {
3045  auto *NewLoad = Builder.CreateAlignedLoad(
3046  NewPtrs[Part], Group->getAlignment(), "wide.vec");
3047  addMetadata(NewLoad, Instr);
3048  NewLoads.push_back(NewLoad);
3049  }
3050 
3051  // For each member in the group, shuffle out the appropriate data from the
3052  // wide loads.
3053  for (unsigned I = 0; I < InterleaveFactor; ++I) {
3054  Instruction *Member = Group->getMember(I);
3055 
3056  // Skip the gaps in the group.
3057  if (!Member)
3058  continue;
3059 
3060  Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
3061  for (unsigned Part = 0; Part < UF; Part++) {
3062  Value *StridedVec = Builder.CreateShuffleVector(
3063  NewLoads[Part], UndefVec, StrideMask, "strided.vec");
3064 
3065  // If this member has a different type, cast the result to that type.
3066  if (Member->getType() != ScalarTy) {
3067  VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
3068  StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
3069  }
3070 
3071  if (Group->isReverse())
3072  StridedVec = reverseVector(StridedVec);
3073 
3074  VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
3075  }
3076  }
3077  return;
3078  }
3079 
3080  // The sub vector type for current instruction.
3081  VectorType *SubVT = VectorType::get(ScalarTy, VF);
3082 
3083  // Vectorize the interleaved store group.
3084  for (unsigned Part = 0; Part < UF; Part++) {
3085  // Collect the stored vector from each member.
3086  SmallVector<Value *, 4> StoredVecs;
3087  for (unsigned i = 0; i < InterleaveFactor; i++) {
3088  // An interleaved store group doesn't allow gaps, so each index has a member.
3089  Instruction *Member = Group->getMember(i);
3090  assert(Member && "Fail to get a member from an interleaved store group");
3091 
3092  Value *StoredVec = getOrCreateVectorValue(
3093  cast<StoreInst>(Member)->getValueOperand(), Part);
3094  if (Group->isReverse())
3095  StoredVec = reverseVector(StoredVec);
3096 
3097  // If this member has a different type, cast it to the unified type.
3098 
3099  if (StoredVec->getType() != SubVT)
3100  StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
3101 
3102  StoredVecs.push_back(StoredVec);
3103  }
3104 
3105  // Concatenate all vectors into a wide vector.
3106  Value *WideVec = concatenateVectors(Builder, StoredVecs);
3107 
3108  // Interleave the elements in the wide vector.
3109  Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
3110  Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
3111  "interleaved.vec");
3112 
3113  Instruction *NewStoreInstr =
3114  Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
3115  addMetadata(NewStoreInstr, Instr);
3116  }
3117 }
3118 
3119 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
3120  VectorParts *BlockInMask) {
3121  // Attempt to issue a wide load.
3122  LoadInst *LI = dyn_cast<LoadInst>(Instr);
3123  StoreInst *SI = dyn_cast<StoreInst>(Instr);
3124 
3125  assert((LI || SI) && "Invalid Load/Store instruction");
3126 
3127  LoopVectorizationCostModel::InstWidening Decision =
3128  Cost->getWideningDecision(Instr, VF);
3129  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
3130  "CM decision should be taken at this point");
3131  if (Decision == LoopVectorizationCostModel::CM_Interleave)
3132  return vectorizeInterleaveGroup(Instr);
3133 
3134  Type *ScalarDataTy = getMemInstValueType(Instr);
3135  Type *DataTy = VectorType::get(ScalarDataTy, VF);
3136  Value *Ptr = getPointerOperand(Instr);
3137  unsigned Alignment = getMemInstAlignment(Instr);
3138  // An alignment of 0 means target abi alignment. We need to use the scalar's
3139  // target abi alignment in such a case.
3140  const DataLayout &DL = Instr->getModule()->getDataLayout();
3141  if (!Alignment)
3142  Alignment = DL.getABITypeAlignment(ScalarDataTy);
3143  unsigned AddressSpace = getMemInstAddressSpace(Instr);
3144 
3145  // Determine if the pointer operand of the access is either consecutive or
3146  // reverse consecutive.
3147  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
3148  bool Reverse = ConsecutiveStride < 0;
3149  bool CreateGatherScatter =
3150  (Decision == LoopVectorizationCostModel::CM_GatherScatter);
3151 
3152  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
3153  // gather/scatter. Otherwise Decision should have been to Scalarize.
3154  assert((ConsecutiveStride || CreateGatherScatter) &&
3155  "The instruction should be scalarized");
3156 
3157  // Handle consecutive loads/stores.
3158  if (ConsecutiveStride)
3159  Ptr = getOrCreateScalarValue(Ptr, {0, 0});
3160 
3161  VectorParts Mask;
3162  bool isMaskRequired = BlockInMask;
3163  if (isMaskRequired)
3164  Mask = *BlockInMask;
3165 
3166  // Handle Stores:
3167  if (SI) {
3168  assert(!Legal->isUniform(SI->getPointerOperand()) &&
3169  "We do not allow storing to uniform addresses");
3171 
3172  for (unsigned Part = 0; Part < UF; ++Part) {
3173  Instruction *NewSI = nullptr;
3174  Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
3175  if (CreateGatherScatter) {
3176  Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
3177  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
3178  NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
3179  MaskPart);
3180  } else {
3181  // Calculate the pointer for the specific unroll-part.
3182  Value *PartPtr =
3183  Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
3184 
3185  if (Reverse) {
3186  // If we store to reverse consecutive memory locations, then we need
3187  // to reverse the order of elements in the stored value.
3188  StoredVal = reverseVector(StoredVal);
3189  // We don't want to update the value in the map as it might be used in
3190  // another expression. So don't call resetVectorValue(StoredVal).
3191 
3192  // If the address is consecutive but reversed, then the
3193  // wide store needs to start at the last vector element.
3194  PartPtr =
3195  Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
3196  PartPtr =
3197  Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
3198  if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
3199  Mask[Part] = reverseVector(Mask[Part]);
3200  }
3201 
3202  Value *VecPtr =
3203  Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
3204 
3205  if (isMaskRequired)
3206  NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
3207  Mask[Part]);
3208  else
3209  NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
3210  }
3211  addMetadata(NewSI, SI);
3212  }
3213  return;
3214  }
3215 
3216  // Handle loads.
3217  assert(LI && "Must have a load instruction");
3219  for (unsigned Part = 0; Part < UF; ++Part) {
3220  Value *NewLI;
3221  if (CreateGatherScatter) {
3222  Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
3223  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
3224  NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
3225  nullptr, "wide.masked.gather");
3226  addMetadata(NewLI, LI);
3227  } else {
3228  // Calculate the pointer for the specific unroll-part.
3229  Value *PartPtr =
3230  Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
3231 
3232  if (Reverse) {
3233  // If the address is consecutive but reversed, then the
3234  // wide load needs to start at the last vector element.
3235  PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
3236  PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
3237  if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
3238  Mask[Part] = reverseVector(Mask[Part]);
3239  }
3240 
3241  Value *VecPtr =
3242  Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
3243  if (isMaskRequired)
3244  NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
3245  UndefValue::get(DataTy),
3246  "wide.masked.load");
3247  else
3248  NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
3249 
3250  // Add metadata to the load, but setVectorValue to the reverse shuffle.
3251  addMetadata(NewLI, LI);
3252  if (Reverse)
3253  NewLI = reverseVector(NewLI);
3254  }
3255  VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
3256  }
3257 }
3258 
3259 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
3260  const VPIteration &Instance,
3261  bool IfPredicateInstr) {
3262  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
3263 
3264  setDebugLocFromInst(Builder, Instr);
3265 
3266  // Does this instruction return a value ?
3267  bool IsVoidRetTy = Instr->getType()->isVoidTy();
3268 
3269  Instruction *Cloned = Instr->clone();
3270  if (!IsVoidRetTy)
3271  Cloned->setName(Instr->getName() + ".cloned");
3272 
3273  // Replace the operands of the cloned instructions with their scalar
3274  // equivalents in the new loop.
3275  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
3276  auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
3277  Cloned->setOperand(op, NewOp);
3278  }
3279  addNewMetadata(Cloned, Instr);
3280 
3281  // Place the cloned scalar in the new loop.
3282  Builder.Insert(Cloned);
3283 
3284  // Add the cloned scalar to the scalar map entry.
3285  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
3286 
3287  // If we just cloned a new assumption, add it the assumption cache.
3288  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
3289  if (II->getIntrinsicID() == Intrinsic::assume)
3290  AC->registerAssumption(II);
3291 
3292  // End if-block.
3293  if (IfPredicateInstr)
3294  PredicatedInstructions.push_back(Cloned);
3295 }
3296 
3297 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
3298  Value *End, Value *Step,
3299  Instruction *DL) {
3300  BasicBlock *Header = L->getHeader();
3301  BasicBlock *Latch = L->getLoopLatch();
3302  // As we're just creating this loop, it's possible no latch exists
3303  // yet. If so, use the header as this will be a single block loop.
3304  if (!Latch)
3305  Latch = Header;
3306 
3309  setDebugLocFromInst(Builder, OldInst);
3310  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
3311 
3313  setDebugLocFromInst(Builder, OldInst);
3314 
3315  // Create i+1 and fill the PHINode.
3316  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
3317  Induction->addIncoming(Start, L->getLoopPreheader());
3318  Induction->addIncoming(Next, Latch);
3319  // Create the compare.
3320  Value *ICmp = Builder.CreateICmpEQ(Next, End);
3321  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
3322 
3323  // Now we have two terminators. Remove the old one from the block.
3324  Latch->getTerminator()->eraseFromParent();
3325 
3326  return Induction;
3327 }
3328 
3329 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3330  if (TripCount)
3331  return TripCount;
3332 
3334  // Find the loop boundaries.
3335  ScalarEvolution *SE = PSE.getSE();
3336  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3337  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
3338  "Invalid loop count");
3339 
3340  Type *IdxTy = Legal->getWidestInductionType();
3341 
3342  // The exit count might have the type of i64 while the phi is i32. This can
3343  // happen if we have an induction variable that is sign extended before the
3344  // compare. The only way that we get a backedge taken count is that the
3345  // induction variable was signed and as such will not overflow. In such a case
3346  // truncation is legal.
3347  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
3348  IdxTy->getPrimitiveSizeInBits())
3349  BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3350  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3351 
3352  // Get the total trip count from the count by adding 1.
3353  const SCEV *ExitCount = SE->getAddExpr(
3354  BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3355 
3356  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3357 
3358  // Expand the trip count and place the new instructions in the preheader.
3359  // Notice that the pre-header does not change, only the loop body.
3360  SCEVExpander Exp(*SE, DL, "induction");
3361 
3362  // Count holds the overall loop count (N).
3363  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3364  L->getLoopPreheader()->getTerminator());
3365 
3366  if (TripCount->getType()->isPointerTy())
3367  TripCount =
3368  CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3369  L->getLoopPreheader()->getTerminator());
3370 
3371  return TripCount;
3372 }
3373 
3374 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3375  if (VectorTripCount)
3376  return VectorTripCount;
3377 
3378  Value *TC = getOrCreateTripCount(L);
3380 
3381  // Now we need to generate the expression for the part of the loop that the
3382  // vectorized body will execute. This is equal to N - (N % Step) if scalar
3383  // iterations are not required for correctness, or N - Step, otherwise. Step
3384  // is equal to the vectorization factor (number of SIMD elements) times the
3385  // unroll factor (number of SIMD instructions).
3386  Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
3387  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3388 
3389  // If there is a non-reversed interleaved group that may speculatively access
3390  // memory out-of-bounds, we need to ensure that there will be at least one
3391  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
3392  // the trip count, we set the remainder to be equal to the step. If the step
3393  // does not evenly divide the trip count, no adjustment is necessary since
3394  // there will already be scalar iterations. Note that the minimum iterations
3395  // check ensures that N >= Step.
3396  if (VF > 1 && Legal->requiresScalarEpilogue()) {
3397  auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3398  R = Builder.CreateSelect(IsZero, Step, R);
3399  }
3400 
3401  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
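// Worked example (illustrative, not from the original source): with a trip
// count of 37, VF = 4 and UF = 2, Step is 8, n.mod.vf = 37 % 8 = 5 and
// n.vec = 32. If a scalar epilogue is required and the trip count is 40, the
// zero remainder is bumped to 8, giving n.vec = 32 so the epilogue still runs.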
3402 
3403  return VectorTripCount;
3404 }
3405 
3406 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3407  const DataLayout &DL) {
3408  // Verify that V is a vector type with same number of elements as DstVTy.
3409  unsigned VF = DstVTy->getNumElements();
3410  VectorType *SrcVecTy = cast<VectorType>(V->getType());
3411  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3412  Type *SrcElemTy = SrcVecTy->getElementType();
3413  Type *DstElemTy = DstVTy->getElementType();
3414  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3415  "Vector elements must have same size");
3416 
3417  // Do a direct cast if element types are castable.
3418  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3419  return Builder.CreateBitOrPointerCast(V, DstVTy);
3420  }
3421  // V cannot be directly cast to the desired vector type.
3422  // May happen when V is a floating point vector but DstVTy is a vector of
3423  // pointers or vice-versa. Handle this using a two-step bitcast using an
3424  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
3425  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3426  "Only one type should be a pointer type");
3427  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3428  "Only one type should be a floating point type");
3429  Type *IntTy =
3431  VectorType *VecIntTy = VectorType::get(IntTy, VF);
3432  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3433  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
3434 }
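// Illustration (assumption, not from the original source): on a target with
// 64-bit pointers, casting <2 x double> to <2 x i8*> is not a legal single
// bitcast, so it is done in two steps through the intermediate integer vector:
//   <2 x double> -> <2 x i64> -> <2 x i8*>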
3435 
3436 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3437  BasicBlock *Bypass) {
3438  Value *Count = getOrCreateTripCount(L);
3439  BasicBlock *BB = L->getLoopPreheader();
3441 
3442  // Generate code to check if the loop's trip count is less than VF * UF, or
3443  // equal to it in case a scalar epilogue is required; this implies that the
3444  // vector trip count is zero. This check also covers the case where adding one
3445  // to the backedge-taken count overflowed leading to an incorrect trip count
3446  // of zero. In this case we will also jump to the scalar loop.
3447  auto P = Legal->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
3448  : ICmpInst::ICMP_ULT;
3449  Value *CheckMinIters = Builder.CreateICmp(
3450  P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
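// Illustration (not part of the original source): with VF = 4 and UF = 2 this
// compares the trip count against 8, using an unsigned less-than (or
// less-or-equal when a scalar epilogue is required) comparison, and branches
// to the scalar loop when the vector body would not execute.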
3451 
3452  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
3453  // Update dominator tree immediately if the generated block is a
3454  // LoopBypassBlock because SCEV expansions to generate loop bypass
3455  // checks may query it before the current function is finished.
3456  DT->addNewBlock(NewBB, BB);
3457  if (L->getParentLoop())
3458  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3459  ReplaceInstWithInst(BB->getTerminator(),
3460  BranchInst::Create(Bypass, NewBB, CheckMinIters));
3461  LoopBypassBlocks.push_back(BB);
3462 }
3463 
3464 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3465  BasicBlock *BB = L->getLoopPreheader();
3466 
3467  // Generate the code to check the SCEV assumptions that we made.
3468  // We want the new basic block to start at the first instruction in a
3469  // sequence of instructions that form a check.
3470  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
3471  "scev.check");
3472  Value *SCEVCheck =
3473  Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
3474 
3475  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
3476  if (C->isZero())
3477  return;
3478 
3479  // Create a new block containing the stride check.
3480  BB->setName("vector.scevcheck");
3481  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
3482  // Update dominator tree immediately if the generated block is a
3483  // LoopBypassBlock because SCEV expansions to generate loop bypass
3484  // checks may query it before the current function is finished.
3485  DT->addNewBlock(NewBB, BB);
3486  if (L->getParentLoop())
3487  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3488  ReplaceInstWithInst(BB->getTerminator(),
3489  BranchInst::Create(Bypass, NewBB, SCEVCheck));
3490  LoopBypassBlocks.push_back(BB);
3491  AddedSafetyChecks = true;
3492 }
3493 
3494 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
3495  BasicBlock *BB = L->getLoopPreheader();
3496 
3497  // Generate the code that checks at runtime if arrays overlap. We put the
3498  // checks into a separate block to make the more common case of few elements
3499  // faster.
3500  Instruction *FirstCheckInst;
3501  Instruction *MemRuntimeCheck;
3502  std::tie(FirstCheckInst, MemRuntimeCheck) =
3503  Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
3504  if (!MemRuntimeCheck)
3505  return;
3506 
3507  // Create a new block containing the memory check.
3508  BB->setName("vector.memcheck");
3509  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
3510  // Update dominator tree immediately if the generated block is a
3511  // LoopBypassBlock because SCEV expansions to generate loop bypass
3512  // checks may query it before the current function is finished.
3513  DT->addNewBlock(NewBB, BB);
3514  if (L->getParentLoop())
3515  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3516  ReplaceInstWithInst(BB->getTerminator(),
3517  BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
3518  LoopBypassBlocks.push_back(BB);
3519  AddedSafetyChecks = true;
3520 
3521  // We currently don't use LoopVersioning for the actual loop cloning but we
3522  // still use it to add the noalias metadata.
3523  LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
3524  PSE.getSE());
3525  LVer->prepareNoAliasMetadata();
3526 }
3527 
3528 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3529  /*
3530  In this function we generate a new loop. The new loop will contain
3531  the vectorized instructions while the old loop will continue to run the
3532  scalar remainder.
3533 
3534  [ ] <-- loop iteration number check.
3535  / |
3536  / v
3537  | [ ] <-- vector loop bypass (may consist of multiple blocks).
3538  | / |
3539  | / v
3540  || [ ] <-- vector pre header.
3541  |/ |
3542  | v
3543  | [ ] \
3544  | [ ]_| <-- vector loop.
3545  | |
3546  | v
3547  | -[ ] <--- middle-block.
3548  | / |
3549  | / v
3550  -|- >[ ] <--- new preheader.
3551  | |
3552  | v
3553  | [ ] \
3554  | [ ]_| <-- old scalar loop to handle remainder.
3555  \ |
3556  \ v
3557  >[ ] <-- exit block.
3558  ...
3559  */
3560 
3561  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
3562  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
3563  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
3564  assert(VectorPH && "Invalid loop structure");
3565  assert(ExitBlock && "Must have an exit block");
3566 
3567  // Some loops have a single integer induction variable, while other loops
3568  // don't. One example is C++ iterators, which often have multiple pointer
3569  // induction variables. In the code below we also support a case where we
3570  // don't have a single induction variable.
3571  //
3572  // We try to obtain an induction variable from the original loop as hard
3573  // as possible. However if we don't find one that:
3574  // - is an integer
3575  // - counts from zero, stepping by one
3576  // - is the size of the widest induction variable type
3577  // then we create a new one.
3578  OldInduction = Legal->getPrimaryInduction();
3579  Type *IdxTy = Legal->getWidestInductionType();
3580 
3581  // Split the single block loop into the two loop structure described above.
3582  BasicBlock *VecBody =
3583  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
3584  BasicBlock *MiddleBlock =
3585  VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
3586  BasicBlock *ScalarPH =
3587  MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
3588 
3589  // Create and register the new vector loop.
3590  Loop *Lp = LI->AllocateLoop();
3591  Loop *ParentLoop = OrigLoop->getParentLoop();
3592 
3593  // Insert the new loop into the loop nest and register the new basic blocks
3594  // before calling any utilities such as SCEV that require valid LoopInfo.
3595  if (ParentLoop) {
3596  ParentLoop->addChildLoop(Lp);
3597  ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
3598  ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
3599  } else {
3600  LI->addTopLevelLoop(Lp);
3601  }
3602  Lp->addBasicBlockToLoop(VecBody, *LI);
3603 
3604  // Find the loop boundaries.
3605  Value *Count = getOrCreateTripCount(Lp);
3606 
3607  Value *StartIdx = ConstantInt::get(IdxTy, 0);
3608 
3609  // Now, compare the new count to zero. If it is zero skip the vector loop and
3610  // jump to the scalar loop. This check also covers the case where the
3611  // backedge-taken count is uint##_max: adding one to it will overflow leading
3612  // to an incorrect trip count of zero. In this (rare) case we will also jump
3613  // to the scalar loop.
3614  emitMinimumIterationCountCheck(Lp, ScalarPH);
3615 
3616  // Generate the code to check any assumptions that we've made for SCEV
3617  // expressions.
3618  emitSCEVChecks(Lp, ScalarPH);
3619 
3620  // Generate the code that checks at runtime if arrays overlap. We put the
3621  // checks into a separate block to make the more common case of few elements
3622  // faster.
3623  emitMemRuntimeChecks(Lp, ScalarPH);
3624 
3625  // Generate the induction variable.
3626  // The loop step is equal to the vectorization factor (num of SIMD elements)
3627  // times the unroll factor (num of SIMD instructions).
3628  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3629  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3630  Induction =
3631  createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3632  getDebugLocFromInstOrOperands(OldInduction));
3633 
3634  // We are going to resume the execution of the scalar loop.
3635  // Go over all of the induction variables that we found and fix the
3636  // PHIs that are left in the scalar version of the loop.
3637  // The starting values of PHI nodes depend on the counter of the last
3638  // iteration in the vectorized loop.
3639  // If we come from a bypass edge then we need to start from the original
3640  // start value.
3641 
3642  // This variable saves the new starting index for the scalar loop. It is used
3643  // to test if there are any tail iterations left once the vector loop has
3644  // completed.
3645  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
3646  for (auto &InductionEntry : *List) {
3647  PHINode *OrigPhi = InductionEntry.first;
3648  InductionDescriptor II = InductionEntry.second;
3649 
3650  // Create phi nodes to merge from the backedge-taken check block.
3651  PHINode *BCResumeVal = PHINode::Create(
3652  OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
3653  Value *&EndValue = IVEndValues[OrigPhi];
3654  if (OrigPhi == OldInduction) {
3655  // We know what the end value is.
3656  EndValue = CountRoundDown;
3657  } else {
3659  Type *StepType = II.getStep()->getType();
3660  Instruction::CastOps CastOp =
3661  CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3662  Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3663  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3664  EndValue = II.transform(B, CRD, PSE.getSE(), DL);
3665  EndValue->setName("ind.end");
3666  }
3667 
3668  // The new PHI merges the original incoming value, in case of a bypass,
3669  // or the value at the end of the vectorized loop.
3670  BCResumeVal->addIncoming(EndValue, MiddleBlock);
3671 
3672  // Fix the scalar body counter (PHI node).
3673  unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
3674 
3675  // The old induction's phi node in the scalar body needs the truncated
3676  // value.
3677  for (BasicBlock *BB : LoopBypassBlocks)
3678  BCResumeVal->addIncoming(II.getStartValue(), BB);
3679  OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
3680  }
3681 
3682  // Add a check in the middle block to see if we have completed
3683  // all of the iterations in the first vector loop.
3684  // If (N - N%VF) == N, then we *don't* need to run the remainder.
3685  Value *CmpN =
3686  CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3687  CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3688  ReplaceInstWithInst(MiddleBlock->getTerminator(),
3689  BranchInst::Create(ExitBlock, ScalarPH, CmpN));
3690 
3691  // Get ready to start creating new instructions into the vectorized body.
3693 
3694  // Save the state.
3696  LoopScalarPreHeader = ScalarPH;
3697  LoopMiddleBlock = MiddleBlock;
3698  LoopExitBlock = ExitBlock;
3699  LoopVectorBody = VecBody;
3700  LoopScalarBody = OldBasicBlock;
3701 
3702  // Keep all loop hints from the original loop on the vector loop (we'll
3703  // replace the vectorizer-specific hints below).
3704  if (MDNode *LID = OrigLoop->getLoopID())
3705  Lp->setLoopID(LID);
3706 
3707  LoopVectorizeHints Hints(Lp, true, *ORE);
3708  Hints.setAlreadyVectorized();
3709 
3710  return LoopVectorPreHeader;
3711 }
3712 
3713 // Fix up external users of the induction variable. At this point, we are
3714 // in LCSSA form, with all external PHIs that use the IV having one input value,
3715 // coming from the remainder loop. We need those PHIs to also have a correct
3716 // value for the IV when arriving directly from the middle block.
3717 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3718  const InductionDescriptor &II,
3719  Value *CountRoundDown, Value *EndValue,
3720  BasicBlock *MiddleBlock) {
3721  // There are two kinds of external IV usages - those that use the value
3722  // computed in the last iteration (the PHI) and those that use the penultimate
3723  // value (the value that feeds into the phi from the loop latch).
3724  // We allow both, but they, obviously, have different values.
3725 
3726  assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3727 
3728  DenseMap<Value *, Value *> MissingVals;
3729 
3730  // An external user of the last iteration's value should see the value that
3731  // the remainder loop uses to initialize its own IV.
3732  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3733  for (User *U : PostInc->users()) {
3734  Instruction *UI = cast<Instruction>(U);
3735  if (!OrigLoop->contains(UI)) {
3736  assert(isa<PHINode>(UI) && "Expected LCSSA form");
3737  MissingVals[UI] = EndValue;
3738  }
3739  }
3740 
3741  // An external user of the penultimate value needs to see EndValue - Step.
3742  // The simplest way to get this is to recompute it from the constituent SCEVs,
3743  // that is Start + (Step * (CRD - 1)).
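// E.g. (illustrative): for an induction starting at 0 with Step = 1 and
// CountRoundDown = 8, such a user sees Start + Step * (CRD - 1) = 7, i.e. the
// value of the IV in the last vector iteration before it is incremented.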
3744  for (User *U : OrigPhi->users()) {
3745  auto *UI = cast<Instruction>(U);
3746  if (!OrigLoop->contains(UI)) {
3747  const DataLayout &DL =
3748  OrigLoop->getHeader()->getModule()->getDataLayout();
3749  assert(isa<PHINode>(UI) && "Expected LCSSA form");
3750 
3751  IRBuilder<> B(MiddleBlock->getTerminator());
3752  Value *CountMinusOne = B.CreateSub(
3753  CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3754  Value *CMO =
3755  !II.getStep()->getType()->isIntegerTy()
3756  ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3757  II.getStep()->getType())
3758  : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3759  CMO->setName("cast.cmo");
3760  Value *Escape = II.transform(B, CMO, PSE.getSE(), DL);
3761  Escape->setName("ind.escape");
3762  MissingVals[UI] = Escape;
3763  }
3764  }
3765 
3766  for (auto &I : MissingVals) {
3767  PHINode *PHI = cast<PHINode>(I.first);
3768  // One corner case we have to handle is two IVs "chasing" each-other,
3769  // that is %IV2 = phi [...], [ %IV1, %latch ]
3770  // In this case, if IV1 has an external use, we need to avoid adding both
3771  // "last value of IV1" and "penultimate value of IV2". So, verify that we
3772  // don't already have an incoming value for the middle block.
3773  if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3774  PHI->addIncoming(I.second, MiddleBlock);
3775  }
3776 }
3777 
3778 namespace {
3779 
3780 struct CSEDenseMapInfo {
3781  static bool canHandle(const Instruction *I) {
3782  return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3783  isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3784  }
3785 
3786  static inline Instruction *getEmptyKey() {
3787  return DenseMapInfo<Instruction *>::getEmptyKey();
3788  }
3789 
3790  static inline Instruction *getTombstoneKey() {
3791  return DenseMapInfo<Instruction *>::getTombstoneKey();
3792  }
3793 
3794  static unsigned getHashValue(const Instruction *I) {
3795  assert(canHandle(I) && "Unknown instruction!");
3796  return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3797  I->value_op_end()));
3798  }
3799 
3800  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3801  if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3802  LHS == getTombstoneKey() || RHS == getTombstoneKey())
3803  return LHS == RHS;
3804  return LHS->isIdenticalTo(RHS);
3805  }
3806 };
3807 
3808 } // end anonymous namespace
3809 
3810 ///\brief Perform CSE of induction variable instructions.
3811 static void cse(BasicBlock *BB) {
3812  // Perform simple cse.
3813  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3814  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3815  Instruction *In = &*I++;
3816 
3817  if (!CSEDenseMapInfo::canHandle(In))
3818  continue;
3819 
3820  // Check if we can replace this instruction with any of the
3821  // visited instructions.
3822  if (Instruction *V = CSEMap.lookup(In)) {
3823  In->replaceAllUsesWith(V);
3824  In->eraseFromParent();
3825  continue;
3826  }
3827 
3828  CSEMap[In] = In;
3829  }
3830 }
3831 
3832 /// \brief Estimate the overhead of scalarizing an instruction. This is a
3833 /// convenience wrapper for the type-based getScalarizationOverhead API.
3834 static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
3835  const TargetTransformInfo &TTI) {
3836  if (VF == 1)
3837  return 0;
3838 
3839  unsigned Cost = 0;
3840  Type *RetTy = ToVectorTy(I->getType(), VF);
3841  if (!RetTy->isVoidTy() &&
3842  (!isa<LoadInst>(I) ||
3843  !TTI.supportsEfficientVectorElementLoadStore()))
3844  Cost += TTI.getScalarizationOverhead(RetTy, true, false);
3845 
3846  if (CallInst *CI = dyn_cast<CallInst>(I)) {
3847  SmallVector<const Value *, 4> Operands(CI->arg_operands());
3848  Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
3849  }
3850  else if (!isa<StoreInst>(I) ||
3851  !TTI.supportsEfficientVectorElementLoadStore()) {
3852  SmallVector<const Value *, 4> Operands(I->operand_values());
3853  Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
3854  }
3855 
3856  return Cost;
3857 }
3858 
3859 // Estimate cost of a call instruction CI if it were vectorized with factor VF.
3860 // Return the cost of the instruction, including scalarization overhead if it's
3861 // needed. The flag NeedToScalarize shows whether the call needs to be scalarized,
3862 // i.e. either a vector version isn't available or it is too expensive.
3863 static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
3864  const TargetTransformInfo &TTI,
3865  const TargetLibraryInfo *TLI,
3866  bool &NeedToScalarize) {
3867  Function *F = CI->getCalledFunction();
3868  StringRef FnName = CI->getCalledFunction()->getName();
3869  Type *ScalarRetTy = CI->getType();
3870  SmallVector<Type *, 4> Tys, ScalarTys;
3871  for (auto &ArgOp : CI->arg_operands())
3872  ScalarTys.push_back(ArgOp->getType());
3873 
3874  // Estimate cost of scalarized vector call. The source operands are assumed
3875  // to be vectors, so we need to extract individual elements from there,
3876  // execute VF scalar calls, and then gather the result into the vector return
3877  // value.
3878  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3879  if (VF == 1)
3880  return ScalarCallCost;
3881 
3882  // Compute corresponding vector type for return value and arguments.
3883  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3884  for (Type *ScalarTy : ScalarTys)
3885  Tys.push_back(ToVectorTy(ScalarTy, VF));
3886 
3887  // Compute costs of unpacking argument values for the scalar calls and
3888  // packing the return values to a vector.
3889  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
3890 
3891  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3892 
3893  // If we can't emit a vector call for this function, then the currently found
3894  // cost is the cost we need to return.
3895  NeedToScalarize = true;
3896  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3897  return Cost;
3898 
3899  // If the corresponding vector cost is cheaper, return its cost.
3900  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3901  if (VectorCallCost < Cost) {
3902  NeedToScalarize = false;
3903  return VectorCallCost;
3904  }
3905  return Cost;
3906 }
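// Worked example (illustrative numbers only): if the scalar call costs 10,
// VF = 4 and the scalarization overhead is 12, the scalarized cost is
// 10 * 4 + 12 = 52. If a vectorized library version of the callee is available
// at cost 20, it is cheaper, so NeedToScalarize is set to false and 20 is
// returned.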
3907 
3908 // Estimate cost of an intrinsic call instruction CI if it were vectorized with
3909 // factor VF. Return the cost of the instruction, including scalarization
3910 // overhead if it's needed.
3911 static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
3912  const TargetTransformInfo &TTI,
3913  const TargetLibraryInfo *TLI) {
3914  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3915  assert(ID && "Expected intrinsic call!");
3916 
3917  FastMathFlags FMF;
3918  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3919  FMF = FPMO->getFastMathFlags();
3920 
3921  SmallVector<Value *, 4> Operands(CI->arg_operands());
3922  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3923 }
3924 
3925 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3926  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3927  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3928  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3929 }
3930 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3931  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3932  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3933  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3934 }
3935 
3936 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3937  // For every instruction `I` in MinBWs, truncate the operands, create a
3938  // truncated version of `I` and reextend its result. InstCombine runs
3939  // later and will remove any ext/trunc pairs.
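// Illustrative sketch (assumption, not from the original source; VF = 4 and a
// value proven to need only 8 bits): a 32-bit add is rewritten roughly as
//   %a8 = trunc <4 x i32> %a to <4 x i8>
//   %b8 = trunc <4 x i32> %b to <4 x i8>
//   %s8 = add <4 x i8> %a8, %b8
//   %s  = zext <4 x i8> %s8 to <4 x i32>
// leaving InstCombine to clean up redundant ext/trunc pairs.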
3940  SmallPtrSet<Value *, 4> Erased;
3941  for (const auto &KV : Cost->getMinimalBitwidths()) {
3942  // If the value wasn't vectorized, we must maintain the original scalar
3943  // type. The absence of the value from VectorLoopValueMap indicates that it
3944  // wasn't vectorized.
3945  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3946  continue;
3947  for (unsigned Part = 0; Part < UF; ++Part) {
3948  Value *I = getOrCreateVectorValue(KV.first, Part);
3949  if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3950  continue;
3951  Type *OriginalTy = I->getType();
3952  Type *ScalarTruncatedTy =
3953  IntegerType::get(OriginalTy->getContext(), KV.second);
3954  Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3955  OriginalTy->getVectorNumElements());
3956  if (TruncatedTy == OriginalTy)
3957  continue;
3958 
3959  IRBuilder<> B(cast<Instruction>(I));
3960  auto ShrinkOperand = [&](Value *V) -> Value * {
3961  if (auto *ZI = dyn_cast<ZExtInst>(V))
3962  if (ZI->getSrcTy() == TruncatedTy)
3963  return ZI->getOperand(0);
3964  return B.CreateZExtOrTrunc(V, TruncatedTy);
3965  };
3966 
3967  // The actual instruction modification depends on the instruction type,
3968  // unfortunately.
3969  Value *NewI = nullptr;
3970  if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3971  NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3972  ShrinkOperand(BO->getOperand(1)));
3973 
3974  // Any wrapping introduced by shrinking this operation shouldn't be
3975  // considered undefined behavior. So, we can't unconditionally copy
3976  // arithmetic wrapping flags to NewI.
3977  cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3978  } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3979  NewI =
3980  B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3981  ShrinkOperand(CI->getOperand(1)));
3982  } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3983  NewI = B.CreateSelect(SI->getCondition(),
3984  ShrinkOperand(SI->getTrueValue()),
3985  ShrinkOperand(SI->getFalseValue()));
3986  } else if (auto *CI = dyn_cast<CastInst>(I)) {
3987  switch (CI->getOpcode()) {
3988  default:
3989  llvm_unreachable("Unhandled cast!");
3990  case Instruction::Trunc:
3991  NewI = ShrinkOperand(CI->getOperand(0));
3992  break;
3993  case Instruction::SExt:
3994  NewI = B.CreateSExtOrTrunc(
3995  CI->getOperand(0),
3996  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3997  break;
3998  case Instruction::ZExt:
3999  NewI = B.CreateZExtOrTrunc(
4000  CI->getOperand(0),
4001  smallestIntegerVectorType(OriginalTy, TruncatedTy));
4002  break;
4003  }
4004  } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
4005  auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
4006  auto *O0 = B.CreateZExtOrTrunc(
4007  SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
4008  auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
4009  auto *O1 = B.CreateZExtOrTrunc(
4010  SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
4011 
4012  NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
4013  } else if (isa<LoadInst>(I)) {
4014  // Don't do anything with the operands, just extend the result.
4015  continue;
4016  } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
4017  auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
4018  auto *O0 = B.CreateZExtOrTrunc(
4019  IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
4020  auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
4021  NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
4022  } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
4023  auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
4024  auto *O0 = B.CreateZExtOrTrunc(
4025  EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
4026  NewI = B.CreateExtractElement(O0, EE->getOperand(2));
4027  } else {
4028  llvm_unreachable("Unhandled instruction type!");
4029  }
4030 
4031  // Lastly, extend the result.
4032  NewI->takeName(cast<Instruction>(I));
4033  Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
4034  I->replaceAllUsesWith(Res);
4035  cast<Instruction>(I)->eraseFromParent();
4036  Erased.insert(I);
4037  VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
4038  }
4039  }
4040 
4041  // We'll have created a bunch of ZExts that are now parentless. Clean up.
4042  for (const auto &KV : Cost->getMinimalBitwidths()) {
4043  // If the value wasn't vectorized, we must maintain the original scalar
4044  // type. The absence of the value from VectorLoopValueMap indicates that it
4045  // wasn't vectorized.
4046  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
4047  continue;
4048  for (unsigned Part = 0; Part < UF; ++Part) {
4049  Value *I = getOrCreateVectorValue(KV.first, Part);
4050  ZExtInst *Inst = dyn_cast<ZExtInst>(I);
4051  if (Inst && Inst->use_empty()) {
4052  Value *NewI = Inst->getOperand(0);
4053  Inst->eraseFromParent();
4054  VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
4055  }
4056  }
4057  }
4058 }
4059 
4060 void InnerLoopVectorizer::fixVectorizedLoop() {
4061  // Insert truncates and extends for any truncated instructions as hints to
4062  // InstCombine.
4063  if (VF > 1)
4064  truncateToMinimalBitwidths();
4065 
4066  // At this point every instruction in the original loop is widened to a
4067  // vector form. Now we need to fix the recurrences in the loop. These PHI
4068  // nodes are currently empty because we did not want to introduce cycles.
4069  // This is the second stage of vectorizing recurrences.
4070  fixCrossIterationPHIs();
4071 
4072  // Update the dominator tree.
4073  //
4074  // FIXME: After creating the structure of the new loop, the dominator tree is
4075  // no longer up-to-date, and it remains that way until we update it
4076  // here. An out-of-date dominator tree is problematic for SCEV,
4077  // because SCEVExpander uses it to guide code generation. The
4078  // vectorizer use SCEVExpanders in several places. Instead, we should
4079  // keep the dominator tree up-to-date as we go.
4080  updateAnalysis();
4081 
4082  // Fix-up external users of the induction variables.
4083  for (auto &Entry : *Legal->getInductionVars())
4084  fixupIVUsers(Entry.first, Entry.second,
4086  IVEndValues[Entry.first], LoopMiddleBlock);
4087 
4088  fixLCSSAPHIs();
4089  for (Instruction *PI : PredicatedInstructions)
4090  sinkScalarOperands(&*PI);
4091 
4092  // Remove redundant induction instructions.
4093  cse(LoopVectorBody);
4094 }
4095 
4096 void InnerLoopVectorizer::fixCrossIterationPHIs() {
4097  // In order to support recurrences we need to be able to vectorize Phi nodes.
4098  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4099  // stage #2: We now need to fix the recurrences by adding incoming edges to
4100  // the currently empty PHI nodes. At this point every instruction in the
4101  // original loop is widened to a vector form so we can use them to construct
4102  // the incoming edges.
4103  for (Instruction &I : *OrigLoop->getHeader()) {
4104  PHINode *Phi = dyn_cast<PHINode>(&I);
4105  if (!Phi)
4106  break;
4107  // Handle first-order recurrences and reductions that need to be fixed.
4108  if (Legal->isFirstOrderRecurrence(Phi))
4109  fixFirstOrderRecurrence(Phi);
4110  else if (Legal->isReductionVariable(Phi))
4111  fixReduction(Phi);
4112  }
4113 }
4114 
4115 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
4116  // This is the second phase of vectorizing first-order recurrences. An
4117  // overview of the transformation is described below. Suppose we have the
4118  // following loop.
4119  //
4120  // for (int i = 0; i < n; ++i)
4121  // b[i] = a[i] - a[i - 1];
4122  //
4123  // There is a first-order recurrence on "a". For this loop, the shorthand
4124  // scalar IR looks like:
4125  //
4126  // scalar.ph:
4127  // s_init = a[-1]
4128  // br scalar.body
4129  //
4130  // scalar.body:
4131  // i = phi [0, scalar.ph], [i+1, scalar.body]
4132  // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4133  // s2 = a[i]
4134  // b[i] = s2 - s1
4135  // br cond, scalar.body, ...
4136  //
4137  // In this example, s1 is a recurrence because its value depends on the
4138  // previous iteration. In the first phase of vectorization, we created a
4139  // temporary value for s1. We now complete the vectorization and produce the
4140  // shorthand vector IR shown below (for VF = 4, UF = 1).
4141  //
4142  // vector.ph:
4143  // v_init = vector(..., ..., ..., a[-1])
4144  // br vector.body
4145  //
4146  // vector.body
4147  // i = phi [0, vector.ph], [i+4, vector.body]
4148  // v1 = phi [v_init, vector.ph], [v2, vector.body]
4149  // v2 = a[i, i+1, i+2, i+3];
4150  // v3 = vector(v1(3), v2(0, 1, 2))
4151  // b[i, i+1, i+2, i+3] = v2 - v3
4152  // br cond, vector.body, middle.block
4153  //
4154  // middle.block:
4155  // x = v2(3)
4156  // br scalar.ph
4157  //
4158  // scalar.ph:
4159  // s_init = phi [x, middle.block], [a[-1], otherwise]
4160  // br scalar.body
4161  //
4162  // After execution completes the vector loop, we extract the next value of
4163  // the recurrence (x) to use as the initial value in the scalar loop.
4164 
4165  // Get the original loop preheader and single loop latch.
4166  auto *Preheader = OrigLoop->getLoopPreheader();
4167  auto *Latch = OrigLoop->getLoopLatch();
4168 
4169  // Get the initial and previous values of the scalar recurrence.
4170  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4171  auto *Previous = Phi->getIncomingValueForBlock(Latch);
4172 
4173  // Create a vector from the initial value.
4174  auto *VectorInit = ScalarInit;
4175  if (VF > 1) {
4177  VectorInit = Builder.CreateInsertElement(
4178  UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
4179  Builder.getInt32(VF - 1), "vector.recur.init");
4180  }
4181 
4182  // We constructed a temporary phi node in the first phase of vectorization.
4183  // This phi node will eventually be deleted.
4185  cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
4186 
4187  // Create a phi node for the new recurrence. The current value will either be
4188  // the initial value inserted into a vector or loop-varying vector value.
4189  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4190  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4191 
4192  // Get the vectorized previous value of the last part UF - 1. It appears last
4193  // among all unrolled iterations, due to the order of their construction.
4194  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
4195 
4196  // Set the insertion point after the previous value if it is an instruction.
4197  // Note that the previous value may have been constant-folded so it is not
4198  // guaranteed to be an instruction in the vector loop. Also, if the previous
4199  // value is a phi node, we should insert after all the phi nodes to avoid
4200  // breaking basic block verification.
4201  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
4202  isa<PHINode>(PreviousLastPart))
4204  else
4206  &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
4207 
4208  // We will construct a vector for the recurrence by combining the values for
4209  // the current and previous iterations. This is the required shuffle mask.
4210  SmallVector<Constant *, 8> ShuffleMask(VF);
4211  ShuffleMask[0] = Builder.getInt32(VF - 1);
4212  for (unsigned I = 1; I < VF; ++I)
4213  ShuffleMask[I] = Builder.getInt32(I + VF - 1);
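// E.g. (illustrative) for VF = 4 the mask is <3, 4, 5, 6>: element 3 selects
// the last lane of the incoming vector (the value carried in from the previous
// iteration) and elements 4-6 select the first three lanes of the newly
// computed vector.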
4214 
4215  // The vector from which to take the initial value for the current iteration
4216  // (actual or unrolled). Initially, this is the vector phi node.
4217  Value *Incoming = VecPhi;
4218 
4219  // Shuffle the current and previous vector and update the vector parts.
4220  for (unsigned Part = 0; Part < UF; ++Part) {
4221  Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
4222  Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
4223  auto *Shuffle =
4224  VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
4225  ConstantVector::get(ShuffleMask))
4226  : Incoming;
4227  PhiPart->replaceAllUsesWith(Shuffle);
4228  cast<Instruction>(PhiPart)->eraseFromParent();
4229  VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
4230  Incoming = PreviousPart;
4231  }
4232 
4233  // Fix the latch value of the new recurrence in the vector loop.
4234  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4235 
4236  // Extract the last vector element in the middle block. This will be the
4237  // initial value for the recurrence when jumping to the scalar loop.
4238  auto *ExtractForScalar = Incoming;
4239  if (VF > 1) {
4241  ExtractForScalar = Builder.CreateExtractElement(
4242  ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
4243  }
4244  // Extract the second last element in the middle block if the
4245  // Phi is used outside the loop. We need to extract the phi itself
4246  // and not the last element (the phi update in the current iteration). This
4247  // will be the value when jumping to the exit block from the LoopMiddleBlock,
4248  // when the scalar loop is not run at all.
4249  Value *ExtractForPhiUsedOutsideLoop = nullptr;
4250  if (VF > 1)
4251  ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4252  Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
4253  // When the loop is unrolled without vectorizing, initialize
4254  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
4255  // value of `Incoming`. This is analogous to the vectorized case above:
4256  // extracting the second last element when VF > 1.
4257  else if (UF > 1)
4258  ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
4259 
4260  // Fix the initial value of the original recurrence in the scalar loop.
4262  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4263  for (auto *BB : predecessors(LoopScalarPreHeader)) {
4264  auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4265  Start->addIncoming(Incoming, BB);
4266  }
4267 
4269  Phi->setName("scalar.recur");
4270 
4271  // Finally, fix users of the recurrence outside the loop. The users will need
4272  // either the last value of the scalar recurrence or the last value of the
4273  // vector recurrence we extracted in the middle block. Since the loop is in
4274  // LCSSA form, we just need to find the phi node for the original scalar
4275  // recurrence in the exit block, and then add an edge for the middle block.
4276  for (auto &I : *LoopExitBlock) {
4277  auto *LCSSAPhi = dyn_cast<PHINode>(&I);
4278  if (!LCSSAPhi)
4279  break;
4280  if (LCSSAPhi->getIncomingValue(0) == Phi) {
4281  LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4282  break;
4283  }
4284  }
4285 }
4286 
4287 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
4288  Constant *Zero = Builder.getInt32(0);
4289 
4290  // Get its reduction variable descriptor.
4291  assert(Legal->isReductionVariable(Phi) &&
4292  "Unable to find the reduction variable");
4293  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
4294 
4296  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4297  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4299  RdxDesc.getMinMaxRecurrenceKind();
4300  setDebugLocFromInst(Builder, ReductionStartValue);
4301 
4302  // We need to generate a reduction vector from the incoming scalar.
4303  // To do so, we need to generate the 'identity' vector and override
4304  // one of the elements with the incoming scalar reduction. We need
4305  // to do it in the vector-loop preheader.
4307 
4308  // This is the vector-clone of the value that leaves the loop.
4309  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
4310 
4311  // Find the reduction identity variable. Zero for addition, or, xor,
4312  // one for multiplication, -1 for And.
4313  Value *Identity;
4314  Value *VectorStart;
4317  // MinMax reductions have the start value as their identity.
4318  if (VF == 1) {
4319  VectorStart = Identity = ReductionStartValue;
4320  } else {
4321  VectorStart = Identity =
4322  Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
4323  }
4324  } else {
4325  // Handle other reduction kinds:
4327  RK, VecTy->getScalarType());
4328  if (VF == 1) {
4329  Identity = Iden;
4330  // This vector is the Identity vector where the first element is the
4331  // incoming scalar reduction.
4332  VectorStart = ReductionStartValue;
4333  } else {
4334  Identity = ConstantVector::getSplat(VF, Iden);
4335 
4336  // This vector is the Identity vector where the first element is the
4337  // incoming scalar reduction.
4338  VectorStart =
4339  Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
4340  }
4341  }
4342 
4343  // Fix the vector-loop phi.
4344 
4345  // Reductions do not have to start at zero. They can start with
4346  // any loop-invariant value.
4347  BasicBlock *Latch = OrigLoop->getLoopLatch();
4348  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4349  for (unsigned Part = 0; Part < UF; ++Part) {
4350  Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
4351  Value *Val = getOrCreateVectorValue(LoopVal, Part);
4352  // Make sure to add the reduction start value only to the
4353  // first unroll part.
4354  Value *StartVal = (Part == 0) ? VectorStart : Identity;
4355  cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
4356  cast<PHINode>(VecRdxPhi)
4357  ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4358  }
4359 
4360  // Before each round, move the insertion point right between
4361  // the PHIs and the values we are going to write.
4362  // This allows us to write both PHINodes and the extractelement
4363  // instructions.
4364  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4365 
4366  setDebugLocFromInst(Builder, LoopExitInst);
4367 
4368  // If the vector reduction can be performed in a smaller type, we truncate
4369  // then extend the loop exit value to enable InstCombine to evaluate the
4370  // entire expression in the smaller type.
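  // Sketch (types assumed): if the phi is i32 but the recurrence fits in i8,
  // each unrolled part of the loop-exit value is truncated to <VF x i8> and
  // immediately sign/zero-extended back to <VF x i32>; users are rewritten to
  // the extended value, so InstCombine can later narrow the whole chain to i8.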
4371  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
4372  Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4373  Builder.SetInsertPoint(
4374  LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4375  VectorParts RdxParts(UF);
4376  for (unsigned Part = 0; Part < UF; ++Part) {
4377  RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4378  Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4379  Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4380  : Builder.CreateZExt(Trunc, VecTy);
4381  for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4382  UI != RdxParts[Part]->user_end();)
4383  if (*UI != Trunc) {
4384  (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4385  RdxParts[Part] = Extnd;
4386  } else {
4387  ++UI;
4388  }
4389  }
4390  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4391  for (unsigned Part = 0; Part < UF; ++Part) {
4392  RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4393  VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4394  }
4395  }
4396 
4397  // Reduce all of the unrolled parts into a single vector.
4398  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4399  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4400  setDebugLocFromInst(Builder, ReducedPartRdx);
4401  for (unsigned Part = 1; Part < UF; ++Part) {
4402  Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4403  if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4404  // Floating point operations had to be 'fast' to enable the reduction.
4405  ReducedPartRdx = addFastMathFlag(
4407  ReducedPartRdx, "bin.rdx"));
4408  else
4409  ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
4410  Builder, MinMaxKind, ReducedPartRdx, RdxPart);
4411  }
4412 
4413  if (VF > 1) {
4414  bool NoNaN = Legal->hasFunNoNaNAttr();
4415  ReducedPartRdx =
4416  createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4417  // If the reduction can be performed in a smaller type, we need to extend
4418  // the reduction to the wider type before we branch to the original loop.
4419  if (Phi->getType() != RdxDesc.getRecurrenceType())
4420  ReducedPartRdx =
4421  RdxDesc.isSigned()
4422  ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4423  : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4424  }
4425 
4426  // Create a phi node that merges control-flow from the backedge-taken check
4427  // block and the middle block.
4428  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4429  LoopScalarPreHeader->getTerminator());
4430  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4431  BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4432  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4433 
4434  // Now, we need to fix the users of the reduction variable
4435  // inside and outside of the scalar remainder loop.
4436  // We know that the loop is in LCSSA form. We need to update the
4437  // PHI nodes in the exit blocks.
4438  for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
4439  LEE = LoopExitBlock->end();
4440  LEI != LEE; ++LEI) {
4441  PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
4442  if (!LCSSAPhi)
4443  break;
4444 
4445  // All PHINodes need to have a single entry edge, or two if
4446  // we already fixed them.
4447  assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4448 
4449  // We found a reduction value exit-PHI. Update it with the
4450  // incoming bypass edge.
4451  if (LCSSAPhi->getIncomingValue(0) == LoopExitInst)
4452  LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4453  } // end of the LCSSA phi scan.
4454 
4455  // Fix the scalar loop reduction variable with the incoming reduction sum
4456  // from the vector body and from the backedge value.
4457  int IncomingEdgeBlockIdx =
4458  Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4459  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4460  // Pick the other block.
4461  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4462  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4463  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4464 }
4465 
4466 void InnerLoopVectorizer::fixLCSSAPHIs() {
4467  for (Instruction &LEI : *LoopExitBlock) {
4468  auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
4469  if (!LCSSAPhi)
4470  break;
4471  if (LCSSAPhi->getNumIncomingValues() == 1) {
4472  assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) &&
4473  "Incoming value isn't loop invariant");
4474  LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock);
4475  }
4476  }
4477 }
4478 
4479 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4480  // The basic block and loop containing the predicated instruction.
4481  auto *PredBB = PredInst->getParent();
4482  auto *VectorLoop = LI->getLoopFor(PredBB);
4483 
4484  // Initialize a worklist with the operands of the predicated instruction.
4485  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4486 
4487  // Holds instructions that we need to analyze again. An instruction may be
4488  // reanalyzed if we don't yet know if we can sink it or not.
4489  SmallVector<Instruction *, 8> InstsToReanalyze;
4490 
4491  // Returns true if a given use occurs in the predicated block. Phi nodes use
4492  // their operands in their corresponding predecessor blocks.
4493  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4494  auto *I = cast<Instruction>(U.getUser());
4495  BasicBlock *BB = I->getParent();
4496  if (auto *Phi = dyn_cast<PHINode>(I))
4497  BB = Phi->getIncomingBlock(
4498  PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4499  return BB == PredBB;
4500  };
4501 
4502  // Iteratively sink the scalarized operands of the predicated instruction
4503  // into the block we created for it. When an instruction is sunk, its
4504  // operands are then added to the worklist. The algorithm ends after one pass
4505  // through the worklist fails to sink a single instruction.
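  // Sketch with assumed names: if a scalarized %div in the predicated block is
  // the only user of %a, the first pass sinks %a into that block; %a's own
  // operands are then reconsidered, so an entire scalarized chain can migrate
  // over successive passes until a pass sinks nothing.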
4506  bool Changed;
4507  do {
4508  // Add the instructions that need to be reanalyzed to the worklist, and
4509  // reset the changed indicator.
4510  Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4511  InstsToReanalyze.clear();
4512  Changed = false;
4513 
4514  while (!Worklist.empty()) {
4515  auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4516 
4517  // We can't sink an instruction if it is a phi node, is already in the
4518  // predicated block, is not in the loop, or may have side effects.
4519  if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4520  !VectorLoop->contains(I) || I->mayHaveSideEffects())
4521  continue;
4522 
4523  // It's legal to sink the instruction if all its uses occur in the
4524  // predicated block. Otherwise, there's nothing to do yet, and we may
4525  // need to reanalyze the instruction.
4526  if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4527  InstsToReanalyze.push_back(I);
4528  continue;
4529  }
4530 
4531  // Move the instruction to the beginning of the predicated block, and add
4532  // its operands to the worklist.
4533  I->moveBefore(&*PredBB->getFirstInsertionPt());
4534  Worklist.insert(I->op_begin(), I->op_end());
4535 
4536  // The sinking may have enabled other instructions to be sunk, so we will
4537  // need to iterate.
4538  Changed = true;
4539  }
4540  } while (Changed);
4541 }
4542 
4543 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4544  unsigned VF) {
4545  assert(PN->getParent() == OrigLoop->getHeader() &&
4546  "Non-header phis should have been handled elsewhere");
4547 
4548  PHINode *P = cast<PHINode>(PN);
4549  // In order to support recurrences we need to be able to vectorize Phi nodes.
4550  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4551  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4552  // this value when we vectorize all of the instructions that use the PHI.
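  // Sketch (assuming VF = 4 and an i32 phi): stage #1 emits an empty
  //   %vec.phi = phi <4 x i32>
  // with no incoming edges; the start value and back-edge value are filled in
  // later by fixReduction() or fixFirstOrderRecurrence().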
4553  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4554  for (unsigned Part = 0; Part < UF; ++Part) {
4555  // This is phase one of vectorizing PHIs.
4556  Type *VecTy =
4557  (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4558  Value *EntryPart = PHINode::Create(
4559  VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4560  VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4561  }
4562  return;
4563  }
4564 
4565  setDebugLocFromInst(Builder, P);
4566 
4567  // This PHINode must be an induction variable.
4568  // Make sure that we know about it.
4569  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
4570 
4571  InductionDescriptor II = Legal->getInductionVars()->lookup(P);
4572  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4573 
4574  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4575  // which can be found from the original scalar operations.
4576  switch (II.getKind()) {
4578  llvm_unreachable("Unknown induction");
4581  llvm_unreachable("Integer/fp induction is handled elsewhere.");
4583  // Handle the pointer induction variable case.
4584  assert(P->getType()->isPointerTy() && "Unexpected type.");
4585  // This is the normalized GEP that starts counting at zero.
4586  Value *PtrInd = Induction;
4587  PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4588  // Determine the number of scalars we need to generate for each unroll
4589  // iteration. If the instruction is uniform, we only need to generate the
4590  // first lane. Otherwise, we generate all VF values.
4591  unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4592  // These are the scalar results. Notice that we don't generate vector GEPs
4593  // because scalar GEPs result in better code.
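  // For example (a sketch assuming VF = 4, UF = 2 and a non-uniform pointer):
  // lanes 0..3 of part 0 use indices PtrInd + 0..3 and lanes 0..3 of part 1
  // use PtrInd + 4..7, each yielding its own scalar "next.gep".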
4594  for (unsigned Part = 0; Part < UF; ++Part) {
4595  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4596  Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4597  Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4598  Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
4599  SclrGep->setName("next.gep");
4600  VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4601  }
4602  }
4603  return;
4604  }
4605  }
4606 }
4607 
4608 /// A helper function for checking whether an integer division-related
4609 /// instruction may divide by zero (in which case it must be predicated if
4610 /// executed conditionally in the scalar code).
4611 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4612 /// Non-zero divisors that are not compile-time constants will not be
4613 /// converted into multiplication, so we will still end up scalarizing
4614 /// the division, but can do so w/o predication.
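/// For example, 'udiv i32 %x, %n' may divide by zero and needs predication if
/// executed conditionally, whereas 'udiv i32 %x, 7' cannot and may be
/// scalarized without predication.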
4615 static bool mayDivideByZero(Instruction &I) {
4616  assert((I.getOpcode() == Instruction::UDiv ||
4617  I.getOpcode() == Instruction::SDiv ||
4618  I.getOpcode() == Instruction::URem ||
4619  I.getOpcode() == Instruction::SRem) &&
4620  "Unexpected instruction");
4621  Value *Divisor = I.getOperand(1);
4622  auto *CInt = dyn_cast<ConstantInt>(Divisor);
4623  return !CInt || CInt->isZero();
4624 }
4625 
4626 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4627  switch (I.getOpcode()) {
4628  case Instruction::Br:
4629  case Instruction::PHI:
4630  llvm_unreachable("This instruction is handled by a different recipe.");
4631  case Instruction::GetElementPtr: {
4632  // Construct a vector GEP by widening the operands of the scalar GEP as
4633  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4634  // results in a vector of pointers when at least one operand of the GEP
4635  // is vector-typed. Thus, to keep the representation compact, we only use
4636  // vector-typed operands for loop-varying values.
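  // Sketch with assumed IR: for 'getelementptr i32, i32* %base, i64 %iv' where
  // only %iv is loop-varying and VF = 4, the base stays scalar and the index is
  // widened, giving 'getelementptr i32, i32* %base, <4 x i64> %vec.iv' whose
  // result is a <4 x i32*> vector of pointers.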
4637  auto *GEP = cast<GetElementPtrInst>(&I);
4638 
4639  if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
4640  // If we are vectorizing, but the GEP has only loop-invariant operands,
4641  // the GEP we build (by only using vector-typed operands for
4642  // loop-varying values) would be a scalar pointer. Thus, to ensure we
4643  // produce a vector of pointers, we need to either arbitrarily pick an
4644  // operand to broadcast, or broadcast a clone of the original GEP.
4645  // Here, we broadcast a clone of the original.
4646  //
4647  // TODO: If at some point we decide to scalarize instructions having
4648  // loop-invariant operands, this special case will no longer be
4649  // required. We would add the scalarization decision to
4650  // collectLoopScalars() and teach getVectorValue() to broadcast
4651  // the lane-zero scalar value.
4652  auto *Clone = Builder.Insert(GEP->clone());
4653  for (unsigned Part = 0; Part < UF; ++Part) {
4654  Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4655  VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
4656  addMetadata(EntryPart, GEP);
4657  }
4658  } else {
4659  // If the GEP has at least one loop-varying operand, we are sure to
4660  // produce a vector of pointers. But if we are only unrolling, we want
4661  // to produce a scalar GEP for each unroll part. Thus, the GEP we
4662  // produce with the code below will be scalar (if VF == 1) or vector
4663  // (otherwise). Note that for the unroll-only case, we still maintain
4664  // values in the vector mapping with initVector, as we do for other
4665  // instructions.
4666  for (unsigned Part = 0; Part < UF; ++Part) {
4667  // The pointer operand of the new GEP. If it's loop-invariant, we
4668  // won't broadcast it.
4669  auto *Ptr =
4670  OrigLoop->isLoopInvariant(GEP->getPointerOperand())
4671  ? GEP->getPointerOperand()
4672  : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4673 
4674  // Collect all the indices for the new GEP. If any index is
4675  // loop-invariant, we won't broadcast it.
4676  SmallVector<Value *, 4> Indices;
4677  for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
4678  if (OrigLoop->isLoopInvariant(U.get()))
4679  Indices.push_back(U.get());
4680  else
4681  Indices.push_back(getOrCreateVectorValue(U.get(), Part));
4682  }
4683 
4684  // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4685  // but it should be a vector, otherwise.
4686  auto *NewGEP = GEP->isInBounds()
4687  ? Builder.CreateInBoundsGEP(Ptr, Indices)
4688  : Builder.CreateGEP(Ptr, Indices);
4689  assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4690  "NewGEP is not a pointer vector");
4691  VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
4692  addMetadata(NewGEP, GEP);
4693  }
4694  }
4695 
4696  break;
4697  }
4698  case Instruction::UDiv:
4699  case Instruction::SDiv:
4700  case Instruction::SRem:
4701  case Instruction::URem:
4702  case Instruction::Add:
4703  case Instruction::FAdd:
4704  case Instruction::Sub:
4705  case Instruction::FSub:
4706  case Instruction::Mul:
4707  case Instruction::FMul:
4708  case Instruction::FDiv:
4709  case Instruction::FRem:
4710  case Instruction::Shl:
4711  case Instruction::LShr:
4712  case Instruction::AShr:
4713  case Instruction::And:
4714  case Instruction::Or:
4715  case Instruction::Xor: {
4716  // Just widen binops.
4717  auto *BinOp = cast<BinaryOperator>(&I);
4718  setDebugLocFromInst(Builder, BinOp);
4719 
4720  for (unsigned Part = 0; Part < UF; ++Part) {
4721  Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part);
4722  Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part);
4723  Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
4724 
4725  if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
4726  VecOp->copyIRFlags(BinOp);
4727 
4728  // Use this vector value for all users of the original instruction.
4729  VectorLoopValueMap.setVectorValue(&I, Part, V);
4730  addMetadata(V, BinOp);
4731  }
4732 
4733  break;
4734  }
4735  case Instruction::Select: {
4736  // Widen selects.
4737  // If the selector is loop invariant we can create a select
4738  // instruction with a scalar condition. Otherwise, use vector-select.
4739  auto *SE = PSE.getSE();
4740  bool InvariantCond =
4741  SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4742  setDebugLocFromInst(Builder, &I);
4743 
4744  // The condition can be loop invariant but still defined inside the
4745  // loop. This means that we can't just use the original 'cond' value.
4746  // We have to take the 'vectorized' value and pick the first lane.
4747  // Instcombine will make this a no-op.
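  // Sketch (assuming VF = 4): for 'select i1 %c, i32 %a, i32 %b' with a
  // loop-invariant %c we emit 'select i1 %c0, <4 x i32> %va, <4 x i32> %vb',
  // where %c0 is lane 0 of the widened condition; otherwise the full <4 x i1>
  // condition vector is used.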
4748 
4749  auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4750 
4751  for (unsigned Part = 0; Part < UF; ++Part) {
4752  Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4753  Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4754  Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4755  Value *Sel =
4756  Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4757  VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4758  addMetadata(Sel, &I);
4759  }
4760 
4761  break;
4762  }
4763 
4764  case Instruction::ICmp:
4765  case Instruction::FCmp: {
4766  // Widen compares. Generate vector compares.
4767  bool FCmp = (I.getOpcode() == Instruction::FCmp);
4768  auto *Cmp = dyn_cast<CmpInst>(&I);
4769  setDebugLocFromInst(Builder, Cmp);
4770  for (unsigned Part = 0; Part < UF; ++Part) {
4771  Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4772  Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4773  Value *C = nullptr;
4774  if (FCmp) {
4775  // Propagate fast math flags.
4776  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4777  Builder.setFastMathFlags(Cmp->getFastMathFlags());
4778  C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4779  } else {
4780  C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4781  }
4782  VectorLoopValueMap.setVectorValue(&I, Part, C);
4783  addMetadata(C, &I);
4784  }
4785 
4786  break;
4787  }
4788 
4789  case Instruction::ZExt:
4790  case Instruction::SExt:
4791  case Instruction::FPToUI:
4792  case Instruction::FPToSI:
4793  case Instruction::FPExt:
4794  case Instruction::PtrToInt:
4795  case Instruction::IntToPtr:
4796  case Instruction::SIToFP:
4797  case Instruction::UIToFP:
4798  case Instruction::Trunc:
4799  case Instruction::FPTrunc:
4800  case Instruction::BitCast: {
4801  auto *CI = dyn_cast<CastInst>(&I);
4802  setDebugLocFromInst(Builder, CI);
4803 
4804  /// Vectorize casts.
4805  Type *DestTy =
4806  (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4807 
4808  for (unsigned Part = 0; Part < UF; ++Part) {
4809  Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4810  Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4811  VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4812  addMetadata(Cast, &I);
4813  }
4814  break;
4815  }
4816 
4817  case Instruction::Call: {
4818  // Ignore dbg intrinsics.
4819  if (isa<DbgInfoIntrinsic>(I))
4820  break;
4821  setDebugLocFromInst(Builder, &I);
4822 
4823  Module *M = I.getParent()->getParent()->getParent();
4824  auto *CI = cast<CallInst>(&I);
4825 
4826  StringRef FnName = CI->getCalledFunction()->getName();
4827  Function *F = CI->getCalledFunction();
4828  Type *RetTy = ToVectorTy(CI->getType(), VF);
4829  SmallVector<Type *, 4> Tys;
4830  for (Value *ArgOperand : CI->arg_operands())
4831  Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4832 
4833  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4834 
4835  // The flag shows whether we use an intrinsic or a plain call for the
4836  // vectorized version of the instruction.
4837  // Is it beneficial to perform the intrinsic call compared to the lib call?
4838  bool NeedToScalarize;
4839  unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
4840  bool UseVectorIntrinsic =
4841  ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
4842  assert((UseVectorIntrinsic || !NeedToScalarize) &&
4843  "Instruction should be scalarized elsewhere.");
4844 
4845  for (unsigned Part = 0; Part < UF; ++Part) {
4846  SmallVector<Value *, 4> Args;
4847  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4848  Value *Arg = CI->getArgOperand(i);
4849  // Some intrinsics have a scalar argument - don't replace it with a
4850  // vector.
4851  if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4852  Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4853  Args.push_back(Arg);
4854  }
4855 
4856  Function *VectorF;
4857  if (UseVectorIntrinsic) {
4858  // Use vector version of the intrinsic.
4859  Type *TysForDecl[] = {CI->getType()};
4860  if (VF > 1)
4861  TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4862  VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4863  } else {
4864  // Use vector version of the library call.
4865  StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4866  assert(!VFnName.empty() && "Vector function name is empty.");
4867  VectorF = M->getFunction(VFnName);
4868  if (!VectorF) {
4869  // Generate a declaration
4870  FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4871  VectorF =
4872  Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4873  VectorF->copyAttributesFrom(F);
4874  }
4875  }
4876  assert(VectorF && "Can't create vector function.");
4877 
4878  SmallVector<OperandBundleDef, 1> OpBundles;
4879  CI->getOperandBundlesAsDefs(OpBundles);
4880  CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4881 
4882  if (isa<FPMathOperator>(V))
4883  V->copyFastMathFlags(CI);
4884 
4885  VectorLoopValueMap.setVectorValue(&I, Part, V);
4886  addMetadata(V, &I);
4887  }
4888 
4889  break;
4890  }
4891 
4892  default:
4893  // This instruction is not vectorized by simple widening.
4894  DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4895  llvm_unreachable("Unhandled instruction!");
4896  } // end of switch.
4897 }
4898 
4899 void InnerLoopVectorizer::updateAnalysis() {
4900  // Forget the original basic block.
4901  PSE.getSE()->forgetLoop(OrigLoop);
4902 
4903  // Update the dominator tree information.
4904  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4905  "Entry does not dominate exit.");
4906 
4907  DT->addNewBlock(LoopMiddleBlock,
4908  LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4909  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4910  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4911  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4912  DEBUG(DT->verifyDomTree());
4913 }
4914 
4915 /// \brief Check whether it is safe to if-convert this phi node.
4916 ///
4917 /// Phi nodes with constant expressions that can trap are not safe to if
4918 /// convert.
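/// For example, an incoming value that is the constant expression
/// 'udiv (i32 1, i32 0)' can trap, so a block feeding such a phi is not
/// if-converted.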
4919 static bool canIfConvertPHINodes(BasicBlock *BB) {
4920  for (Instruction &I : *BB) {
4921  auto *Phi = dyn_cast<PHINode>(&I);
4922  if (!Phi)
4923  return true;
4924  for (Value *V : Phi->incoming_values())
4925  if (auto *C = dyn_cast<Constant>(V))
4926  if (C->canTrap())
4927  return false;
4928  }
4929  return true;
4930 }
4931 
4932 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
4933  if (!EnableIfConversion) {
4934  ORE->emit(createMissedAnalysis("IfConversionDisabled")
4935  << "if-conversion is disabled");
4936  return false;
4937  }
4938 
4939  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
4940 
4941  // A list of pointers that we can safely read and write to.
4942  SmallPtrSet<Value *, 8> SafePointes;
4943 
4944  // Collect safe addresses.
4945  for (BasicBlock *BB : TheLoop->blocks()) {
4946  if (blockNeedsPredication(BB))
4947  continue;
4948 
4949  for (Instruction &I : *BB)
4950  if (auto *Ptr = getPointerOperand(&I))
4951  SafePointes.insert(Ptr);
4952  }
4953 
4954  // Collect the blocks that need predication.
4955  BasicBlock *Header = TheLoop->getHeader();
4956  for (BasicBlock *BB : TheLoop->blocks()) {
4957  // We don't support switch statements inside loops.
4958  if (!isa<BranchInst>(BB->getTerminator())) {
4959  ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator())
4960  << "loop contains a switch statement");
4961  return false;
4962  }
4963 
4964  // We must be able to predicate all blocks that need to be predicated.
4965  if (blockNeedsPredication(BB)) {
4966  if (!blockCanBePredicated(BB, SafePointes)) {
4967  ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
4968  << "control flow cannot be substituted for a select");
4969  return false;
4970  }
4971  } else if (BB != Header && !canIfConvertPHINodes(BB)) {
4972  ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
4973  << "control flow cannot be substituted for a select");
4974  return false;
4975  }
4976  }
4977 
4978  // We can if-convert this loop.
4979  return true;
4980 }
4981 
4982 bool LoopVectorizationLegality::canVectorize() {
4983  // Store the result and return it at the end instead of exiting early, in case
4984  // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
4985  bool Result = true;
4986 
4987  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
4988  if (DoExtraAnalysis)
4989  // We must have a loop in canonical form. Loops with indirectbr in them cannot
4990  // be canonicalized.
4991  if (!TheLoop->getLoopPreheader()) {
4992  ORE->emit(createMissedAnalysis("CFGNotUnderstood")
4993  << "loop control flow is not understood by vectorizer");
4994  if (DoExtraAnalysis)
4995  Result = false;
4996  else
4997  return false;
4998  }
4999 
5000  // FIXME: The code is currently dead, since the loop that gets sent to
5001  // LoopVectorizationLegality is already an innermost loop.
5002  //
5003  // We can only vectorize innermost loops.
5004  if (!TheLoop->empty()) {
5005  ORE->emit(createMissedAnalysis("NotInnermostLoop")
5006  << "loop is not the innermost loop");
5007  if (DoExtraAnalysis)
5008  Result = false;
5009  else
5010  return false;
5011  }
5012 
5013  // We must have a single backedge.
5014  if (TheLoop->getNumBackEdges() != 1) {
5015  ORE->emit(createMissedAnalysis("CFGNotUnderstood")
5016  << "loop control flow is not understood by vectorizer");
5017  if (DoExtraAnalysis)
5018  Result = false;
5019  else
5020  return false;
5021  }
5022 
5023  // We must have a single exiting block.
5024  if (!TheLoop->getExitingBlock()) {
5025  ORE->emit(createMissedAnalysis("CFGNotUnderstood")
5026  << "loop control flow is not understood by vectorizer");
5027  if (DoExtraAnalysis)
5028  Result = false;
5029  else
5030  return false;
5031  }
5032 
5033  // We only handle bottom-tested loops, i.e. loop in which the condition is
5034  // checked at the end of each iteration. With that we can assume that all
5035  // instructions in the loop are executed the same number of times.
5036  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5037  ORE->emit(createMissedAnalysis("CFGNotUnderstood")
5038  << "loop control flow is not understood by vectorizer");
5039  if (DoExtraAnalysis)
5040  Result = false;
5041  else
5042  return false;
5043  }
5044 
5045  // We need to have a loop header.
5046  DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
5047  << '\n');
5048 
5049  // Check if we can if-convert non-single-bb loops.
5050  unsigned NumBlocks = TheLoop->getNumBlocks();
5051  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
5052  DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
5053  if (DoExtraAnalysis)
5054  Result = false;
5055  else
5056  return false;
5057  }
5058 
5059  // Check if we can vectorize the instructions and CFG in this loop.
5060  if (!canVectorizeInstrs()) {
5061  DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
5062  if (DoExtraAnalysis)
5063  Result = false;
5064  else
5065  return false;
5066  }
5067 
5068  // Go over each instruction and look at memory deps.
5069  if (!canVectorizeMemory()) {
5070  DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
5071  if (DoExtraAnalysis)
5072  Result = false;
5073  else
5074  return false;
5075  }
5076 
5077  DEBUG(dbgs() << "LV: We can vectorize this loop"
5078  << (LAI->getRuntimePointerChecking()->Need
5079  ? " (with a runtime bound check)"
5080  : "")
5081  << "!\n");
5082 
5083  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
5084 
5085  // If an override option has been passed in for interleaved accesses, use it.
5086  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
5087  UseInterleaved = EnableInterleavedMemAccesses;
5088 
5089  // Analyze interleaved memory accesses.
5090  if (UseInterleaved)
5091  InterleaveInfo.analyzeInterleaving(*getSymbolicStrides());
5092 
5093  unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
5094  if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
5095  SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
5096 
5097  if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
5098  ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks")
5099  << "Too many SCEV assumptions need to be made and checked "
5100  << "at runtime");
5101  DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
5102  if (DoExtraAnalysis)
5103  Result = false;
5104  else
5105  return false;
5106  }
5107 
5108  // Okay! We've done all the tests. If any have failed, return false. Otherwise
5109  // we can vectorize, and at this point we don't have any other mem analysis
5110  // which may limit our maximum vectorization factor, so just return true with
5111  // no restrictions.
5112  return Result;
5113 }
5114 
5115 static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
5116  if (Ty->isPointerTy())
5117  return DL.getIntPtrType(Ty);
5118 
5119  // It is possible that chars or shorts overflow when we ask for the loop's
5120  // trip count; work around this by widening the type.
5121  if (Ty->getScalarSizeInBits() < 32)
5122  return Type::getInt32Ty(Ty->getContext());
5123 
5124  return Ty;
5125 }
5126 
5127 static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
5128  Ty0 = convertPointerToIntegerType(DL, Ty0);
5129  Ty1 = convertPointerToIntegerType(DL, Ty1);
5130  if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
5131  return Ty0;
5132  return Ty1;
5133 }
5134 
5135 /// \brief Check that the instruction has outside loop users and is not an
5136 /// identified reduction variable.
5137 static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
5138  SmallPtrSetImpl<Value *> &AllowedExit) {
5139  // Reduction and Induction instructions are allowed to have exit users. All
5140  // other instructions must not have external users.
5141  if (!AllowedExit.count(Inst))
5142  // Check that all of the users of the loop are inside the BB.
5143  for (User *U : Inst->users()) {
5144  Instruction *UI = cast<Instruction>(U);
5145  // This user may be a reduction exit value.
5146  if (!TheLoop->contains(UI)) {
5147  DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
5148  return true;
5149  }
5150  }
5151  return false;
5152 }
5153 
5154 void LoopVectorizationLegality::addInductionPhi(
5155  PHINode *Phi, const InductionDescriptor &ID,
5156  SmallPtrSetImpl<Value *> &AllowedExit) {
5157  Inductions[Phi] = ID;
5158  Type *PhiTy = Phi->getType();
5159  const DataLayout &DL = Phi->getModule()->getDataLayout();
5160 
5161  // Get the widest type.
5162  if (!PhiTy->isFloatingPointTy()) {
5163  if (!WidestIndTy)
5164  WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
5165  else
5166  WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
5167  }
5168 
5169  // Int inductions are special because we only allow one IV.
5170  if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
5171  ID.getConstIntStepValue() &&
5172  ID.getConstIntStepValue()->isOne() &&
5173  isa<Constant>(ID.getStartValue()) &&
5174  cast<Constant>(ID.getStartValue())->isNullValue()) {
5175 
5176  // Use the phi node with the widest type as induction. Use the last
5177  // one if there are multiple (no good reason for doing this other
5178  // than it is expedient). We've checked that it begins at zero and
5179  // steps by one, so this is a canonical induction variable.
5180  if (!PrimaryInduction || PhiTy == WidestIndTy)
5181  PrimaryInduction = Phi;
5182  }
5183 
5184  // Both the PHI node itself, and the "post-increment" value feeding
5185  // back into the PHI node may have external users.
5186  // We can allow those uses, except if the SCEVs we have for them rely
5187  // on predicates that only hold within the loop, since allowing the exit
5188  // currently means re-using this SCEV outside the loop.
5189  if (PSE.getUnionPredicate().