LoopVectorize.cpp
1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
11 // and generates target-independent LLVM-IR.
12 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
13 // of instructions in order to estimate the profitability of vectorization.
14 //
15 // The loop vectorizer combines consecutive loop iterations into a single
16 // 'wide' iteration. After this transformation the index is incremented
17 // by the SIMD vector width, and not by one.
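// For example (illustrative only), with a vectorization factor of 4 a scalar
// loop such as
//
//   for (i = 0; i < n; ++i)
//     C[i] = A[i] + B[i];
//
// is conceptually rewritten so that each iteration processes four elements
// with wide SIMD operations:
//
//   for (i = 0; i < n; i += 4)
//     C[i:i+3] = A[i:i+3] + B[i:i+3];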
18 //
19 // This pass has four parts:
20 // 1. The main loop pass that drives the different parts.
21 // 2. LoopVectorizationLegality - A unit that checks for the legality
22 // of the vectorization.
23 // 3. InnerLoopVectorizer - A unit that performs the actual
24 // widening of instructions.
25 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
26 // of vectorization. It decides on the optimal vector width, which
27 // can be one, if vectorization is not profitable.
28 //
29 //===----------------------------------------------------------------------===//
30 //
31 // The reduction-variable vectorization is based on the paper:
32 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
33 //
34 // Variable uniformity checks are inspired by:
35 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
36 //
37 // The interleaved access vectorization is based on the paper:
38 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
39 // Data for SIMD
40 //
41 // Other ideas/concepts are from:
42 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
43 //
44 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
45 // Vectorizing Compilers.
46 //
47 //===----------------------------------------------------------------------===//
48 
51 #include "llvm/ADT/APInt.h"
52 #include "llvm/ADT/ArrayRef.h"
53 #include "llvm/ADT/DenseMap.h"
54 #include "llvm/ADT/DenseMapInfo.h"
55 #include "llvm/ADT/Hashing.h"
56 #include "llvm/ADT/MapVector.h"
57 #include "llvm/ADT/None.h"
58 #include "llvm/ADT/Optional.h"
59 #include "llvm/ADT/SCCIterator.h"
60 #include "llvm/ADT/STLExtras.h"
61 #include "llvm/ADT/SetVector.h"
62 #include "llvm/ADT/SmallPtrSet.h"
63 #include "llvm/ADT/SmallSet.h"
64 #include "llvm/ADT/SmallVector.h"
65 #include "llvm/ADT/Statistic.h"
66 #include "llvm/ADT/StringRef.h"
67 #include "llvm/ADT/Twine.h"
77 #include "llvm/Analysis/LoopInfo.h"
86 #include "llvm/IR/Attributes.h"
87 #include "llvm/IR/BasicBlock.h"
88 #include "llvm/IR/CFG.h"
89 #include "llvm/IR/Constant.h"
90 #include "llvm/IR/Constants.h"
91 #include "llvm/IR/DataLayout.h"
93 #include "llvm/IR/DebugLoc.h"
94 #include "llvm/IR/DerivedTypes.h"
95 #include "llvm/IR/DiagnosticInfo.h"
96 #include "llvm/IR/Dominators.h"
97 #include "llvm/IR/Function.h"
98 #include "llvm/IR/IRBuilder.h"
99 #include "llvm/IR/InstrTypes.h"
100 #include "llvm/IR/Instruction.h"
101 #include "llvm/IR/Instructions.h"
102 #include "llvm/IR/IntrinsicInst.h"
103 #include "llvm/IR/Intrinsics.h"
104 #include "llvm/IR/LLVMContext.h"
105 #include "llvm/IR/Metadata.h"
106 #include "llvm/IR/Module.h"
107 #include "llvm/IR/Operator.h"
108 #include "llvm/IR/Type.h"
109 #include "llvm/IR/Use.h"
110 #include "llvm/IR/User.h"
111 #include "llvm/IR/Value.h"
112 #include "llvm/IR/ValueHandle.h"
113 #include "llvm/IR/Verifier.h"
114 #include "llvm/Pass.h"
115 #include "llvm/Support/Casting.h"
117 #include "llvm/Support/Compiler.h"
118 #include "llvm/Support/Debug.h"
120 #include "llvm/Support/MathExtras.h"
126 #include <algorithm>
127 #include <cassert>
128 #include <cstdint>
129 #include <cstdlib>
130 #include <functional>
131 #include <iterator>
132 #include <limits>
133 #include <memory>
134 #include <string>
135 #include <tuple>
136 #include <utility>
137 #include <vector>
138 
139 using namespace llvm;
140 
141 #define LV_NAME "loop-vectorize"
142 #define DEBUG_TYPE LV_NAME
143 
144 STATISTIC(LoopsVectorized, "Number of loops vectorized");
145 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
146 
147 static cl::opt<bool>
148  EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
149  cl::desc("Enable if-conversion during vectorization."));
150 
151 /// Loops with a known constant trip count below this number are vectorized only
152 /// if no scalar iteration overheads are incurred.
153 static cl::opt<unsigned> TinyTripCountVectorThreshold(
154  "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
155  cl::desc("Loops with a constant trip count that is smaller than this "
156  "value are vectorized only if no scalar iteration overheads "
157  "are incurred."));
158 
159 static cl::opt<bool> MaximizeBandwidth(
160  "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
161  cl::desc("Maximize bandwidth when selecting vectorization factor which "
162  "will be determined by the smallest type in loop."));
163 
164 static cl::opt<bool> EnableInterleavedMemAccesses(
165  "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
166  cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
167 
168 /// Maximum factor for an interleaved memory access.
169 static cl::opt<unsigned> MaxInterleaveGroupFactor(
170  "max-interleave-group-factor", cl::Hidden,
171  cl::desc("Maximum factor for an interleaved access group (default = 8)"),
172  cl::init(8));
173 
174 /// We don't interleave loops with a known constant trip count below this
175 /// number.
176 static const unsigned TinyTripCountInterleaveThreshold = 128;
177 
178 static cl::opt<unsigned> ForceTargetNumScalarRegs(
179  "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
180  cl::desc("A flag that overrides the target's number of scalar registers."));
181 
182 static cl::opt<unsigned> ForceTargetNumVectorRegs(
183  "force-target-num-vector-regs", cl::init(0), cl::Hidden,
184  cl::desc("A flag that overrides the target's number of vector registers."));
185 
186 /// Maximum vectorization interleave count.
187 static const unsigned MaxInterleaveFactor = 16;
188 
189 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
190  "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
191  cl::desc("A flag that overrides the target's max interleave factor for "
192  "scalar loops."));
193 
194 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
195  "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
196  cl::desc("A flag that overrides the target's max interleave factor for "
197  "vectorized loops."));
198 
199 static cl::opt<unsigned> ForceTargetInstructionCost(
200  "force-target-instruction-cost", cl::init(0), cl::Hidden,
201  cl::desc("A flag that overrides the target's expected cost for "
202  "an instruction to a single constant value. Mostly "
203  "useful for getting consistent testing."));
204 
205 static cl::opt<unsigned> SmallLoopCost(
206  "small-loop-cost", cl::init(20), cl::Hidden,
207  cl::desc(
208  "The cost of a loop that is considered 'small' by the interleaver."));
209 
210 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
211  "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
212  cl::desc("Enable the use of the block frequency analysis to access PGO "
213  "heuristics minimizing code growth in cold regions and being more "
214  "aggressive in hot regions."));
215 
216 // Runtime interleave loops for load/store throughput.
217 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
218  "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
219  cl::desc(
220  "Enable runtime interleaving until load/store ports are saturated"));
221 
222 /// The number of stores in a loop that are allowed to need predication.
223 static cl::opt<unsigned> NumberOfStoresToPredicate(
224  "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
225  cl::desc("Max number of stores to be predicated behind an if."));
226 
227 static cl::opt<bool> EnableIndVarRegisterHeur(
228  "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
229  cl::desc("Count the induction variable only once when interleaving"));
230 
231 static cl::opt<bool> EnableCondStoresVectorization(
232  "enable-cond-stores-vec", cl::init(true), cl::Hidden,
233  cl::desc("Enable if predication of stores during vectorization."));
234 
235 static cl::opt<unsigned> MaxNestedScalarReductionIC(
236  "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
237  cl::desc("The maximum interleave count to use when interleaving a scalar "
238  "reduction in a nested loop."));
239 
240 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
241  "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
242  cl::desc("The maximum allowed number of runtime memory checks with a "
243  "vectorize(enable) pragma."));
244 
245 static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
246  "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
247  cl::desc("The maximum number of SCEV checks allowed."));
248 
249 static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
250  "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
251  cl::desc("The maximum number of SCEV checks allowed with a "
252  "vectorize(enable) pragma"));
253 
254 /// Create an analysis remark that explains why vectorization failed
255 ///
256 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
257 /// RemarkName is the identifier for the remark. If \p I is passed it is an
258 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for
259 /// the location of the remark. \return the remark object that can be
260 /// streamed to.
261 static OptimizationRemarkAnalysis
262 createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
263  Instruction *I = nullptr) {
264  Value *CodeRegion = TheLoop->getHeader();
265  DebugLoc DL = TheLoop->getStartLoc();
266 
267  if (I) {
268  CodeRegion = I->getParent();
269  // If the instruction has a debug location attached, use it; otherwise
270  // fall back to the loop's.
271  if (I->getDebugLoc())
272  DL = I->getDebugLoc();
273  }
274 
275  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
276  R << "loop not vectorized: ";
277  return R;
278 }
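// Illustrative usage only; the remark name and message below are examples,
// not a call site from this file:
//
//   ORE->emit(createMissedAnalysis(LV_NAME, "NotBeneficial", TheLoop)
//             << "the cost model decided not to vectorize this loop");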
279 
280 namespace {
281 
282 class LoopVectorizationRequirements;
283 
284 } // end anonymous namespace
285 
286 /// Returns true if the given loop body has a cycle, excluding the loop
287 /// itself.
288 static bool hasCyclesInLoopBody(const Loop &L) {
289  if (!L.empty())
290  return true;
291 
292  for (const auto &SCC :
293        make_range(scc_iterator<Loop, LoopBodyTraits>::begin(L),
294                   scc_iterator<Loop, LoopBodyTraits>::end(L))) {
295  if (SCC.size() > 1) {
296  DEBUG(dbgs() << "LVL: Detected a cycle in the loop body:\n");
297  DEBUG(L.dump());
298  return true;
299  }
300  }
301  return false;
302 }
303 
304 /// A helper function for converting Scalar types to vector types.
305 /// If the incoming type is void, we return void. If the VF is 1, we return
306 /// the scalar type.
307 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
308  if (Scalar->isVoidTy() || VF == 1)
309  return Scalar;
310  return VectorType::get(Scalar, VF);
311 }
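// For example: ToVectorTy(i32, 4) yields <4 x i32>, while ToVectorTy(i32, 1)
// and ToVectorTy(void, 4) return the scalar (or void) type unchanged.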
312 
313 // FIXME: The following helper functions have multiple implementations
314 // in the project. They can be effectively organized in a common Load/Store
315 // utilities unit.
316 
317 /// A helper function that returns the pointer operand of a load or store
318 /// instruction.
319 static Value *getPointerOperand(Value *I) {
320  if (auto *LI = dyn_cast<LoadInst>(I))
321  return LI->getPointerOperand();
322  if (auto *SI = dyn_cast<StoreInst>(I))
323  return SI->getPointerOperand();
324  return nullptr;
325 }
326 
327 /// A helper function that returns the type of loaded or stored value.
328 static Type *getMemInstValueType(Value *I) {
329  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
330  "Expected Load or Store instruction");
331  if (auto *LI = dyn_cast<LoadInst>(I))
332  return LI->getType();
333  return cast<StoreInst>(I)->getValueOperand()->getType();
334 }
335 
336 /// A helper function that returns the alignment of load or store instruction.
337 static unsigned getMemInstAlignment(Value *I) {
338  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
339  "Expected Load or Store instruction");
340  if (auto *LI = dyn_cast<LoadInst>(I))
341  return LI->getAlignment();
342  return cast<StoreInst>(I)->getAlignment();
343 }
344 
345 /// A helper function that returns the address space of the pointer operand of
346 /// load or store instruction.
347 static unsigned getMemInstAddressSpace(Value *I) {
348  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
349  "Expected Load or Store instruction");
350  if (auto *LI = dyn_cast<LoadInst>(I))
351  return LI->getPointerAddressSpace();
352  return cast<StoreInst>(I)->getPointerAddressSpace();
353 }
354 
355 /// A helper function that returns true if the given type is irregular. The
356 /// type is irregular if its allocated size doesn't equal the store size of an
357 /// element of the corresponding vector type at the given vectorization factor.
358 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
359  // Determine if an array of VF elements of type Ty is "bitcast compatible"
360  // with a <VF x Ty> vector.
361  if (VF > 1) {
362  auto *VectorTy = VectorType::get(Ty, VF);
363  return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
364  }
365 
366  // If the vectorization factor is one, we just check if an array of type Ty
367  // requires padding between elements.
368  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
369 }
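// For example (assuming a typical 64-bit data layout): i48 has an allocation
// size of 8 bytes, but <4 x i48> has a store size of 24 bytes, so
// 4 * 8 != 24 and i48 is irregular. i32 is regular, since 4 * 4 == 16 matches
// the store size of <4 x i32>.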
370 
371 /// A helper function that returns the reciprocal of the block probability of
372 /// predicated blocks. If we return X, we are assuming the predicated block
373 /// will execute once for every X iterations of the loop header.
374 ///
375 /// TODO: We should use actual block probability here, if available. Currently,
376 /// we always assume predicated blocks have a 50% chance of executing.
377 static unsigned getReciprocalPredBlockProb() { return 2; }
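// For example, the return value of 2 means a predicated block is assumed to
// execute on every other iteration, so its cost is divided by 2 when
// estimating the expected cost of the loop body.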
378 
379 /// A helper function that adds a 'fast' flag to floating-point operations.
380 static Value *addFastMathFlag(Value *V) {
381  if (isa<FPMathOperator>(V)) {
382  FastMathFlags Flags;
383  Flags.setFast();
384  cast<Instruction>(V)->setFastMathFlags(Flags);
385  }
386  return V;
387 }
388 
389 /// A helper function that returns an integer or floating-point constant with
390 /// value C.
391 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
392  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
393  : ConstantFP::get(Ty, C);
394 }
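// For example, getSignedIntOrFpConstant(Int32Ty, -1) produces the i32
// constant -1, while for a floating-point type it produces the equivalent
// FP constant (e.g. float -1.0).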
395 
396 namespace llvm {
397 
398 /// InnerLoopVectorizer vectorizes loops which contain only one basic
399 /// block to a specified vectorization factor (VF).
400 /// This class performs the widening of scalars into vectors, or multiple
401 /// scalars. This class also implements the following features:
402 /// * It inserts an epilogue loop for handling loops that don't have iteration
403 /// counts that are known to be a multiple of the vectorization factor.
404 /// * It handles the code generation for reduction variables.
405 /// * Scalarization (implementation using scalars) of un-vectorizable
406 /// instructions.
407 /// InnerLoopVectorizer does not perform any vectorization-legality
408 /// checks, and relies on the caller to check for the different legality
409 /// aspects. The InnerLoopVectorizer relies on the
410 /// LoopVectorizationLegality class to provide information about the induction
411 /// and reduction variables that were found to a given vectorization factor.
413 public:
416  const TargetLibraryInfo *TLI,
418  OptimizationRemarkEmitter *ORE, unsigned VecWidth,
419  unsigned UnrollFactor, LoopVectorizationLegality *LVL,
421  : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
422  AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
423  Builder(PSE.getSE()->getContext()),
424  VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
425  virtual ~InnerLoopVectorizer() = default;
426 
427  /// Create a new empty loop. Unlink the old loop and connect the new one.
428  /// Return the pre-header block of the new loop.
430 
431  /// Widen a single instruction within the innermost loop.
433 
434  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
435  void fixVectorizedLoop();
436 
437  // Return true if any runtime check is added.
439 
440  /// A type for vectorized values in the new loop. Each value from the
441  /// original loop, when vectorized, is represented by UF vector values in the
442  /// new unrolled loop, where UF is the unroll factor.
444 
445  /// Vectorize a single PHINode in a block. This method handles the induction
446  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
447  /// arbitrary length vectors.
448  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
449 
450  /// A helper function to scalarize a single Instruction in the innermost loop.
451  /// Generates a sequence of scalar instances for each lane between \p MinLane
452  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
453  /// inclusive.
454  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
455  bool IfPredicateInstr);
456 
457  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
458  /// is provided, the integer induction variable will first be truncated to
459  /// the corresponding type.
460  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
461 
462  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
463  /// vector or scalar value on-demand if one is not yet available. When
464  /// vectorizing a loop, we visit the definition of an instruction before its
465  /// uses. When visiting the definition, we either vectorize or scalarize the
466  /// instruction, creating an entry for it in the corresponding map. (In some
467  /// cases, such as induction variables, we will create both vector and scalar
468  /// entries.) Then, as we encounter uses of the definition, we derive values
469  /// for each scalar or vector use unless such a value is already available.
470  /// For example, if we scalarize a definition and one of its uses is vector,
471  /// we build the required vector on-demand with an insertelement sequence
472  /// when visiting the use. Otherwise, if the use is scalar, we can use the
473  /// existing scalar definition.
474  ///
475  /// Return a value in the new loop corresponding to \p V from the original
476  /// loop at unroll index \p Part. If the value has already been vectorized,
477  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
478  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
479  /// a new vector value on-demand by inserting the scalar values into a vector
480  /// with an insertelement sequence. If the value has been neither vectorized
481  /// nor scalarized, it must be loop invariant, so we simply broadcast the
482  /// value into a vector.
483  Value *getOrCreateVectorValue(Value *V, unsigned Part);
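  /// For example (illustrative IR only): if %def was scalarized into lanes
  /// %def.0 .. %def.3 and a vector use is later encountered, the vector value
  /// is built on demand roughly as:
  ///
  ///   %v0 = insertelement <4 x i32> undef, i32 %def.0, i32 0
  ///   %v1 = insertelement <4 x i32> %v0,   i32 %def.1, i32 1
  ///   ...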
484 
485  /// Return a value in the new loop corresponding to \p V from the original
486  /// loop at unroll and vector indices \p Instance. If the value has been
487  /// vectorized but not scalarized, the necessary extractelement instruction
488  /// will be generated.
489  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
490 
491  /// Construct the vector value of a scalarized value \p V one lane at a time.
492  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
493 
494  /// Try to vectorize the interleaved access group that \p Instr belongs to.
496 
497  /// Vectorize Load and Store instructions, optionally masking the vector
498  /// operations if \p BlockInMask is non-null.
500  VectorParts *BlockInMask = nullptr);
501 
502  /// \brief Set the debug location in the builder using the debug location in
503  /// the instruction.
504  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
505 
506 protected:
508 
509  /// A small list of PHINodes.
511 
512  /// A type for scalarized values in the new loop. Each value from the
513  /// original loop, when scalarized, is represented by UF x VF scalar values
514  /// in the new unrolled loop, where UF is the unroll factor and VF is the
515  /// vectorization factor.
517 
518  /// Set up the values of the IVs correctly when exiting the vector loop.
519  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
520  Value *CountRoundDown, Value *EndValue,
521  BasicBlock *MiddleBlock);
522 
523  /// Create a new induction variable inside L.
525  Value *Step, Instruction *DL);
526 
527  /// Handle all cross-iteration phis in the header.
528  void fixCrossIterationPHIs();
529 
530  /// Fix a first-order recurrence. This is the second phase of vectorizing
531  /// this phi node.
532  void fixFirstOrderRecurrence(PHINode *Phi);
533 
534  /// Fix a reduction cross-iteration phi. This is the second phase of
535  /// vectorizing this phi node.
536  void fixReduction(PHINode *Phi);
537 
538  /// \brief The Loop exit block may have single value PHI nodes with some
539  /// incoming value. While vectorizing we only handled real values
540  /// that were defined inside the loop and we should have one value for
541  /// each predecessor of its parent basic block. See PR14725.
542  void fixLCSSAPHIs();
543 
544  /// Iteratively sink the scalarized operands of a predicated instruction into
545  /// the block that was created for it.
546  void sinkScalarOperands(Instruction *PredInst);
547 
548  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
549  /// represented as.
551 
552  /// Insert the new loop to the loop hierarchy and pass manager
553  /// and update the analysis passes.
554  void updateAnalysis();
555 
556  /// Create a broadcast instruction. This method generates a broadcast
557  /// instruction (shuffle) for loop invariant values and for the induction
558  /// value. If this is the induction variable then we extend it to N, N+1, ...
559  /// this is needed because each iteration in the loop corresponds to a SIMD
560  /// element.
561  virtual Value *getBroadcastInstrs(Value *V);
562 
563  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
564  /// to each vector element of Val. The sequence starts at StartIdx.
565  /// \p Opcode is relevant for FP induction variables.
566  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
567  Instruction::BinaryOps Opcode =
568  Instruction::BinaryOpsEnd);
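  /// For example (illustrative only), with StartIdx = 0 and Step = 1, an
  /// integer induction broadcast as <%iv, %iv, %iv, %iv> becomes
  /// <%iv + 0, %iv + 1, %iv + 2, %iv + 3>.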
569 
570  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
571  /// variable on which to base the steps, \p Step is the size of the step, and
572  /// \p EntryVal is the value from the original loop that maps to the steps.
573  /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
574  /// can be a truncate instruction).
575  void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal,
576  const InductionDescriptor &ID);
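  /// For example (illustrative only), with VF = 2 and UF = 2 this generates up
  /// to four scalar values ScalarIV + {0, 1, 2, 3} * Step, one per
  /// (part, lane) pair.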
577 
578  /// Create a vector induction phi node based on an existing scalar one. \p
579  /// EntryVal is the value from the original loop that maps to the vector phi
580  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
581  /// truncate instruction, instead of widening the original IV, we widen a
582  /// version of the IV truncated to \p EntryVal's type.
584  Value *Step, Instruction *EntryVal);
585 
586  /// Returns true if an instruction \p I should be scalarized instead of
587  /// vectorized for the chosen vectorization factor.
589 
590  /// Returns true if we should generate a scalar version of \p IV.
591  bool needsScalarInduction(Instruction *IV) const;
592 
593  /// If there is a cast involved in the induction variable \p ID, which should
594  /// be ignored in the vectorized loop body, this function records the
595  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
596  /// cast. We had already proved that the casted Phi is equal to the uncasted
597  /// Phi in the vectorized loop (under a runtime guard), and therefore
598  /// there is no need to vectorize the cast - the same value can be used in the
599  /// vector loop for both the Phi and the cast.
600  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
601  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
603  Value *VectorLoopValue,
604  unsigned Part,
605  unsigned Lane = UINT_MAX);
606 
607  /// Generate a shuffle sequence that will reverse the vector Vec.
608  virtual Value *reverseVector(Value *Vec);
609 
610  /// Returns (and creates if needed) the original loop trip count.
611  Value *getOrCreateTripCount(Loop *NewLoop);
612 
613  /// Returns (and creates if needed) the trip count of the widened loop.
615 
616  /// Returns a bitcasted value to the requested vector type.
617  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
619  const DataLayout &DL);
620 
621  /// Emit a bypass check to see if the vector trip count is zero, including if
622  /// it overflows.
624 
625  /// Emit a bypass check to see if all of the SCEV assumptions we've
626  /// had to make are correct.
627  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
628 
629  /// Emit bypass checks to check any memory assumptions we may have made.
630  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
631 
632  /// Add additional metadata to \p To that was not present on \p Orig.
633  ///
634  /// Currently this is used to add the noalias annotations based on the
635  /// inserted memchecks. Use this for instructions that are *cloned* into the
636  /// vector loop.
637  void addNewMetadata(Instruction *To, const Instruction *Orig);
638 
639  /// Add metadata from one instruction to another.
640  ///
641  /// This includes both the original MDs from \p From and additional ones (\see
642  /// addNewMetadata). Use this for *newly created* instructions in the vector
643  /// loop.
644  void addMetadata(Instruction *To, Instruction *From);
645 
646  /// \brief Similar to the previous function but it adds the metadata to a
647  /// vector of instructions.
648  void addMetadata(ArrayRef<Value *> To, Instruction *From);
649 
650  /// The original loop.
652 
653  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
654  /// dynamic knowledge to simplify SCEV expressions and converts them to a
655  /// more usable form.
657 
658  /// Loop Info.
660 
661  /// Dominator Tree.
663 
664  /// Alias Analysis.
666 
667  /// Target Library Info.
669 
670  /// Target Transform Info.
672 
673  /// Assumption Cache.
675 
676  /// Interface to emit optimization remarks.
678 
679  /// \brief LoopVersioning. It's only set up (non-null) if memchecks were
680  /// used.
681  ///
682  /// This is currently only used to add no-alias metadata based on the
683  /// memchecks. The actual versioning is performed manually.
684  std::unique_ptr<LoopVersioning> LVer;
685 
686  /// The vectorization SIMD factor to use. Each vector will have this many
687  /// vector elements.
688  unsigned VF;
689 
690  /// The vectorization unroll factor to use. Each scalar is vectorized to this
691  /// many different vector instructions.
692  unsigned UF;
693 
694  /// The builder that we use
696 
697  // --- Vectorization state ---
698 
699  /// The vector-loop preheader.
701 
702  /// The scalar-loop preheader.
704 
705  /// Middle Block between the vector and the scalar.
707 
708  /// The ExitBlock of the scalar loop.
710 
711  /// The vector loop body.
713 
714  /// The scalar loop body.
716 
717  /// A list of all bypass blocks. The first block is the entry of the loop.
719 
720  /// The new Induction variable which was added to the new block.
721  PHINode *Induction = nullptr;
722 
723  /// The induction variable of the old basic block.
724  PHINode *OldInduction = nullptr;
725 
726  /// Maps values from the original loop to their corresponding values in the
727  /// vectorized loop. A key value can map to either vector values, scalar
728  /// values or both kinds of values, depending on whether the key was
729  /// vectorized and scalarized.
731 
732  /// Store instructions that were predicated.
734 
735  /// Trip count of the original loop.
736  Value *TripCount = nullptr;
737 
738  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
739  Value *VectorTripCount = nullptr;
740 
741  /// The legality analysis.
743 
744  /// The profitability analysis.
746 
747  // Record whether runtime checks are added.
748  bool AddedSafetyChecks = false;
749 
750  // Holds the end values for each induction variable. We save the end values
751  // so we can later fix-up the external users of the induction variables.
753 };
754 
756 public:
759  const TargetLibraryInfo *TLI,
761  OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
764  : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
765  UnrollFactor, LVL, CM) {}
766 
767 private:
768  Value *getBroadcastInstrs(Value *V) override;
769  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
770  Instruction::BinaryOps Opcode =
771  Instruction::BinaryOpsEnd) override;
772  Value *reverseVector(Value *Vec) override;
773 };
774 
775 } // end namespace llvm
776 
777 /// \brief Look for a meaningful debug location on the instruction or its
778 /// operands.
779 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
780  if (!I)
781  return I;
782 
783  DebugLoc Empty;
784  if (I->getDebugLoc() != Empty)
785  return I;
786 
787  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
788  if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
789  if (OpInst->getDebugLoc() != Empty)
790  return OpInst;
791  }
792 
793  return I;
794 }
795 
797  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
798  const DILocation *DIL = Inst->getDebugLoc();
799  if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
800  !isa<DbgInfoIntrinsic>(Inst))
802  else
804  } else
806 }
807 
808 #ifndef NDEBUG
809 /// \return string containing a file name and a line # for the given loop.
810 static std::string getDebugLocString(const Loop *L) {
811  std::string Result;
812  if (L) {
813  raw_string_ostream OS(Result);
814  if (const DebugLoc LoopDbgLoc = L->getStartLoc())
815  LoopDbgLoc.print(OS);
816  else
817  // Just print the module name.
818  OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
819  OS.flush();
820  }
821  return Result;
822 }
823 #endif
824 
825 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
826  const Instruction *Orig) {
827  // If the loop was versioned with memchecks, add the corresponding no-alias
828  // metadata.
829  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
830  LVer->annotateInstWithNoAlias(To, Orig);
831 }
832 
833 void InnerLoopVectorizer::addMetadata(Instruction *To,
834  Instruction *From) {
835  propagateMetadata(To, From);
836  addNewMetadata(To, From);
837 }
838 
839 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
840  Instruction *From) {
841  for (Value *V : To) {
842  if (Instruction *I = dyn_cast<Instruction>(V))
843  addMetadata(I, From);
844  }
845 }
846 
847 namespace llvm {
848 
849 /// \brief The group of interleaved loads/stores sharing the same stride and
850 /// close to each other.
851 ///
852 /// Each member in this group has an index starting from 0, and the largest
853 /// index should be less than the interleave factor, which is equal to the absolute
854 /// value of the access's stride.
855 ///
856 /// E.g. An interleaved load group of factor 4:
857 /// for (unsigned i = 0; i < 1024; i+=4) {
858 /// a = A[i]; // Member of index 0
859 /// b = A[i+1]; // Member of index 1
860 /// d = A[i+3]; // Member of index 3
861 /// ...
862 /// }
863 ///
864 /// An interleaved store group of factor 4:
865 /// for (unsigned i = 0; i < 1024; i+=4) {
866 /// ...
867 /// A[i] = a; // Member of index 0
868 /// A[i+1] = b; // Member of index 1
869 /// A[i+2] = c; // Member of index 2
870 /// A[i+3] = d; // Member of index 3
871 /// }
872 ///
873 /// Note: the interleaved load group could have gaps (missing members), but
874 /// the interleaved store group doesn't allow gaps.
876 public:
877  InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
878  : Align(Align), InsertPos(Instr) {
879  assert(Align && "The alignment should be non-zero");
880 
881  Factor = std::abs(Stride);
882  assert(Factor > 1 && "Invalid interleave factor");
883 
884  Reverse = Stride < 0;
885  Members[0] = Instr;
886  }
887 
888  bool isReverse() const { return Reverse; }
889  unsigned getFactor() const { return Factor; }
890  unsigned getAlignment() const { return Align; }
891  unsigned getNumMembers() const { return Members.size(); }
892 
893  /// \brief Try to insert a new member \p Instr with index \p Index and
894  /// alignment \p NewAlign. The index is related to the leader and it could be
895  /// negative if it is the new leader.
896  ///
897  /// \returns false if the instruction doesn't belong to the group.
898  bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
899  assert(NewAlign && "The new member's alignment should be non-zero");
900 
901  int Key = Index + SmallestKey;
902 
903  // Skip if there is already a member with the same index.
904  if (Members.count(Key))
905  return false;
906 
907  if (Key > LargestKey) {
908  // The largest index is always less than the interleave factor.
909  if (Index >= static_cast<int>(Factor))
910  return false;
911 
912  LargestKey = Key;
913  } else if (Key < SmallestKey) {
914  // The largest index is always less than the interleave factor.
915  if (LargestKey - Key >= static_cast<int>(Factor))
916  return false;
917 
918  SmallestKey = Key;
919  }
920 
921  // It's always safe to select the minimum alignment.
922  Align = std::min(Align, NewAlign);
923  Members[Key] = Instr;
924  return true;
925  }
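  /// For example: if the group was created from the access A[i+1] (the leader,
  /// index 0) and A[i] is then inserted with Index == -1, A[i] becomes the new
  /// leader; SmallestKey drops to -1 and getIndex(A[i]) subsequently returns 0.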
926 
927  /// \brief Get the member with the given index \p Index
928  ///
929  /// \returns nullptr if the group contains no member with that index.
930  Instruction *getMember(unsigned Index) const {
931  int Key = SmallestKey + Index;
932  if (!Members.count(Key))
933  return nullptr;
934 
935  return Members.find(Key)->second;
936  }
937 
938  /// \brief Get the index for the given member. Unlike the key in the member
939  /// map, the index starts from 0.
940  unsigned getIndex(Instruction *Instr) const {
941  for (auto I : Members)
942  if (I.second == Instr)
943  return I.first - SmallestKey;
944 
945  llvm_unreachable("InterleaveGroup contains no such member");
946  }
947 
948  Instruction *getInsertPos() const { return InsertPos; }
949  void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
950 
951  /// Add metadata (e.g. alias info) from the instructions in this group to \p
952  /// NewInst.
953  ///
954  /// FIXME: this function currently does not add noalias metadata a la
955  /// addNewMetadata. To do that we need to compute the intersection of the
956  /// noalias info from all members.
957  void addMetadata(Instruction *NewInst) const {
958  SmallVector<Value *, 4> VL;
959  std::transform(Members.begin(), Members.end(), std::back_inserter(VL),
960  [](std::pair<int, Instruction *> p) { return p.second; });
961  propagateMetadata(NewInst, VL);
962  }
963 
964 private:
965  unsigned Factor; // Interleave Factor.
966  bool Reverse;
967  unsigned Align;
968  SmallDenseMap<int, Instruction *, 4> Members;
969  int SmallestKey = 0;
970  int LargestKey = 0;
971 
972  // To avoid breaking dependences, vectorized instructions of an interleave
973  // group should be inserted at either the first load or the last store in
974  // program order.
975  //
976  // E.g. %even = load i32 // Insert Position
977  // %add = add i32 %even // Use of %even
978  // %odd = load i32
979  //
980  // store i32 %even
981  // %odd = add i32 // Def of %odd
982  // store i32 %odd // Insert Position
983  Instruction *InsertPos;
984 };
985 } // end namespace llvm
986 
987 namespace {
988 
989 /// \brief Drive the analysis of interleaved memory accesses in the loop.
990 ///
991 /// Use this class to analyze interleaved accesses only when we can vectorize
992 /// a loop. Otherwise the analysis is meaningless, because vectorizing
993 /// interleaved accesses would then be unsafe.
994 ///
995 /// The analysis collects interleave groups and records the relationships
996 /// between the member and the group in a map.
997 class InterleavedAccessInfo {
998 public:
999  InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
1000                         DominatorTree *DT, LoopInfo *LI)
1001  : PSE(PSE), TheLoop(L), DT(DT), LI(LI) {}
1002 
1003  ~InterleavedAccessInfo() {
1004  SmallPtrSet<InterleaveGroup *, 4> DelSet;
1005  // Avoid releasing a pointer twice.
1006  for (auto &I : InterleaveGroupMap)
1007  DelSet.insert(I.second);
1008  for (auto *Ptr : DelSet)
1009  delete Ptr;
1010  }
1011 
1012  /// \brief Analyze the interleaved accesses and collect them in interleave
1013  /// groups. Substitute symbolic strides using \p Strides.
1014  void analyzeInterleaving(const ValueToValueMap &Strides);
1015 
1016  /// \brief Check if \p Instr belongs to any interleave group.
1017  bool isInterleaved(Instruction *Instr) const {
1018  return InterleaveGroupMap.count(Instr);
1019  }
1020 
1021  /// \brief Get the interleave group that \p Instr belongs to.
1022  ///
1023  /// \returns nullptr if \p Instr does not belong to any group.
1024  InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
1025  if (InterleaveGroupMap.count(Instr))
1026  return InterleaveGroupMap.find(Instr)->second;
1027  return nullptr;
1028  }
1029 
1030  /// \brief Returns true if an interleaved group that may access memory
1031  /// out-of-bounds requires a scalar epilogue iteration for correctness.
1032  bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
1033 
1034  /// \brief Initialize the LoopAccessInfo used for dependence checking.
1035  void setLAI(const LoopAccessInfo *Info) { LAI = Info; }
1036 
1037 private:
1038  /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
1039  /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
1040  /// The interleaved access analysis can also add new predicates (for example
1041  /// by versioning strides of pointers).
1042  PredicatedScalarEvolution &PSE;
1043 
1044  Loop *TheLoop;
1045  DominatorTree *DT;
1046  LoopInfo *LI;
1047  const LoopAccessInfo *LAI = nullptr;
1048 
1049  /// True if the loop may contain non-reversed interleaved groups with
1050  /// out-of-bounds accesses. We ensure we don't speculatively access memory
1051  /// out-of-bounds by executing at least one scalar epilogue iteration.
1052  bool RequiresScalarEpilogue = false;
1053 
1054  /// Holds the relationships between the members and the interleave group.
1055  DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
1056 
1057  /// Holds dependences among the memory accesses in the loop. It maps a source
1058  /// access to a set of dependent sink accesses.
1060 
1061  /// \brief The descriptor for a strided memory access.
1062  struct StrideDescriptor {
1063  StrideDescriptor() = default;
1064  StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
1065  unsigned Align)
1066  : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
1067 
1068  // The access's stride. It is negative for a reverse access.
1069  int64_t Stride = 0;
1070 
1071  // The scalar expression of this access.
1072  const SCEV *Scev = nullptr;
1073 
1074  // The size of the memory object.
1075  uint64_t Size = 0;
1076 
1077  // The alignment of this access.
1078  unsigned Align = 0;
1079  };
1080 
1081  /// \brief A type for holding instructions and their stride descriptors.
1082  using StrideEntry = std::pair<Instruction *, StrideDescriptor>;
1083 
1084  /// \brief Create a new interleave group with the given instruction \p Instr,
1085  /// stride \p Stride and alignment \p Align.
1086  ///
1087  /// \returns the newly created interleave group.
1088  InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,
1089  unsigned Align) {
1090  assert(!InterleaveGroupMap.count(Instr) &&
1091  "Already in an interleaved access group");
1092  InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align);
1093  return InterleaveGroupMap[Instr];
1094  }
1095 
1096  /// \brief Release the group and remove all the relationships.
1097  void releaseGroup(InterleaveGroup *Group) {
1098  for (unsigned i = 0; i < Group->getFactor(); i++)
1099  if (Instruction *Member = Group->getMember(i))
1100  InterleaveGroupMap.erase(Member);
1101 
1102  delete Group;
1103  }
1104 
1105  /// \brief Collect all the accesses with a constant stride in program order.
1106  void collectConstStrideAccesses(
1108  const ValueToValueMap &Strides);
1109 
1110  /// \brief Returns true if \p Stride is allowed in an interleaved group.
1111  static bool isStrided(int Stride) {
1112  unsigned Factor = std::abs(Stride);
1113  return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
1114  }
1115 
1116  /// \brief Returns true if \p BB is a predicated block.
1117  bool isPredicated(BasicBlock *BB) const {
1118  return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
1119  }
1120 
1121  /// \brief Returns true if LoopAccessInfo can be used for dependence queries.
1122  bool areDependencesValid() const {
1123  return LAI && LAI->getDepChecker().getDependences();
1124  }
1125 
1126  /// \brief Returns true if memory accesses \p A and \p B can be reordered, if
1127  /// necessary, when constructing interleaved groups.
1128  ///
1129  /// \p A must precede \p B in program order. We return false if reordering is
1130  /// not necessary or is prevented because \p A and \p B may be dependent.
1131  bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
1132  StrideEntry *B) const {
1133  // Code motion for interleaved accesses can potentially hoist strided loads
1134  // and sink strided stores. The code below checks the legality of the
1135  // following two conditions:
1136  //
1137  // 1. Potentially moving a strided load (B) before any store (A) that
1138  // precedes B, or
1139  //
1140  // 2. Potentially moving a strided store (A) after any load or store (B)
1141  // that A precedes.
1142  //
1143  // It's legal to reorder A and B if we know there isn't a dependence from A
1144  // to B. Note that this determination is conservative since some
1145  // dependences could potentially be reordered safely.
1146 
1147  // A is potentially the source of a dependence.
1148  auto *Src = A->first;
1149  auto SrcDes = A->second;
1150 
1151  // B is potentially the sink of a dependence.
1152  auto *Sink = B->first;
1153  auto SinkDes = B->second;
1154 
1155  // Code motion for interleaved accesses can't violate WAR dependences.
1156  // Thus, reordering is legal if the source isn't a write.
1157  if (!Src->mayWriteToMemory())
1158  return true;
1159 
1160  // At least one of the accesses must be strided.
1161  if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
1162  return true;
1163 
1164  // If dependence information is not available from LoopAccessInfo,
1165  // conservatively assume the instructions can't be reordered.
1166  if (!areDependencesValid())
1167  return false;
1168 
1169  // If we know there is a dependence from source to sink, assume the
1170  // instructions can't be reordered. Otherwise, reordering is legal.
1171  return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);
1172  }
1173 
1174  /// \brief Collect the dependences from LoopAccessInfo.
1175  ///
1176  /// We process the dependences once during the interleaved access analysis to
1177  /// enable constant-time dependence queries.
1178  void collectDependences() {
1179  if (!areDependencesValid())
1180  return;
1181  auto *Deps = LAI->getDepChecker().getDependences();
1182  for (auto Dep : *Deps)
1183  Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
1184  }
1185 };
1186 
1187 /// Utility class for getting and setting loop vectorizer hints in the form
1188 /// of loop metadata.
1189 /// This class keeps a number of loop annotations locally (as member variables)
1190 /// and can, upon request, write them back as metadata on the loop. It will
1191 /// initially scan the loop for existing metadata, and will update the local
1192 /// values based on information in the loop.
1193 /// We cannot write all values to metadata, as the mere presence of some info,
1194 /// for example 'force', means a decision has been made. So, we need to be
1195 /// careful NOT to add them unless the user has specifically asked for them.
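/// For example, hints arrive as loop metadata of roughly this shape
/// (illustrative IR only):
///
///   br i1 %cond, label %loop, label %exit, !llvm.loop !0
///   !0 = distinct !{!0, !1, !2}
///   !1 = !{!"llvm.loop.vectorize.width", i32 4}
///   !2 = !{!"llvm.loop.interleave.count", i32 2}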
1196 class LoopVectorizeHints {
1197  enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED };
1198 
1199  /// Hint - associates name and validation with the hint value.
1200  struct Hint {
1201  const char *Name;
1202  unsigned Value; // This may have to change for non-numeric values.
1203  HintKind Kind;
1204 
1205  Hint(const char *Name, unsigned Value, HintKind Kind)
1206  : Name(Name), Value(Value), Kind(Kind) {}
1207 
1208  bool validate(unsigned Val) {
1209  switch (Kind) {
1210  case HK_WIDTH:
1211  return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
1212  case HK_UNROLL:
1213  return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
1214  case HK_FORCE:
1215  return (Val <= 1);
1216  case HK_ISVECTORIZED:
1217  return (Val==0 || Val==1);
1218  }
1219  return false;
1220  }
1221  };
1222 
1223  /// Vectorization width.
1224  Hint Width;
1225 
1226  /// Vectorization interleave factor.
1227  Hint Interleave;
1228 
1229  /// Vectorization forced
1230  Hint Force;
1231 
1232  /// Already Vectorized
1233  Hint IsVectorized;
1234 
1235  /// Return the loop metadata prefix.
1236  static StringRef Prefix() { return "llvm.loop."; }
1237 
1238  /// True if there is any unsafe math in the loop.
1239  bool PotentiallyUnsafe = false;
1240 
1241 public:
1242  enum ForceKind {
1243  FK_Undefined = -1, ///< Not selected.
1244  FK_Disabled = 0, ///< Forcing disabled.
1245  FK_Enabled = 1, ///< Forcing enabled.
1246  };
1247 
1248  LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
1249                     OptimizationRemarkEmitter &ORE)
1250  : Width("vectorize.width", VectorizerParams::VectorizationFactor,
1251  HK_WIDTH),
1252  Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
1253  Force("vectorize.enable", FK_Undefined, HK_FORCE),
1254  IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) {
1255  // Populate values with existing loop metadata.
1256  getHintsFromMetadata();
1257 
1258  // force-vector-interleave overrides DisableInterleaving.
1259  if (VectorizerParams::isInterleaveForced())
1260  Interleave.Value = VectorizerParams::VectorizationInterleave;
1261 
1262  if (IsVectorized.Value != 1)
1263  // If the vectorization width and interleaving count are both 1 then
1264  // consider the loop to have been already vectorized because there's
1265  // nothing more that we can do.
1266  IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
1267  DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
1268  << "LV: Interleaving disabled by the pass manager\n");
1269  }
1270 
1271  /// Mark the loop L as already vectorized by setting the width to 1.
1272  void setAlreadyVectorized() {
1273  IsVectorized.Value = 1;
1274  Hint Hints[] = {IsVectorized};
1275  writeHintsToMetadata(Hints);
1276  }
1277 
1278  bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
1279  if (getForce() == LoopVectorizeHints::FK_Disabled) {
1280  DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
1281  emitRemarkWithHints();
1282  return false;
1283  }
1284 
1285  if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
1286  DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
1287  emitRemarkWithHints();
1288  return false;
1289  }
1290 
1291  if (getIsVectorized() == 1) {
1292  DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
1293  // FIXME: Add interleave.disable metadata. This will allow
1294  // vectorize.disable to be used without disabling the pass and errors
1295  // to differentiate between disabled vectorization and a width of 1.
1296  ORE.emit([&]() {
1297  return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
1298  "AllDisabled", L->getStartLoc(),
1299  L->getHeader())
1300  << "loop not vectorized: vectorization and interleaving are "
1301  "explicitly disabled, or the loop has already been "
1302  "vectorized";
1303  });
1304  return false;
1305  }
1306 
1307  return true;
1308  }
1309 
1310  /// Dumps all the hint information.
1311  void emitRemarkWithHints() const {
1312  using namespace ore;
1313 
1314  ORE.emit([&]() {
1315  if (Force.Value == LoopVectorizeHints::FK_Disabled)
1316  return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
1317  TheLoop->getStartLoc(),
1318  TheLoop->getHeader())
1319  << "loop not vectorized: vectorization is explicitly disabled";
1320  else {
1321  OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
1322  TheLoop->getStartLoc(),
1323  TheLoop->getHeader());
1324  R << "loop not vectorized";
1325  if (Force.Value == LoopVectorizeHints::FK_Enabled) {
1326  R << " (Force=" << NV("Force", true);
1327  if (Width.Value != 0)
1328  R << ", Vector Width=" << NV("VectorWidth", Width.Value);
1329  if (Interleave.Value != 0)
1330  R << ", Interleave Count="
1331  << NV("InterleaveCount", Interleave.Value);
1332  R << ")";
1333  }
1334  return R;
1335  }
1336  });
1337  }
1338 
1339  unsigned getWidth() const { return Width.Value; }
1340  unsigned getInterleave() const { return Interleave.Value; }
1341  unsigned getIsVectorized() const { return IsVectorized.Value; }
1342  enum ForceKind getForce() const { return (ForceKind)Force.Value; }
1343 
1344  /// \brief If hints are provided that force vectorization, use the AlwaysPrint
1345  /// pass name to force the frontend to print the diagnostic.
1346  const char *vectorizeAnalysisPassName() const {
1347  if (getWidth() == 1)
1348  return LV_NAME;
1349  if (getForce() == LoopVectorizeHints::FK_Disabled)
1350  return LV_NAME;
1351  if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
1352  return LV_NAME;
1353  return OptimizationRemarkAnalysis::AlwaysPrint;
1354  }
1355 
1356  bool allowReordering() const {
1357  // When hints that enable vectorization are provided, we allow the vectorizer
1358  // to change the order of operations given by the scalar loop. This is not
1359  // enabled by default because it can be unsafe or inefficient. For example,
1360  // reordering floating-point operations will change the way round-off
1361  // error accumulates in the loop.
1362  return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
1363  }
1364 
1365  bool isPotentiallyUnsafe() const {
1366  // Avoid FP vectorization if the target is unsure about proper support.
1367  // This may be related to the SIMD unit in the target not handling
1368  // IEEE 754 FP ops properly, or bad single-to-double promotions.
1369  // Otherwise, a sequence of vectorized loops, even without reduction,
1370  // could lead to different end results on the destination vectors.
1371  return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe;
1372  }
1373 
1374  void setPotentiallyUnsafe() { PotentiallyUnsafe = true; }
1375 
1376 private:
1377  /// Find hints specified in the loop metadata and update local values.
1378  void getHintsFromMetadata() {
1379  MDNode *LoopID = TheLoop->getLoopID();
1380  if (!LoopID)
1381  return;
1382 
1383  // First operand should refer to the loop id itself.
1384  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
1385  assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
1386 
1387  for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
1388  const MDString *S = nullptr;
1389  SmallVector<Metadata *, 4> Args;
1390 
1391  // The expected hint is either a MDString or a MDNode with the first
1392  // operand a MDString.
1393  if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
1394  if (!MD || MD->getNumOperands() == 0)
1395  continue;
1396  S = dyn_cast<MDString>(MD->getOperand(0));
1397  for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
1398  Args.push_back(MD->getOperand(i));
1399  } else {
1400  S = dyn_cast<MDString>(LoopID->getOperand(i));
1401  assert(Args.size() == 0 && "too many arguments for MDString");
1402  }
1403 
1404  if (!S)
1405  continue;
1406 
1407  // Check if the hint starts with the loop metadata prefix.
1408  StringRef Name = S->getString();
1409  if (Args.size() == 1)
1410  setHint(Name, Args[0]);
1411  }
1412  }
1413 
1414  /// Checks string hint with one operand and set value if valid.
1415  void setHint(StringRef Name, Metadata *Arg) {
1416  if (!Name.startswith(Prefix()))
1417  return;
1418  Name = Name.substr(Prefix().size(), StringRef::npos);
1419 
1420  const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
1421  if (!C)
1422  return;
1423  unsigned Val = C->getZExtValue();
1424 
1425  Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized};
1426  for (auto H : Hints) {
1427  if (Name == H->Name) {
1428  if (H->validate(Val))
1429  H->Value = Val;
1430  else
1431  DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
1432  break;
1433  }
1434  }
1435  }
1436 
1437  /// Create a new hint from name / value pair.
1438  MDNode *createHintMetadata(StringRef Name, unsigned V) const {
1439  LLVMContext &Context = TheLoop->getHeader()->getContext();
1440  Metadata *MDs[] = {MDString::get(Context, Name),
1441  ConstantAsMetadata::get(
1442  ConstantInt::get(Type::getInt32Ty(Context), V))};
1443  return MDNode::get(Context, MDs);
1444  }
1445 
1446  /// Matches metadata with hint name.
1447  bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
1448  MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
1449  if (!Name)
1450  return false;
1451 
1452  for (auto H : HintTypes)
1453  if (Name->getString().endswith(H.Name))
1454  return true;
1455  return false;
1456  }
1457 
1458  /// Sets current hints into loop metadata, keeping other values intact.
1459  void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
1460  if (HintTypes.empty())
1461  return;
1462 
1463  // Reserve the first element to LoopID (see below).
1464  SmallVector<Metadata *, 4> MDs(1);
1465  // If the loop already has metadata, then ignore the existing operands.
1466  MDNode *LoopID = TheLoop->getLoopID();
1467  if (LoopID) {
1468  for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
1469  MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
1470  // If node in update list, ignore old value.
1471  if (!matchesHintMetadataName(Node, HintTypes))
1472  MDs.push_back(Node);
1473  }
1474  }
1475 
1476  // Now, add the missing hints.
1477  for (auto H : HintTypes)
1478  MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
1479 
1480  // Replace current metadata node with new one.
1481  LLVMContext &Context = TheLoop->getHeader()->getContext();
1482  MDNode *NewLoopID = MDNode::get(Context, MDs);
1483  // Set operand 0 to refer to the loop id itself.
1484  NewLoopID->replaceOperandWith(0, NewLoopID);
1485 
1486  TheLoop->setLoopID(NewLoopID);
1487  }
1488 
1489  /// The loop these hints belong to.
1490  const Loop *TheLoop;
1491 
1492  /// Interface to emit optimization remarks.
1493  OptimizationRemarkEmitter &ORE;
1494 };
1495 
1496 } // end anonymous namespace
1497 
1498 static void emitMissedWarning(Function *F, Loop *L,
1499                               const LoopVectorizeHints &LH,
1500                               OptimizationRemarkEmitter *ORE) {
1501  LH.emitRemarkWithHints();
1502 
1503  if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
1504  if (LH.getWidth() != 1)
1505  ORE->emit(DiagnosticInfoOptimizationFailure(
1506  DEBUG_TYPE, "FailedRequestedVectorization",
1507  L->getStartLoc(), L->getHeader())
1508  << "loop not vectorized: "
1509  << "failed explicitly specified loop vectorization");
1510  else if (LH.getInterleave() != 1)
1511  ORE->emit(DiagnosticInfoOptimizationFailure(
1512  DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(),
1513  L->getHeader())
1514  << "loop not interleaved: "
1515  << "failed explicitly specified loop interleaving");
1516  }
1517 }
1518 
1519 namespace llvm {
1520 
1521 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
1522 /// to what vectorization factor.
1523 /// This class does not look at the profitability of vectorization, only the
1524 /// legality. This class has two main kinds of checks:
1525 /// * Memory checks - The code in canVectorizeMemory checks if vectorization
1526 /// will change the order of memory accesses in a way that will change the
1527 /// correctness of the program.
1528 /// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
1529 /// checks for a number of different conditions, such as the availability of a
1530 /// single induction variable, that all types are supported and vectorize-able,
1531 /// etc. This code reflects the capabilities of InnerLoopVectorizer.
1532 /// This class is also used by InnerLoopVectorizer for identifying
1533 /// induction variable and the different reduction variables.
1534 class LoopVectorizationLegality {
1535 public:
1539  const TargetTransformInfo *TTI,
1540  std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
1541  OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
1542  LoopVectorizeHints *H, DemandedBits *DB, AssumptionCache *AC)
1543  : TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), GetLAA(GetLAA),
1544  ORE(ORE), InterleaveInfo(PSE, L, DT, LI), Requirements(R), Hints(H),
1545  DB(DB), AC(AC) {}
1546 
1547  /// ReductionList contains the reduction descriptors for all
1548  /// of the reductions that were found in the loop.
1550 
1551  /// InductionList saves induction variables and maps them to the
1552  /// induction descriptor.
1554 
1555  /// RecurrenceSet contains the phi nodes that are recurrences other than
1556  /// inductions and reductions.
1558 
1559  /// Returns true if it is legal to vectorize this loop.
1560  /// This does not mean that it is profitable to vectorize this
1561  /// loop, only that it is legal to do so.
1562  bool canVectorize();
1563 
1564  /// Returns the primary induction variable.
1565  PHINode *getPrimaryInduction() { return PrimaryInduction; }
1566 
1567  /// Returns the reduction variables found in the loop.
1568  ReductionList *getReductionVars() { return &Reductions; }
1569 
1570  /// Returns the induction variables found in the loop.
1571  InductionList *getInductionVars() { return &Inductions; }
1572 
1573  /// Return the first-order recurrences found in the loop.
1574  RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; }
1575 
1576  /// Return the set of instructions to sink to handle first-order recurrences.
1577   DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; }
1578 
1579  /// Returns the widest induction type.
1580  Type *getWidestInductionType() { return WidestIndTy; }
1581 
1582  /// Returns True if V is a Phi node of an induction variable in this loop.
1583  bool isInductionPhi(const Value *V);
1584 
1585  /// Returns True if V is a cast that is part of an induction def-use chain,
1586  /// and had been proven to be redundant under a runtime guard (in other
1587  /// words, the cast has the same SCEV expression as the induction phi).
1588  bool isCastedInductionVariable(const Value *V);
1589 
1590  /// Returns True if V can be considered as an induction variable in this
1591  /// loop. V can be the induction phi, or some redundant cast in the def-use
1592  /// chain of the induction phi.
1593  bool isInductionVariable(const Value *V);
1594 
1595  /// Returns True if PN is a reduction variable in this loop.
1596  bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }
1597 
1598  /// Returns True if Phi is a first-order recurrence in this loop.
1599  bool isFirstOrderRecurrence(const PHINode *Phi);
1600 
1601  /// Return true if the block BB needs to be predicated in order for the loop
1602  /// to be vectorized.
1603  bool blockNeedsPredication(BasicBlock *BB);
1604 
1605  /// Check if this pointer is consecutive when vectorizing. This happens
1606  /// when the last index of the GEP is the induction variable, or that the
1607  /// pointer itself is an induction variable.
1608  /// This check allows us to vectorize A[idx] into a wide load/store.
1609  /// Returns:
1610  /// 0 - Stride is unknown or non-consecutive.
1611  /// 1 - Address is consecutive.
1612  /// -1 - Address is consecutive, and decreasing.
1613  /// NOTE: This method must only be used before modifying the original scalar
1614  /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965).
1615  int isConsecutivePtr(Value *Ptr);
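  // For instance (illustrative): with induction variable i, the access A[i]
  // yields 1 (consecutive), A[N - i] yields -1 (consecutive and decreasing),
  // while A[2 * i] or A[B[i]] yield 0 (strided or unknown).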
1616 
1617  /// Returns true if the value V is uniform within the loop.
1618  bool isUniform(Value *V);
1619 
1620  /// Returns the information that we collected about runtime memory check.
1621   const RuntimePointerChecking *getRuntimePointerChecking() const {
1622     return LAI->getRuntimePointerChecking();
1623  }
1624 
1625  const LoopAccessInfo *getLAI() const { return LAI; }
1626 
1627  /// \brief Check if \p Instr belongs to any interleaved access group.
1628   bool isAccessInterleaved(Instruction *Instr) {
1629     return InterleaveInfo.isInterleaved(Instr);
1630  }
1631 
1632  /// \brief Get the interleaved access group that \p Instr belongs to.
1633   const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
1634     return InterleaveInfo.getInterleaveGroup(Instr);
1635  }
1636 
1637  /// \brief Returns true if an interleaved group requires a scalar iteration
1638  /// to handle accesses with gaps.
1639  bool requiresScalarEpilogue() const {
1640  return InterleaveInfo.requiresScalarEpilogue();
1641  }
1642 
1643  unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
1644 
1645  uint64_t getMaxSafeRegisterWidth() const {
1646  return LAI->getDepChecker().getMaxSafeRegisterWidth();
1647  }
1648 
1649  bool hasStride(Value *V) { return LAI->hasStride(V); }
1650 
1651  /// Returns true if the target machine supports masked store operation
1652  /// for the given \p DataType and kind of access to \p Ptr.
1653   bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
1654     return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
1655  }
1656 
1657  /// Returns true if the target machine supports masked load operation
1658  /// for the given \p DataType and kind of access to \p Ptr.
1659   bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
1660     return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
1661  }
1662 
1663  /// Returns true if the target machine supports masked scatter operation
1664  /// for the given \p DataType.
1665   bool isLegalMaskedScatter(Type *DataType) {
1666     return TTI->isLegalMaskedScatter(DataType);
1667  }
1668 
1669  /// Returns true if the target machine supports masked gather operation
1670  /// for the given \p DataType.
1671   bool isLegalMaskedGather(Type *DataType) {
1672     return TTI->isLegalMaskedGather(DataType);
1673  }
1674 
1675  /// Returns true if the target machine can represent \p V as a masked gather
1676  /// or scatter operation.
1677   bool isLegalGatherOrScatter(Value *V) {
1678     auto *LI = dyn_cast<LoadInst>(V);
1679  auto *SI = dyn_cast<StoreInst>(V);
1680  if (!LI && !SI)
1681  return false;
1682  auto *Ptr = getPointerOperand(V);
1683  auto *Ty = cast<PointerType>(Ptr->getType())->getElementType();
1684  return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1685  }
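  // Example (illustrative): for a masked load of the form A[B[i]], the element
  // type of A is what gets queried here; if the target reports masked-gather
  // support for that type, the access can be widened into a single
  // llvm.masked.gather call instead of being scalarized.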
1686 
1687  /// Returns true if vector representation of the instruction \p I
1688  /// requires mask.
1689  bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
1690 
1691  unsigned getNumStores() const { return LAI->getNumStores(); }
1692  unsigned getNumLoads() const { return LAI->getNumLoads(); }
1693  unsigned getNumPredStores() const { return NumPredStores; }
1694 
1695  /// Returns true if \p I is an instruction that will be scalarized with
1696  /// predication. Such instructions include conditional stores and
1697  /// instructions that may divide by zero.
1698  bool isScalarWithPredication(Instruction *I);
1699 
1700  /// Returns true if \p I is a memory instruction with consecutive memory
1701  /// access that can be widened.
1702  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1703 
1704  // Returns true if the NoNaN attribute is set on the function.
1705  bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
1706 
1707 private:
1708  /// Check if a single basic block loop is vectorizable.
1709  /// At this point we know that this is a loop with a constant trip count
1710  /// and we only need to check individual instructions.
1711  bool canVectorizeInstrs();
1712 
1713  /// When we vectorize loops we may change the order in which
1714  /// we read and write from memory. This method checks if it is
1715  /// legal to vectorize the code, considering only memory constraints.
1716  /// Returns true if the loop is vectorizable.
1717  bool canVectorizeMemory();
1718 
1719  /// Return true if we can vectorize this loop using the IF-conversion
1720  /// transformation.
1721  bool canVectorizeWithIfConvert();
1722 
1723  /// Return true if all of the instructions in the block can be speculatively
1724  /// executed. \p SafePtrs is a list of addresses that are known to be legal
1725  /// and we know that we can read from them without segfault.
1726  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
1727 
1728  /// Updates the vectorization state by adding \p Phi to the inductions list.
1729  /// This can set \p Phi as the main induction of the loop if \p Phi is a
1730  /// better choice for the main induction than the existing one.
1731  void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
1732  SmallPtrSetImpl<Value *> &AllowedExit);
1733 
1734  /// Create an analysis remark that explains why vectorization failed
1735  ///
1736  /// \p RemarkName is the identifier for the remark. If \p I is passed it is
1737  /// an instruction that prevents vectorization. Otherwise the loop is used
1738  /// for the location of the remark. \return the remark object that can be
1739  /// streamed to.
1740   OptimizationRemarkAnalysis
1741   createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const {
1742  return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
1743  RemarkName, TheLoop, I);
1744  }
1745 
1746  /// \brief If an access has a symbolic strides, this maps the pointer value to
1747  /// the stride symbol.
1748  const ValueToValueMap *getSymbolicStrides() {
1749  // FIXME: Currently, the set of symbolic strides is sometimes queried before
1750  // it's collected. This happens from canVectorizeWithIfConvert, when the
1751  // pointer is checked to reference consecutive elements suitable for a
1752  // masked access.
1753  return LAI ? &LAI->getSymbolicStrides() : nullptr;
1754  }
1755 
1756  unsigned NumPredStores = 0;
1757 
1758  /// The loop that we evaluate.
1759  Loop *TheLoop;
1760 
1761  /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
1762  /// Applies dynamic knowledge to simplify SCEV expressions in the context
1763  /// of existing SCEV assumptions. The analysis will also add a minimal set
1764  /// of new predicates if this is required to enable vectorization and
1765  /// unrolling.
1766   PredicatedScalarEvolution &PSE;
1767 
1768  /// Target Library Info.
1769   TargetLibraryInfo *TLI;
1770 
1771  /// Target Transform Info
1772  const TargetTransformInfo *TTI;
1773 
1774  /// Dominator Tree.
1775  DominatorTree *DT;
1776 
1777  // LoopAccess analysis.
1778  std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
1779 
1780  // And the loop-accesses info corresponding to this loop. This pointer is
1781  // null until canVectorizeMemory sets it up.
1782  const LoopAccessInfo *LAI = nullptr;
1783 
1784  /// Interface to emit optimization remarks.
1785   OptimizationRemarkEmitter *ORE;
1786 
1787  /// The interleave access information contains groups of interleaved accesses
1788  /// with the same stride and close to each other.
1789  InterleavedAccessInfo InterleaveInfo;
1790 
1791  // --- vectorization state --- //
1792 
1793  /// Holds the primary induction variable. This is the counter of the
1794  /// loop.
1795  PHINode *PrimaryInduction = nullptr;
1796 
1797  /// Holds the reduction variables.
1798  ReductionList Reductions;
1799 
1800  /// Holds all of the induction variables that we found in the loop.
1801  /// Notice that inductions don't need to start at zero and that induction
1802  /// variables can be pointers.
1803  InductionList Inductions;
1804 
1805  /// Holds all the casts that participate in the update chain of the induction
1806  /// variables, and that have been proven to be redundant (possibly under a
1807  /// runtime guard). These casts can be ignored when creating the vectorized
1808  /// loop body.
1809  SmallPtrSet<Instruction *, 4> InductionCastsToIgnore;
1810 
1811  /// Holds the phi nodes that are first-order recurrences.
1812  RecurrenceSet FirstOrderRecurrences;
1813 
1814  /// Holds instructions that need to sink past other instructions to handle
1815  /// first-order recurrences.
1816   DenseMap<Instruction *, Instruction *> SinkAfter;
1817 
1818  /// Holds the widest induction type encountered.
1819  Type *WidestIndTy = nullptr;
1820 
1821  /// Allowed outside users. This holds the induction and reduction
1822  /// vars which can be accessed from outside the loop.
1823  SmallPtrSet<Value *, 4> AllowedExit;
1824 
1825  /// Can we assume the absence of NaNs.
1826  bool HasFunNoNaNAttr = false;
1827 
1828  /// Vectorization requirements that will go through late-evaluation.
1829  LoopVectorizationRequirements *Requirements;
1830 
1831  /// Used to emit an analysis of any legality issues.
1832  LoopVectorizeHints *Hints;
1833 
1834  /// The demanded bits analysis is used to compute the minimum type size in
1835  /// which a reduction can be computed.
1836  DemandedBits *DB;
1837 
1838  /// The assumption cache analysis is used to compute the minimum type size in
1839  /// which a reduction can be computed.
1840   AssumptionCache *AC;
1841 
1842  /// While vectorizing these instructions we have to generate a
1843  /// call to the appropriate masked intrinsic
1844   SmallPtrSet<const Instruction *, 8> MaskedOp;
1845 };
1846 
1847 /// LoopVectorizationCostModel - estimates the expected speedups due to
1848 /// vectorization.
1849 /// In many cases vectorization is not profitable. This can happen because of
1850 /// a number of reasons. In this class we mainly attempt to predict the
1851 /// expected speedup/slowdowns due to the supported instruction set. We use the
1852 /// TargetTransformInfo to query the different backends for the cost of
1853 /// different operations.
1854 class LoopVectorizationCostModel {
1855 public:
1856   LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
1857                              LoopInfo *LI, LoopVectorizationLegality *Legal,
1858                              const TargetTransformInfo &TTI,
1859                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1860                              AssumptionCache *AC,
1861                              OptimizationRemarkEmitter *ORE, const Function *F,
1862                              const LoopVectorizeHints *Hints)
1863  : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
1864  AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}
1865 
1866  /// \return An upper bound for the vectorization factor, or None if
1867  /// vectorization should be avoided up front.
1868  Optional<unsigned> computeMaxVF(bool OptForSize);
1869 
1870  /// \return The most profitable vectorization factor and the cost of that VF.
1871  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1872  /// then this vectorization factor will be selected if vectorization is
1873  /// possible.
1874  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
1875 
1876  /// Setup cost-based decisions for user vectorization factor.
1877  void selectUserVectorizationFactor(unsigned UserVF) {
1878  collectUniformsAndScalars(UserVF);
1879  collectInstsToScalarize(UserVF);
1880  }
1881 
1882  /// \return The size (in bits) of the smallest and widest types in the code
1883  /// that needs to be vectorized. We ignore values that remain scalar such as
1884  /// 64 bit loop indices.
1885  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1886 
1887  /// \return The desired interleave count.
1888  /// If interleave count has been specified by metadata it will be returned.
1889  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1890  /// are the selected vectorization factor and the cost of the selected VF.
1891  unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
1892  unsigned LoopCost);
1893 
1894  /// Memory access instruction may be vectorized in more than one way.
1895  /// Form of instruction after vectorization depends on cost.
1896  /// This function takes cost-based decisions for Load/Store instructions
1897  /// and collects them in a map. This decisions map is used for building
1898  /// the lists of loop-uniform and loop-scalar instructions.
1899  /// The calculated cost is saved with widening decision in order to
1900  /// avoid redundant calculations.
1901  void setCostBasedWideningDecision(unsigned VF);
1902 
1903  /// \brief A struct that represents some properties of the register usage
1904  /// of a loop.
1905  struct RegisterUsage {
1906  /// Holds the number of loop invariant values that are used in the loop.
1907     unsigned LoopInvariantRegs;
1908 
1909  /// Holds the maximum number of concurrent live intervals in the loop.
1910  unsigned MaxLocalUsers;
1911 
1912  /// Holds the number of instructions in the loop.
1913     unsigned NumInstructions;
1914   };
1915 
1916  /// \return Returns information about the register usages of the loop for the
1917  /// given vectorization factors.
1918  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1919 
1920  /// Collect values we want to ignore in the cost model.
1921  void collectValuesToIgnore();
1922 
1923  /// \returns The smallest bitwidth each instruction can be represented with.
1924  /// The vector equivalents of these instructions should be truncated to this
1925  /// type.
1926   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1927     return MinBWs;
1928  }
1929 
1930  /// \returns True if it is more profitable to scalarize instruction \p I for
1931  /// vectorization factor \p VF.
1932  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1933  assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1934  auto Scalars = InstsToScalarize.find(VF);
1935  assert(Scalars != InstsToScalarize.end() &&
1936  "VF not yet analyzed for scalarization profitability");
1937  return Scalars->second.count(I);
1938  }
1939 
1940  /// Returns true if \p I is known to be uniform after vectorization.
1941  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1942  if (VF == 1)
1943  return true;
1944  assert(Uniforms.count(VF) && "VF not yet analyzed for uniformity");
1945  auto UniformsPerVF = Uniforms.find(VF);
1946  return UniformsPerVF->second.count(I);
1947  }
1948 
1949  /// Returns true if \p I is known to be scalar after vectorization.
1950  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1951  if (VF == 1)
1952  return true;
1953  assert(Scalars.count(VF) && "Scalar values are not calculated for VF");
1954  auto ScalarsPerVF = Scalars.find(VF);
1955  return ScalarsPerVF->second.count(I);
1956  }
1957 
1958  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1959  /// for vectorization factor \p VF.
1960  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1961  return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) &&
1962  !isScalarAfterVectorization(I, VF);
1963  }
1964 
1965  /// Decision that was taken during cost calculation for memory instruction.
1966   enum InstWidening {
1967     CM_Unknown,
1968     CM_Widen,         // For consecutive accesses with stride +1.
1969     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1970     CM_Interleave,
1971     CM_GatherScatter,
1972     CM_Scalarize
1973  };
1974 
1975  /// Save vectorization decision \p W and \p Cost taken by the cost model for
1976  /// instruction \p I and vector width \p VF.
1977   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1978                            unsigned Cost) {
1979  assert(VF >= 2 && "Expected VF >=2");
1980  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1981  }
1982 
1983  /// Save vectorization decision \p W and \p Cost taken by the cost model for
1984  /// interleaving group \p Grp and vector width \p VF.
1985  void setWideningDecision(const InterleaveGroup *Grp, unsigned VF,
1986  InstWidening W, unsigned Cost) {
1987  assert(VF >= 2 && "Expected VF >=2");
1988     /// Broadcast this decision to all instructions inside the group.
1989  /// But the cost will be assigned to one instruction only.
1990  for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1991  if (auto *I = Grp->getMember(i)) {
1992  if (Grp->getInsertPos() == I)
1993  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1994  else
1995  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1996  }
1997  }
1998  }
1999 
2000  /// Return the cost model decision for the given instruction \p I and vector
2001  /// width \p VF. Return CM_Unknown if this instruction did not pass
2002  /// through the cost modeling.
2003   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
2004     assert(VF >= 2 && "Expected VF >=2");
2005  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
2006  auto Itr = WideningDecisions.find(InstOnVF);
2007  if (Itr == WideningDecisions.end())
2008  return CM_Unknown;
2009  return Itr->second.first;
2010  }
2011 
2012  /// Return the vectorization cost for the given instruction \p I and vector
2013  /// width \p VF.
2014  unsigned getWideningCost(Instruction *I, unsigned VF) {
2015  assert(VF >= 2 && "Expected VF >=2");
2016  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
2017  assert(WideningDecisions.count(InstOnVF) && "The cost is not calculated");
2018  return WideningDecisions[InstOnVF].second;
2019  }
2020 
2021  /// Return True if instruction \p I is an optimizable truncate whose operand
2022  /// is an induction variable. Such a truncate will be removed by adding a new
2023  /// induction variable with the destination type.
2024  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
2025  // If the instruction is not a truncate, return false.
2026  auto *Trunc = dyn_cast<TruncInst>(I);
2027  if (!Trunc)
2028  return false;
2029 
2030  // Get the source and destination types of the truncate.
2031  Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
2032  Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
2033 
2034  // If the truncate is free for the given types, return false. Replacing a
2035  // free truncate with an induction variable would add an induction variable
2036  // update instruction to each iteration of the loop. We exclude from this
2037  // check the primary induction variable since it will need an update
2038  // instruction regardless.
2039  Value *Op = Trunc->getOperand(0);
2040  if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
2041  return false;
2042 
2043  // If the truncated value is not an induction variable, return false.
2044  return Legal->isInductionPhi(Op);
2045  }
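  // Example (illustrative): with a 64-bit primary induction %iv and a use
  //   %t = trunc i64 %iv to i32
  // that is not free on the target, the truncate can be removed by creating a
  // separate i32 induction variable that produces the i32 values directly.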
2046 
2047  /// Collects the instructions to scalarize for each predicated instruction in
2048  /// the loop.
2049  void collectInstsToScalarize(unsigned VF);
2050 
2051  /// Collect Uniform and Scalar values for the given \p VF.
2052  /// The sets depend on CM decision for Load/Store instructions
2053  /// that may be vectorized as interleave, gather-scatter or scalarized.
2054  void collectUniformsAndScalars(unsigned VF) {
2055  // Do the analysis once.
2056  if (VF == 1 || Uniforms.count(VF))
2057  return;
2058  setCostBasedWideningDecision(VF);
2059  collectLoopUniforms(VF);
2060  collectLoopScalars(VF);
2061  }
2062 
2063 private:
2064  /// \return An upper bound for the vectorization factor, larger than zero.
2065  /// One is returned if vectorization should best be avoided due to cost.
2066  unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
2067 
2068  /// The vectorization cost is a combination of the cost itself and a boolean
2069  /// indicating whether any of the contributing operations will actually
2070  /// operate on vector values after type legalization in the backend. If this
2071  /// latter value is false, then all operations will be scalarized (i.e. no
2072  /// vectorization has actually taken place).
2075  using VectorizationCostTy = std::pair<unsigned, bool>;
2076 
2077  /// Returns the expected execution cost. The unit of the cost does
2078  /// not matter because we use the 'cost' units to compare different
2079  /// vector widths. The cost that is returned is *not* normalized by
2080  /// the factor width.
2081  VectorizationCostTy expectedCost(unsigned VF);
2082 
2083  /// Returns the execution time cost of an instruction for a given vector
2084  /// width. Vector width of one means scalar.
2085  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
2086 
2087  /// The cost-computation logic from getInstructionCost which provides
2088  /// the vector type as an output parameter.
2089  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
2090 
2091  /// Calculate vectorization cost of memory instruction \p I.
2092  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
2093 
2094  /// The cost computation for scalarized memory instruction.
2095  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
2096 
2097  /// The cost computation for interleaving group of memory instructions.
2098  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
2099 
2100  /// The cost computation for Gather/Scatter instruction.
2101  unsigned getGatherScatterCost(Instruction *I, unsigned VF);
2102 
2103  /// The cost computation for widening instruction \p I with consecutive
2104  /// memory access.
2105  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
2106 
2107  /// The cost calculation for Load instruction \p I with uniform pointer -
2108  /// scalar load + broadcast.
2109  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
2110 
2111  /// Returns whether the instruction is a load or store and will be emitted
2112  /// as a vector operation.
2113  bool isConsecutiveLoadOrStore(Instruction *I);
2114 
2115  /// Create an analysis remark that explains why vectorization failed
2116  ///
2117  /// \p RemarkName is the identifier for the remark. \return the remark object
2118  /// that can be streamed to.
2119   OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
2120     return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
2121  RemarkName, TheLoop);
2122  }
2123 
2124  /// Map of scalar integer values to the smallest bitwidth they can be legally
2125  /// represented as. The vector equivalents of these values should be truncated
2126  /// to this type.
2127   MapVector<Instruction *, uint64_t> MinBWs;
2128 
2129  /// A type representing the costs for instructions if they were to be
2130  /// scalarized rather than vectorized. The entries are Instruction-Cost
2131  /// pairs.
2132   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
2133 
2134  /// A set containing all BasicBlocks that are known to be present after
2135  /// vectorization as a predicated block.
2136  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
2137 
2138  /// A map holding scalar costs for different vectorization factors. The
2139  /// presence of a cost for an instruction in the mapping indicates that the
2140  /// instruction will be scalarized when vectorizing with the associated
2141  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
2142  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
2143 
2144  /// Holds the instructions known to be uniform after vectorization.
2145  /// The data is collected per VF.
2146   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
2147 
2148  /// Holds the instructions known to be scalar after vectorization.
2149  /// The data is collected per VF.
2150   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
2151 
2152  /// Holds the instructions (address computations) that are forced to be
2153  /// scalarized.
2154   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
2155 
2156  /// Returns the expected difference in cost from scalarizing the expression
2157  /// feeding a predicated instruction \p PredInst. The instructions to
2158  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
2159  /// non-negative return value implies the expression will be scalarized.
2160  /// Currently, only single-use chains are considered for scalarization.
2161  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
2162  unsigned VF);
2163 
2164  /// Collect the instructions that are uniform after vectorization. An
2165  /// instruction is uniform if we represent it with a single scalar value in
2166  /// the vectorized loop corresponding to each vector iteration. Examples of
2167  /// uniform instructions include pointer operands of consecutive or
2168  /// interleaved memory accesses. Note that although uniformity implies an
2169  /// instruction will be scalar, the reverse is not true. In general, a
2170  /// scalarized instruction will be represented by VF scalar values in the
2171  /// vectorized loop, each corresponding to an iteration of the original
2172  /// scalar loop.
2173  void collectLoopUniforms(unsigned VF);
2174 
2175  /// Collect the instructions that are scalar after vectorization. An
2176  /// instruction is scalar if it is known to be uniform or will be scalarized
2177  /// during vectorization. Non-uniform scalarized instructions will be
2178  /// represented by VF values in the vectorized loop, each corresponding to an
2179  /// iteration of the original scalar loop.
2180  void collectLoopScalars(unsigned VF);
2181 
2182  /// Keeps cost model vectorization decision and cost for instructions.
2183  /// Right now it is used for memory instructions only.
2184   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
2185                                 std::pair<InstWidening, unsigned>>;
2186 
2187  DecisionList WideningDecisions;
2188 
2189 public:
2190  /// The loop that we evaluate.
2191   Loop *TheLoop;
2192 
2193  /// Predicated scalar evolution analysis.
2194   PredicatedScalarEvolution &PSE;
2195 
2196  /// Loop Info analysis.
2197   LoopInfo *LI;
2198 
2199  /// Vectorization legality.
2200   LoopVectorizationLegality *Legal;
2201 
2202  /// Vector target information.
2203   const TargetTransformInfo &TTI;
2204 
2205  /// Target Library Info.
2206   const TargetLibraryInfo *TLI;
2207 
2208  /// Demanded bits analysis.
2209   DemandedBits *DB;
2210 
2211  /// Assumption cache.
2212   AssumptionCache *AC;
2213 
2214  /// Interface to emit optimization remarks.
2215   OptimizationRemarkEmitter *ORE;
2216 
2217   const Function *TheFunction;
2218 
2219  /// Loop Vectorize Hint.
2220  const LoopVectorizeHints *Hints;
2221 
2222  /// Values to ignore in the cost model.
2223   SmallPtrSet<const Value *, 16> ValuesToIgnore;
2224 
2225  /// Values to ignore in the cost model when VF > 1.
2226   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
2227 };
2228 
2229 } // end namespace llvm
2230 
2231 namespace {
2232 
2233 /// \brief This holds vectorization requirements that must be verified late in
2234 /// the process. The requirements are set by legalize and costmodel. Once
2235 /// vectorization has been determined to be possible and profitable the
2236 /// requirements can be verified by looking for metadata or compiler options.
2237 /// For example, some loops require FP commutativity which is only allowed if
2238 /// vectorization is explicitly specified or if the fast-math compiler option
2239 /// has been provided.
2240 /// Late evaluation of these requirements allows helpful diagnostics to be
2241 /// composed that tell the user what needs to be done to vectorize the loop. For
2242 /// example, by specifying #pragma clang loop vectorize or -ffast-math. Late
2243 /// evaluation should be used only when diagnostics can be generated that can be
2244 /// followed by a non-expert user.
2245 class LoopVectorizationRequirements {
2246 public:
2247  LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) : ORE(ORE) {}
2248 
2249  void addUnsafeAlgebraInst(Instruction *I) {
2250  // First unsafe algebra instruction.
2251  if (!UnsafeAlgebraInst)
2252  UnsafeAlgebraInst = I;
2253  }
2254 
2255  void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
2256 
2257  bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
2258  const char *PassName = Hints.vectorizeAnalysisPassName();
2259  bool Failed = false;
2260  if (UnsafeAlgebraInst && !Hints.allowReordering()) {
2261  ORE.emit([&]() {
2262         return OptimizationRemarkAnalysisFPCommute(
2263                    PassName, "CantReorderFPOps",
2264  UnsafeAlgebraInst->getDebugLoc(),
2265  UnsafeAlgebraInst->getParent())
2266  << "loop not vectorized: cannot prove it is safe to reorder "
2267  "floating-point operations";
2268  });
2269  Failed = true;
2270  }
2271 
2272  // Test if runtime memcheck thresholds are exceeded.
2273  bool PragmaThresholdReached =
2274  NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
2275  bool ThresholdReached =
2276  NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
2277  if ((ThresholdReached && !Hints.allowReordering()) ||
2278  PragmaThresholdReached) {
2279  ORE.emit([&]() {
2280  return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
2281  L->getStartLoc(),
2282  L->getHeader())
2283  << "loop not vectorized: cannot prove it is safe to reorder "
2284  "memory operations";
2285  });
2286  DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
2287  Failed = true;
2288  }
2289 
2290  return Failed;
2291  }
2292 
2293 private:
2294  unsigned NumRuntimePointerChecks = 0;
2295  Instruction *UnsafeAlgebraInst = nullptr;
2296 
2297  /// Interface to emit optimization remarks.
2298   OptimizationRemarkEmitter &ORE;
2299 };
2300 
2301 } // end anonymous namespace
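// Example (illustrative): a floating-point reduction such as
//   for (i = 0; i < n; ++i) sum += a[i] * b[i];
// requires reassociating FP additions. If neither fast-math flags nor an
// explicit vectorize pragma allow reordering, doesNotMeet() above reports
// "cannot prove it is safe to reorder floating-point operations" so the user
// knows which option or pragma would unlock vectorization.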
2302 
2303 static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
2304   if (L.empty()) {
2305  if (!hasCyclesInLoopBody(L))
2306  V.push_back(&L);
2307  return;
2308  }
2309  for (Loop *InnerL : L)
2310  addAcyclicInnerLoop(*InnerL, V);
2311 }
2312 
2313 namespace {
2314 
2315 /// The LoopVectorize Pass.
2316 struct LoopVectorize : public FunctionPass {
2317  /// Pass identification, replacement for typeid
2318  static char ID;
2319 
2320  LoopVectorizePass Impl;
2321 
2322  explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
2323  : FunctionPass(ID) {
2324  Impl.DisableUnrolling = NoUnrolling;
2325  Impl.AlwaysVectorize = AlwaysVectorize;
2326     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2327   }
2328 
2329  bool runOnFunction(Function &F) override {
2330  if (skipFunction(F))
2331  return false;
2332 
2333  auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2334  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2335  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2336  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2337  auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2338  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2339  auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
2340  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2341  auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2342  auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2343  auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2344  auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2345 
2346  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2347  [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2348 
2349  return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2350  GetLAA, *ORE);
2351  }
2352 
2353  void getAnalysisUsage(AnalysisUsage &AU) const override {
2354     AU.addRequired<AssumptionCacheTracker>();
2355     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2356     AU.addRequired<DominatorTreeWrapperPass>();
2357     AU.addRequired<LoopInfoWrapperPass>();
2358     AU.addRequired<ScalarEvolutionWrapperPass>();
2359     AU.addRequired<TargetTransformInfoWrapperPass>();
2360     AU.addRequired<AAResultsWrapperPass>();
2361     AU.addRequired<LoopAccessLegacyAnalysis>();
2362     AU.addRequired<DemandedBitsWrapperPass>();
2363     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2364     AU.addPreserved<LoopInfoWrapperPass>();
2365     AU.addPreserved<DominatorTreeWrapperPass>();
2366     AU.addPreserved<BasicAAWrapperPass>();
2367     AU.addPreserved<GlobalsAAWrapperPass>();
2368   }
2369 };
2370 
2371 } // end anonymous namespace
2372 
2373 //===----------------------------------------------------------------------===//
2374 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2375 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2376 //===----------------------------------------------------------------------===//
2377 
2378 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2379   // We need to place the broadcast of invariant variables outside the loop.
2380  Instruction *Instr = dyn_cast<Instruction>(V);
2381  bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
2382  bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
2383 
2384  // Place the code for broadcasting invariant variables in the new preheader.
2385   IRBuilder<>::InsertPointGuard Guard(Builder);
2386   if (Invariant)
2387     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2388 
2389  // Broadcast the scalar into all locations in the vector.
2390  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2391 
2392  return Shuf;
2393 }
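// Example (illustrative): for VF = 4 and an i32 scalar %v, CreateVectorSplat
// above expands to IR of roughly this form:
//   %ins       = insertelement <4 x i32> undef, i32 %v, i32 0
//   %broadcast = shufflevector <4 x i32> %ins, <4 x i32> undef,
//                              <4 x i32> zeroinitializer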
2394 
2395 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2396     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
2397  Value *Start = II.getStartValue();
2398 
2399  // Construct the initial value of the vector IV in the vector loop preheader
2400  auto CurrIP = Builder.saveIP();
2401   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2402   if (isa<TruncInst>(EntryVal)) {
2403  assert(Start->getType()->isIntegerTy() &&
2404  "Truncation requires an integer type");
2405  auto *TruncType = cast<IntegerType>(EntryVal->getType());
2406  Step = Builder.CreateTrunc(Step, TruncType);
2407  Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2408  }
2409  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2410  Value *SteppedStart =
2411  getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2412 
2413  // We create vector phi nodes for both integer and floating-point induction
2414  // variables. Here, we determine the kind of arithmetic we will perform.
2415  Instruction::BinaryOps AddOp;
2416  Instruction::BinaryOps MulOp;
2417  if (Step->getType()->isIntegerTy()) {
2418  AddOp = Instruction::Add;
2419  MulOp = Instruction::Mul;
2420  } else {
2421  AddOp = II.getInductionOpcode();
2422  MulOp = Instruction::FMul;
2423  }
2424 
2425  // Multiply the vectorization factor by the step using integer or
2426  // floating-point arithmetic as appropriate.
2427  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
2428  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
2429 
2430  // Create a vector splat to use in the induction update.
2431  //
2432  // FIXME: If the step is non-constant, we create the vector splat with
2433  // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2434  // handle a constant vector splat.
2435  Value *SplatVF = isa<Constant>(Mul)
2436  ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2437  : Builder.CreateVectorSplat(VF, Mul);
2438  Builder.restoreIP(CurrIP);
2439 
2440  // We may need to add the step a number of times, depending on the unroll
2441  // factor. The last of those goes into the PHI.
2442  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2443                                     &*LoopVectorBody->getFirstInsertionPt());
2444   Instruction *LastInduction = VecInd;
2445  for (unsigned Part = 0; Part < UF; ++Part) {
2446  VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
2447 
2448  if (isa<TruncInst>(EntryVal))
2449  addMetadata(LastInduction, EntryVal);
2450  else
2451  recordVectorLoopValueForInductionCast(II, LastInduction, Part);
2452 
2453  LastInduction = cast<Instruction>(addFastMathFlag(
2454  Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
2455  }
2456 
2457  // Move the last step to the end of the latch block. This ensures consistent
2458  // placement of all induction updates.
2459  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2460  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2461  auto *ICmp = cast<Instruction>(Br->getCondition());
2462  LastInduction->moveBefore(ICmp);
2463  LastInduction->setName("vec.ind.next");
2464 
2465  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2466  VecInd->addIncoming(LastInduction, LoopVectorLatch);
2467 }
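// Example (illustrative): for an i32 induction starting at 0 with step 1,
// VF = 4 and UF = 2, the code above produces roughly
//   %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %preheader ],
//                                 [ %vec.ind.next, %latch ]
//   %step.add     = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
//   %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>
// so each unrolled part advances by VF lanes and the phi is updated once per
// vector iteration, with the final add moved to the latch.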
2468 
2469 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2470   return Cost->isScalarAfterVectorization(I, VF) ||
2471          Cost->isProfitableToScalarize(I, VF);
2472 }
2473 
2474 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2475   if (shouldScalarizeInstruction(IV))
2476     return true;
2477   auto isScalarInst = [&](User *U) -> bool {
2478     auto *I = cast<Instruction>(U);
2479     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2480   };
2481  return llvm::any_of(IV->users(), isScalarInst);
2482 }
2483 
2484 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2485     const InductionDescriptor &ID, Value *VectorLoopVal, unsigned Part,
2486  unsigned Lane) {
2487  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2488  if (Casts.empty())
2489  return;
2490  // Only the first Cast instruction in the Casts vector is of interest.
2491  // The rest of the Casts (if exist) have no uses outside the
2492  // induction update chain itself.
2493  Instruction *CastInst = *Casts.begin();
2494  if (Lane < UINT_MAX)
2495  VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
2496  else
2497  VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
2498 }
2499 
2500 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
2501   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2502  "Primary induction variable must have an integer type");
2503 
2504  auto II = Legal->getInductionVars()->find(IV);
2505  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
2506 
2507  auto ID = II->second;
2508  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2509 
2510  // The scalar value to broadcast. This will be derived from the canonical
2511  // induction variable.
2512  Value *ScalarIV = nullptr;
2513 
2514  // The value from the original loop to which we are mapping the new induction
2515  // variable.
2516  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2517 
2518  // True if we have vectorized the induction variable.
2519  auto VectorizedIV = false;
2520 
2521  // Determine if we want a scalar version of the induction variable. This is
2522  // true if the induction variable itself is not widened, or if it has at
2523  // least one user in the loop that is not widened.
2524  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
2525 
2526  // Generate code for the induction step. Note that induction steps are
2527  // required to be loop-invariant
2528  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
2529  "Induction step should be loop invariant");
2530  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2531  Value *Step = nullptr;
2532  if (PSE.getSE()->isSCEVable(IV->getType())) {
2533  SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2534  Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
2535                              &*Builder.GetInsertPoint());
2536   } else {
2537  Step = cast<SCEVUnknown>(ID.getStep())->getValue();
2538  }
2539 
2540  // Try to create a new independent vector induction variable. If we can't
2541  // create the phi node, we will splat the scalar induction variable in each
2542  // loop iteration.
2543  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
2544  createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2545  VectorizedIV = true;
2546  }
2547 
2548  // If we haven't yet vectorized the induction variable, or if we will create
2549  // a scalar one, we need to define the scalar induction variable and step
2550  // values. If we were given a truncation type, truncate the canonical
2551  // induction variable and step. Otherwise, derive these values from the
2552  // induction descriptor.
2553  if (!VectorizedIV || NeedsScalarIV) {
2554  ScalarIV = Induction;
2555  if (IV != OldInduction) {
2556  ScalarIV = IV->getType()->isIntegerTy()
2557                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2558                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2559  IV->getType());
2560  ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
2561  ScalarIV->setName("offset.idx");
2562  }
2563  if (Trunc) {
2564  auto *TruncType = cast<IntegerType>(Trunc->getType());
2565  assert(Step->getType()->isIntegerTy() &&
2566  "Truncation requires an integer step");
2567  ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2568  Step = Builder.CreateTrunc(Step, TruncType);
2569  }
2570  }
2571 
2572  // If we haven't yet vectorized the induction variable, splat the scalar
2573  // induction variable, and build the necessary step vectors.
2574  // TODO: Don't do it unless the vectorized IV is really required.
2575  if (!VectorizedIV) {
2576  Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2577  for (unsigned Part = 0; Part < UF; ++Part) {
2578  Value *EntryPart =
2579  getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
2580  VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
2581  if (Trunc)
2582  addMetadata(EntryPart, Trunc);
2583  else
2584  recordVectorLoopValueForInductionCast(ID, EntryPart, Part);
2585  }
2586  }
2587 
2588  // If an induction variable is only used for counting loop iterations or
2589  // calculating addresses, it doesn't need to be widened. Create scalar steps
2590  // that can be used by instructions we will later scalarize. Note that the
2591  // addition of the scalar steps will not increase the number of instructions
2592  // in the loop in the common case prior to InstCombine. We will be trading
2593  // one vector extract for each scalar step.
2594  if (NeedsScalarIV)
2595  buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2596 }
2597 
2598 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2599                                           Instruction::BinaryOps BinOp) {
2600  // Create and check the types.
2601  assert(Val->getType()->isVectorTy() && "Must be a vector");
2602  int VLen = Val->getType()->getVectorNumElements();
2603 
2604  Type *STy = Val->getType()->getScalarType();
2605  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2606  "Induction Step must be an integer or FP");
2607  assert(Step->getType() == STy && "Step has wrong type");
2608 
2609   SmallVector<Constant *, 8> Indices;
2610 
2611  if (STy->isIntegerTy()) {
2612  // Create a vector of consecutive numbers from zero to VF.
2613  for (int i = 0; i < VLen; ++i)
2614  Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2615 
2616  // Add the consecutive indices to the vector value.
2617  Constant *Cv = ConstantVector::get(Indices);
2618  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2619  Step = Builder.CreateVectorSplat(VLen, Step);
2620  assert(Step->getType() == Val->getType() && "Invalid step vec");
2621  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2622  // which can be found from the original scalar operations.
2623  Step = Builder.CreateMul(Cv, Step);
2624  return Builder.CreateAdd(Val, Step, "induction");
2625  }
2626 
2627  // Floating point induction.
2628  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2629  "Binary Opcode should be specified for FP induction");
2630  // Create a vector of consecutive numbers from zero to VF.
2631  for (int i = 0; i < VLen; ++i)
2632  Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2633 
2634  // Add the consecutive indices to the vector value.
2635  Constant *Cv = ConstantVector::get(Indices);
2636 
2637  Step = Builder.CreateVectorSplat(VLen, Step);
2638 
2639  // Floating point operations had to be 'fast' to enable the induction.
2640  FastMathFlags Flags;
2641  Flags.setFast();
2642 
2643  Value *MulOp = Builder.CreateFMul(Cv, Step);
2644  if (isa<Instruction>(MulOp))
2645  // Have to check, MulOp may be a constant
2646  cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2647 
2648  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2649  if (isa<Instruction>(BOp))
2650  cast<Instruction>(BOp)->setFastMathFlags(Flags);
2651  return BOp;
2652 }
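// Example (illustrative): for a broadcast value <%x, %x, %x, %x>, StartIdx = 0,
// an integer Step of 2 and VF = 4, the code above computes
//   <0, 1, 2, 3> * <2, 2, 2, 2> = <0, 2, 4, 6>
// and returns <%x, %x+2, %x+4, %x+6> as the "induction" step vector.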
2653 
2654 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2655                                            Value *EntryVal,
2656  const InductionDescriptor &ID) {
2657  // We shouldn't have to build scalar steps if we aren't vectorizing.
2658  assert(VF > 1 && "VF should be greater than one");
2659 
2660  // Get the value type and ensure it and the step have the same integer type.
2661  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2662  assert(ScalarIVTy == Step->getType() &&
2663  "Val and Step should have the same type");
2664 
2665  // We build scalar steps for both integer and floating-point induction
2666  // variables. Here, we determine the kind of arithmetic we will perform.
2667  Instruction::BinaryOps AddOp;
2668  Instruction::BinaryOps MulOp;
2669  if (ScalarIVTy->isIntegerTy()) {
2670  AddOp = Instruction::Add;
2671  MulOp = Instruction::Mul;
2672  } else {
2673  AddOp = ID.getInductionOpcode();
2674  MulOp = Instruction::FMul;
2675  }
2676 
2677  // Determine the number of scalars we need to generate for each unroll
2678  // iteration. If EntryVal is uniform, we only need to generate the first
2679  // lane. Otherwise, we generate all VF values.
2680  unsigned Lanes =
2681  Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
2682  : VF;
2683  // Compute the scalar steps and save the results in VectorLoopValueMap.
2684  for (unsigned Part = 0; Part < UF; ++Part) {
2685  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2686  auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
2687  auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2688  auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2689  VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2690  recordVectorLoopValueForInductionCast(ID, Add, Part, Lane);
2691  }
2692  }
2693 }
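// Example (illustrative): for a uniform-after-vectorization EntryVal only lane
// zero is materialized, so with VF = 4 and UF = 2 the scalar steps are
//   part 0: ScalarIV + 0 * Step
//   part 1: ScalarIV + 4 * Step
// whereas a non-uniform EntryVal gets all eight values ScalarIV + {0..7} * Step.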
2694 
2695 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
2696   const ValueToValueMap &Strides = getSymbolicStrides() ? *getSymbolicStrides() :
2697  ValueToValueMap();
2698 
2699  int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false);
2700  if (Stride == 1 || Stride == -1)
2701  return Stride;
2702  return 0;
2703 }
2704 
2705 bool LoopVectorizationLegality::isUniform(Value *V) {
2706   return LAI->isUniform(V);
2707 }
2708 
2709 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2710   assert(V != Induction && "The new induction variable should not be used.");
2711  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2712  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2713 
2714  // If we have a stride that is replaced by one, do it here.
2715  if (Legal->hasStride(V))
2716  V = ConstantInt::get(V->getType(), 1);
2717 
2718  // If we have a vector mapped to this value, return it.
2719  if (VectorLoopValueMap.hasVectorValue(V, Part))
2720  return VectorLoopValueMap.getVectorValue(V, Part);
2721 
2722  // If the value has not been vectorized, check if it has been scalarized
2723  // instead. If it has been scalarized, and we actually need the value in
2724  // vector form, we will construct the vector values on demand.
2725   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2726     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2727 
2728  // If we've scalarized a value, that value should be an instruction.
2729  auto *I = cast<Instruction>(V);
2730 
2731  // If we aren't vectorizing, we can just copy the scalar map values over to
2732  // the vector map.
2733  if (VF == 1) {
2734  VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2735  return ScalarValue;
2736  }
2737 
2738  // Get the last scalar instruction we generated for V and Part. If the value
2739  // is known to be uniform after vectorization, this corresponds to lane zero
2740  // of the Part unroll iteration. Otherwise, the last instruction is the one
2741  // we created for the last vector lane of the Part unroll iteration.
2742  unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2743  auto *LastInst = cast<Instruction>(
2744  VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2745 
2746  // Set the insert point after the last scalarized instruction. This ensures
2747  // the insertelement sequence will directly follow the scalar definitions.
2748  auto OldIP = Builder.saveIP();
2749  auto NewIP = std::next(BasicBlock::iterator(LastInst));
2750  Builder.SetInsertPoint(&*NewIP);
2751 
2752  // However, if we are vectorizing, we need to construct the vector values.
2753  // If the value is known to be uniform after vectorization, we can just
2754  // broadcast the scalar value corresponding to lane zero for each unroll
2755  // iteration. Otherwise, we construct the vector values using insertelement
2756  // instructions. Since the resulting vectors are stored in
2757  // VectorLoopValueMap, we will only generate the insertelements once.
2758  Value *VectorValue = nullptr;
2759     if (Cost->isUniformAfterVectorization(I, VF)) {
2760       VectorValue = getBroadcastInstrs(ScalarValue);
2761  VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2762  } else {
2763  // Initialize packing with insertelements to start from undef.
2764       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2765       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2766  for (unsigned Lane = 0; Lane < VF; ++Lane)
2767  packScalarIntoVectorValue(V, {Part, Lane});
2768  VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2769  }
2770  Builder.restoreIP(OldIP);
2771  return VectorValue;
2772  }
2773 
2774  // If this scalar is unknown, assume that it is a constant or that it is
2775  // loop invariant. Broadcast V and save the value for future uses.
2776  Value *B = getBroadcastInstrs(V);
2777  VectorLoopValueMap.setVectorValue(V, Part, B);
2778  return B;
2779 }
2780 
2781 Value *
2782 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2783                                             const VPIteration &Instance) {
2784  // If the value is not an instruction contained in the loop, it should
2785  // already be scalar.
2786  if (OrigLoop->isLoopInvariant(V))
2787  return V;
2788 
2789  assert(Instance.Lane > 0
2790  ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2791  : true && "Uniform values only have lane zero");
2792 
2793  // If the value from the original loop has not been vectorized, it is
2794  // represented by UF x VF scalar values in the new loop. Return the requested
2795  // scalar value.
2796  if (VectorLoopValueMap.hasScalarValue(V, Instance))
2797  return VectorLoopValueMap.getScalarValue(V, Instance);
2798 
2799  // If the value has not been scalarized, get its entry in VectorLoopValueMap
2800  // for the given unroll part. If this entry is not a vector type (i.e., the
2801  // vectorization factor is one), there is no need to generate an
2802  // extractelement instruction.
2803  auto *U = getOrCreateVectorValue(V, Instance.Part);
2804  if (!U->getType()->isVectorTy()) {
2805  assert(VF == 1 && "Value not scalarized has non-vector type");
2806  return U;
2807  }
2808 
2809  // Otherwise, the value from the original loop has been vectorized and is
2810  // represented by UF vector values. Extract and return the requested scalar
2811  // value from the appropriate vector lane.
2812  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2813 }
2814 
2815 void InnerLoopVectorizer::packScalarIntoVectorValue(
2816     Value *V, const VPIteration &Instance) {
2817  assert(V != Induction && "The new induction variable should not be used.");
2818  assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2819  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2820 
2821  Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2822  Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2823  VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2824  Builder.getInt32(Instance.Lane));
2825  VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2826 }
2827 
2828 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2829   assert(Vec->getType()->isVectorTy() && "Invalid type");
2830  SmallVector<Constant *, 8> ShuffleMask;
2831  for (unsigned i = 0; i < VF; ++i)
2832  ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2833 
2834  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2835  ConstantVector::get(ShuffleMask),
2836  "reverse");
2837 }
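// Example (illustrative): for VF = 4 the shuffle mask built above is
// <3, 2, 1, 0>, so reverseVector(<a, b, c, d>) yields <d, c, b, a>. This is
// used for interleave groups and consecutive accesses with negative stride.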
2838 
2839 // Try to vectorize the interleave group that \p Instr belongs to.
2840 //
2841 // E.g. Translate following interleaved load group (factor = 3):
2842 // for (i = 0; i < N; i+=3) {
2843 // R = Pic[i]; // Member of index 0
2844 // G = Pic[i+1]; // Member of index 1
2845 // B = Pic[i+2]; // Member of index 2
2846 // ... // do something to R, G, B
2847 // }
2848 // To:
2849 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2850 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
2851 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
2852 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
2853 //
2854 // Or translate following interleaved store group (factor = 3):
2855 // for (i = 0; i < N; i+=3) {
2856 // ... do something to R, G, B
2857 // Pic[i] = R; // Member of index 0
2858 // Pic[i+1] = G; // Member of index 1
2859 // Pic[i+2] = B; // Member of index 2
2860 // }
2861 // To:
2862 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2863 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2864 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2865 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2866 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2867 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
2868   const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
2869  assert(Group && "Fail to get an interleaved access group.");
2870 
2871  // Skip if current instruction is not the insert position.
2872  if (Instr != Group->getInsertPos())
2873  return;
2874 
2875  const DataLayout &DL = Instr->getModule()->getDataLayout();
2876  Value *Ptr = getPointerOperand(Instr);
2877 
2878  // Prepare for the vector type of the interleaved load/store.
2879  Type *ScalarTy = getMemInstValueType(Instr);
2880  unsigned InterleaveFactor = Group->getFactor();
2881  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2882  Type *PtrTy = VecTy->getPointerTo(getMemInstAddressSpace(Instr));
2883 
2884  // Prepare for the new pointers.
2885   setDebugLocFromInst(Builder, Ptr);
2886   SmallVector<Value *, 2> NewPtrs;
2887  unsigned Index = Group->getIndex(Instr);
2888 
2889  // If the group is reverse, adjust the index to refer to the last vector lane
2890  // instead of the first. We adjust the index from the first vector lane,
2891  // rather than directly getting the pointer for lane VF - 1, because the
2892  // pointer operand of the interleaved access is supposed to be uniform. For
2893  // uniform instructions, we're only required to generate a value for the
2894  // first vector lane in each unroll iteration.
2895  if (Group->isReverse())
2896  Index += (VF - 1) * Group->getFactor();
2897 
2898  for (unsigned Part = 0; Part < UF; Part++) {
2899  Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2900 
2901     // Notice the current instruction could be at any index. Need to adjust the address
2902  // to the member of index 0.
2903  //
2904  // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2905  // b = A[i]; // Member of index 0
2906  // Current pointer is pointed to A[i+1], adjust it to A[i].
2907  //
2908  // E.g. A[i+1] = a; // Member of index 1
2909  // A[i] = b; // Member of index 0
2910  // A[i+2] = c; // Member of index 2 (Current instruction)
2911  // Current pointer is pointed to A[i+2], adjust it to A[i].
2912  NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
2913 
2914  // Cast to the vector pointer type.
2915  NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2916  }
2917 
2918  setDebugLocFromInst(Builder, Instr);
2919  Value *UndefVec = UndefValue::get(VecTy);
2920 
2921  // Vectorize the interleaved load group.
2922  if (isa<LoadInst>(Instr)) {
2923  // For each unroll part, create a wide load for the group.
2924  SmallVector<Value *, 2> NewLoads;
2925  for (unsigned Part = 0; Part < UF; Part++) {
2926  auto *NewLoad = Builder.CreateAlignedLoad(
2927  NewPtrs[Part], Group->getAlignment(), "wide.vec");
2928  Group->addMetadata(NewLoad);
2929  NewLoads.push_back(NewLoad);
2930  }
2931 
2932  // For each member in the group, shuffle out the appropriate data from the
2933  // wide loads.
2934  for (unsigned I = 0; I < InterleaveFactor; ++I) {
2935  Instruction *Member = Group->getMember(I);
2936 
2937  // Skip the gaps in the group.
2938  if (!Member)
2939  continue;
2940 
2941  Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2942  for (unsigned Part = 0; Part < UF; Part++) {
2943  Value *StridedVec = Builder.CreateShuffleVector(
2944  NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2945 
2946  // If this member has different type, cast the result type.
2947  if (Member->getType() != ScalarTy) {
2948  VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2949  StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2950  }
2951 
2952  if (Group->isReverse())
2953  StridedVec = reverseVector(StridedVec);
2954 
2955  VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2956  }
2957  }
2958  return;
2959  }
2960 
2961  // The sub vector type for current instruction.
2962  VectorType *SubVT = VectorType::get(ScalarTy, VF);
2963 
2964  // Vectorize the interleaved store group.
2965  for (unsigned Part = 0; Part < UF; Part++) {
2966  // Collect the stored vector from each member.
2967  SmallVector<Value *, 4> StoredVecs;
2968  for (unsigned i = 0; i < InterleaveFactor; i++) {
2969  // An interleaved store group doesn't allow gaps, so each index has a member.
2970  Instruction *Member = Group->getMember(i);
2971  assert(Member && "Fail to get a member from an interleaved store group");
2972 
2973  Value *StoredVec = getOrCreateVectorValue(
2974  cast<StoreInst>(Member)->getValueOperand(), Part);
2975  if (Group->isReverse())
2976  StoredVec = reverseVector(StoredVec);
2977 
2978  // If this member has a different type, cast it to the unified sub-vector type.
2979 
2980  if (StoredVec->getType() != SubVT)
2981  StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2982 
2983  StoredVecs.push_back(StoredVec);
2984  }
2985 
2986  // Concatenate all vectors into a wide vector.
2987  Value *WideVec = concatenateVectors(Builder, StoredVecs);
2988 
2989  // Interleave the elements in the wide vector.
2990  Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2991  Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2992  "interleaved.vec");
2993 
2994  Instruction *NewStoreInstr =
2995  Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
2996 
2997  Group->addMetadata(NewStoreInstr);
2998  }
2999 }
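// Editorial sketch (not part of the original source, element type assumed to
// be i32 and UF = 1): for a store group with factor 2 and VF = 4, e.g.
//   A[i] = a; A[i + 1] = b;
// the store loop above roughly produces
//   %wide.vec        = shufflevector %a.vec, %b.vec, <0, 1, 2, 3, 4, 5, 6, 7>
//   %interleaved.vec = shufflevector %wide.vec, undef, <0, 4, 1, 5, 2, 6, 3, 7>
//   store <8 x i32> %interleaved.vec, <8 x i32>* %ptr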
3000 
3002  VectorParts *BlockInMask) {
3003  // Attempt to issue a wide load.
3004  LoadInst *LI = dyn_cast<LoadInst>(Instr);
3005  StoreInst *SI = dyn_cast<StoreInst>(Instr);
3006 
3007  assert((LI || SI) && "Invalid Load/Store instruction");
3008 
3009  LoopVectorizationCostModel::InstWidening Decision =
3010  Cost->getWideningDecision(Instr, VF);
3011  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
3012  "CM decision should be taken at this point");
3013  if (Decision == LoopVectorizationCostModel::CM_Interleave)
3014  return vectorizeInterleaveGroup(Instr);
3015 
3016  Type *ScalarDataTy = getMemInstValueType(Instr);
3017  Type *DataTy = VectorType::get(ScalarDataTy, VF);
3018  Value *Ptr = getPointerOperand(Instr);
3019  unsigned Alignment = getMemInstAlignment(Instr);
3020  // An alignment of 0 means target ABI alignment. We need to use the scalar's
3021  // target ABI alignment in such a case.
3022  const DataLayout &DL = Instr->getModule()->getDataLayout();
3023  if (!Alignment)
3024  Alignment = DL.getABITypeAlignment(ScalarDataTy);
3025  unsigned AddressSpace = getMemInstAddressSpace(Instr);
3026 
3027  // Determine if the pointer operand of the access is either consecutive or
3028  // reverse consecutive.
3029  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
3030  bool ConsecutiveStride =
3031  Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
3032  bool CreateGatherScatter =
3033  (Decision == LoopVectorizationCostModel::CM_GatherScatter);
3034 
3035  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
3036  // gather/scatter. Otherwise Decision should have been to Scalarize.
3037  assert((ConsecutiveStride || CreateGatherScatter) &&
3038  "The instruction should be scalarized");
3039 
3040  // Handle consecutive loads/stores.
3041  if (ConsecutiveStride)
3042  Ptr = getOrCreateScalarValue(Ptr, {0, 0});
3043 
3044  VectorParts Mask;
3045  bool isMaskRequired = BlockInMask;
3046  if (isMaskRequired)
3047  Mask = *BlockInMask;
3048 
3049  // Handle Stores:
3050  if (SI) {
3051  assert(!Legal->isUniform(SI->getPointerOperand()) &&
3052  "We do not allow storing to uniform addresses");
3054 
3055  for (unsigned Part = 0; Part < UF; ++Part) {
3056  Instruction *NewSI = nullptr;
3057  Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
3058  if (CreateGatherScatter) {
3059  Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
3060  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
3061  NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
3062  MaskPart);
3063  } else {
3064  // Calculate the pointer for the specific unroll-part.
3065  Value *PartPtr =
3066  Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
3067 
3068  if (Reverse) {
3069  // If we store to reverse consecutive memory locations, then we need
3070  // to reverse the order of elements in the stored value.
3071  StoredVal = reverseVector(StoredVal);
3072  // We don't want to update the value in the map as it might be used in
3073  // another expression. So don't call resetVectorValue(StoredVal).
3074 
3075  // If the address is consecutive but reversed, then the
3076  // wide store needs to start at the last vector element.
3077  PartPtr =
3078  Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
3079  PartPtr =
3080  Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
3081  if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
3082  Mask[Part] = reverseVector(Mask[Part]);
3083  }
3084 
3085  Value *VecPtr =
3086  Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
3087 
3088  if (isMaskRequired)
3089  NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
3090  Mask[Part]);
3091  else
3092  NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
3093  }
3094  addMetadata(NewSI, SI);
3095  }
3096  return;
3097  }
3098 
3099  // Handle loads.
3100  assert(LI && "Must have a load instruction");
3102  for (unsigned Part = 0; Part < UF; ++Part) {
3103  Value *NewLI;
3104  if (CreateGatherScatter) {
3105  Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
3106  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
3107  NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
3108  nullptr, "wide.masked.gather");
3109  addMetadata(NewLI, LI);
3110  } else {
3111  // Calculate the pointer for the specific unroll-part.
3112  Value *PartPtr =
3113  Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
3114 
3115  if (Reverse) {
3116  // If the address is consecutive but reversed, then the
3117  // wide load needs to start at the last vector element.
3118  PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
3119  PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
3120  if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
3121  Mask[Part] = reverseVector(Mask[Part]);
3122  }
3123 
3124  Value *VecPtr =
3125  Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
3126  if (isMaskRequired)
3127  NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
3128  UndefValue::get(DataTy),
3129  "wide.masked.load");
3130  else
3131  NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
3132 
3133  // Add metadata to the load, but setVectorValue to the reverse shuffle.
3134  addMetadata(NewLI, LI);
3135  if (Reverse)
3136  NewLI = reverseVector(NewLI);
3137  }
3138  VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
3139  }
3140 }
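// Editorial note (not part of the original source): with VF = 4 and UF = 2,
// the consecutive case above computes the per-part pointers
//   Part 0: Ptr + 0        Part 1: Ptr + 4
// while the reverse case computes
//   Part 0: Ptr + 0 - 3    Part 1: Ptr - 4 - 3
// so each wide access still covers four consecutive elements, and the loaded
// or stored value is reversed to restore the original element order.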
3141 
3143  const VPIteration &Instance,
3144  bool IfPredicateInstr) {
3145  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
3146 
3147  setDebugLocFromInst(Builder, Instr);
3148 
3149  // Does this instruction return a value?
3150  bool IsVoidRetTy = Instr->getType()->isVoidTy();
3151 
3152  Instruction *Cloned = Instr->clone();
3153  if (!IsVoidRetTy)
3154  Cloned->setName(Instr->getName() + ".cloned");
3155 
3156  // Replace the operands of the cloned instructions with their scalar
3157  // equivalents in the new loop.
3158  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
3159  auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
3160  Cloned->setOperand(op, NewOp);
3161  }
3162  addNewMetadata(Cloned, Instr);
3163 
3164  // Place the cloned scalar in the new loop.
3165  Builder.Insert(Cloned);
3166 
3167  // Add the cloned scalar to the scalar map entry.
3168  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
3169 
3170  // If we just cloned a new assumption, add it the assumption cache.
3171  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
3172  if (II->getIntrinsicID() == Intrinsic::assume)
3173  AC->registerAssumption(II);
3174 
3175  // End if-block.
3176  if (IfPredicateInstr)
3177  PredicatedInstructions.push_back(Cloned);
3178 }
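// Editorial sketch (not part of the original source, names are illustrative):
// given the scalar instruction "%add = add i32 %a, %b" and
// Instance = {Part 1, Lane 2}, the code above emits
//   %add.cloned = add i32 %a.1.2, %b.1.2
// where each operand is the scalar value previously generated for that part
// and lane.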
3179 
3181  Value *End, Value *Step,
3182  Instruction *DL) {
3183  BasicBlock *Header = L->getHeader();
3184  BasicBlock *Latch = L->getLoopLatch();
3185  // As we're just creating this loop, it's possible no latch exists
3186  // yet. If so, use the header as this will be a single block loop.
3187  if (!Latch)
3188  Latch = Header;
3189 
3192  setDebugLocFromInst(Builder, OldInst);
3193  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
3194 
3196  setDebugLocFromInst(Builder, OldInst);
3197 
3198  // Create i+1 and fill the PHINode.
3199  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
3200  Induction->addIncoming(Start, L->getLoopPreheader());
3201  Induction->addIncoming(Next, Latch);
3202  // Create the compare.
3203  Value *ICmp = Builder.CreateICmpEQ(Next, End);
3204  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
3205 
3206  // Now we have two terminators. Remove the old one from the block.
3207  Latch->getTerminator()->eraseFromParent();
3208 
3209  return Induction;
3210 }
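// Editorial sketch (not part of the original source, assuming an i64
// induction with Start = 0, Step = VF * UF and End = n.vec): the generated
// loop control roughly looks like
//   %index      = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
//   ...
//   %index.next = add i64 %index, <VF * UF>
//   %cmp        = icmp eq i64 %index.next, %n.vec
//   br i1 %cmp, label %middle.block, label %vector.body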
3211 
3213  if (TripCount)
3214  return TripCount;
3215 
3217  // Find the loop boundaries.
3218  ScalarEvolution *SE = PSE.getSE();
3219  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3220  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
3221  "Invalid loop count");
3222 
3223  Type *IdxTy = Legal->getWidestInductionType();
3224 
3225  // The exit count might have the type of i64 while the phi is i32. This can
3226  // happen if we have an induction variable that is sign extended before the
3227  // compare. The only way we can get a backedge-taken count here is if the
3228  // induction variable was signed and as such will not overflow. In such a case,
3229  // truncation is legal.
3230  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
3231  IdxTy->getPrimitiveSizeInBits())
3232  BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3233  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3234 
3235  // Get the total trip count from the count by adding 1.
3236  const SCEV *ExitCount = SE->getAddExpr(
3237  BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3238 
3239  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3240 
3241  // Expand the trip count and place the new instructions in the preheader.
3242  // Notice that the pre-header does not change, only the loop body.
3243  SCEVExpander Exp(*SE, DL, "induction");
3244 
3245  // Count holds the overall loop count (N).
3246  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3248 
3249  if (TripCount->getType()->isPointerTy())
3250  TripCount =
3251  CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3253 
3254  return TripCount;
3255 }
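// Editorial note (not part of the original source): e.g. for a loop
// "for (i32 i = 0; i != n; ++i)" the backedge-taken count is n - 1 and the
// trip count computed above is (n - 1) + 1 == n. If SCEV reports the
// backedge-taken count as i64 while the widest induction type is i32, the
// count is truncated first, which is safe per the reasoning in the comment
// above.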
3256 
3258  if (VectorTripCount)
3259  return VectorTripCount;
3260 
3261  Value *TC = getOrCreateTripCount(L);
3263 
3264  // Now we need to generate the expression for the part of the loop that the
3265  // vectorized body will execute. This is equal to N - (N % Step) if scalar
3266  // iterations are not required for correctness, or N - Step, otherwise. Step
3267  // is equal to the vectorization factor (number of SIMD elements) times the
3268  // unroll factor (number of SIMD instructions).
3269  Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
3270  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3271 
3272  // If there is a non-reversed interleaved group that may speculatively access
3273  // memory out-of-bounds, we need to ensure that there will be at least one
3274  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
3275  // the trip count, we set the remainder to be equal to the step. If the step
3276  // does not evenly divide the trip count, no adjustment is necessary since
3277  // there will already be scalar iterations. Note that the minimum iterations
3278  // check ensures that N >= Step.
3279  if (VF > 1 && Legal->requiresScalarEpilogue()) {
3280  auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3281  R = Builder.CreateSelect(IsZero, Step, R);
3282  }
3283 
3284  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3285 
3286  return VectorTripCount;
3287 }
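// Editorial note (not part of the original source): for example, with a trip
// count N = 17, VF = 4 and UF = 2, Step = 8, so R = 17 urem 8 = 1 and
// n.vec = 16, leaving one scalar iteration. If N = 16 and a scalar epilogue
// is required, R is bumped from 0 to 8 so that n.vec = 8 and the epilogue
// still executes at least one iteration.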
3288 
3290  const DataLayout &DL) {
3291  // Verify that V is a vector type with same number of elements as DstVTy.
3292  unsigned VF = DstVTy->getNumElements();
3293  VectorType *SrcVecTy = cast<VectorType>(V->getType());
3294  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3295  Type *SrcElemTy = SrcVecTy->getElementType();
3296  Type *DstElemTy = DstVTy->getElementType();
3297  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3298  "Vector elements must have same size");
3299 
3300  // Do a direct cast if element types are castable.
3301  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3302  return Builder.CreateBitOrPointerCast(V, DstVTy);
3303  }
3304  // V cannot be directly cast to the desired vector type.
3305  // May happen when V is a floating point vector but DstVTy is a vector of
3306  // pointers or vice-versa. Handle this using a two-step bitcast using an
3307  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
3308  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3309  "Only one type should be a pointer type");
3310  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3311  "Only one type should be a floating point type");
3312  Type *IntTy =
3314  VectorType *VecIntTy = VectorType::get(IntTy, VF);
3315  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3316  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
3317 }
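// Editorial sketch (not part of the original source, assuming 64-bit
// pointers): casting V : <4 x double> to DstVTy : <4 x i8*> takes the
// two-step path above, roughly
//   %t = bitcast <4 x double> %v to <4 x i64>
//   %r = inttoptr <4 x i64> %t to <4 x i8*>
// because a direct bitcast between floating-point and pointer vectors is not
// valid IR.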
3318 
3320  BasicBlock *Bypass) {
3321  Value *Count = getOrCreateTripCount(L);
3322  BasicBlock *BB = L->getLoopPreheader();
3324 
3325  // Generate code to check if the loop's trip count is less than VF * UF, or
3326  // equal to it in case a scalar epilogue is required; this implies that the
3327  // vector trip count is zero. This check also covers the case where adding one
3328  // to the backedge-taken count overflows, leading to an incorrect trip count
3329  // of zero. In this case we will also jump to the scalar loop.
3332  Value *CheckMinIters = Builder.CreateICmp(
3333  P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
3334 
3335  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
3336  // Update dominator tree immediately if the generated block is a
3337  // LoopBypassBlock because SCEV expansions to generate loop bypass
3338  // checks may query it before the current function is finished.
3339  DT->addNewBlock(NewBB, BB);
3340  if (L->getParentLoop())
3341  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3342  ReplaceInstWithInst(BB->getTerminator(),
3343  BranchInst::Create(Bypass, NewBB, CheckMinIters));
3344  LoopBypassBlocks.push_back(BB);
3345 }
3346 
3348  BasicBlock *BB = L->getLoopPreheader();
3349 
3350  // Generate the code to check the SCEV assumptions that we have made.
3351  // We want the new basic block to start at the first instruction in a
3352  // sequence of instructions that form a check.
3353  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
3354  "scev.check");
3355  Value *SCEVCheck =
3356  Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
3357 
3358  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
3359  if (C->isZero())
3360  return;
3361 
3362  // Create a new block containing the stride check.
3363  BB->setName("vector.scevcheck");
3364  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
3365  // Update dominator tree immediately if the generated block is a
3366  // LoopBypassBlock because SCEV expansions to generate loop bypass
3367  // checks may query it before the current function is finished.
3368  DT->addNewBlock(NewBB, BB);
3369  if (L->getParentLoop())
3370  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3371  ReplaceInstWithInst(BB->getTerminator(),
3372  BranchInst::Create(Bypass, NewBB, SCEVCheck));
3373  LoopBypassBlocks.push_back(BB);
3374  AddedSafetyChecks = true;
3375 }
3376 
3378  BasicBlock *BB = L->getLoopPreheader();
3379 
3380  // Generate the code that checks at runtime whether arrays overlap. We put the
3381  // checks into a separate block to make the more common case of few elements
3382  // faster.
3383  Instruction *FirstCheckInst;
3384  Instruction *MemRuntimeCheck;
3385  std::tie(FirstCheckInst, MemRuntimeCheck) =
3387  if (!MemRuntimeCheck)
3388  return;
3389 
3390  // Create a new block containing the memory check.
3391  BB->setName("vector.memcheck");
3392  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
3393  // Update dominator tree immediately if the generated block is a
3394  // LoopBypassBlock because SCEV expansions to generate loop bypass
3395  // checks may query it before the current function is finished.
3396  DT->addNewBlock(NewBB, BB);
3397  if (L->getParentLoop())
3398  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3399  ReplaceInstWithInst(BB->getTerminator(),
3400  BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
3401  LoopBypassBlocks.push_back(BB);
3402  AddedSafetyChecks = true;
3403 
3404  // We currently don't use LoopVersioning for the actual loop cloning but we
3405  // still use it to add the noalias metadata.
3406  LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
3407  PSE.getSE());
3408  LVer->prepareNoAliasMetadata();
3409 }
3410 
3412  /*
3413  In this function we generate a new loop. The new loop will contain
3414  the vectorized instructions while the old loop will continue to run the
3415  scalar remainder.
3416 
3417  [ ] <-- loop iteration number check.
3418  / |
3419  / v
3420  | [ ] <-- vector loop bypass (may consist of multiple blocks).
3421  | / |
3422  | / v
3423  || [ ] <-- vector pre header.
3424  |/ |
3425  | v
3426  | [ ] \
3427  | [ ]_| <-- vector loop.
3428  | |
3429  | v
3430  | -[ ] <--- middle-block.
3431  | / |
3432  | / v
3433  -|- >[ ] <--- new preheader.
3434  | |
3435  | v
3436  | [ ] \
3437  | [ ]_| <-- old scalar loop to handle remainder.
3438  \ |
3439  \ v
3440  >[ ] <-- exit block.
3441  ...
3442  */
3443 
3444  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
3445  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
3446  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
3447  assert(VectorPH && "Invalid loop structure");
3448  assert(ExitBlock && "Must have an exit block");
3449 
3450  // Some loops have a single integer induction variable, while other loops
3451  // don't. One example is C++ iterators that often have multiple pointer
3452  // induction variables. In the code below we also support a case where we
3453  // don't have a single induction variable.
3454  //
3455  // We try to obtain an induction variable from the original loop as hard
3456  // as possible. However if we don't find one that:
3457  // - is an integer
3458  // - counts from zero, stepping by one
3459  // - is the size of the widest induction variable type
3460  // then we create a new one.
3462  Type *IdxTy = Legal->getWidestInductionType();
3463 
3464  // Split the single block loop into the two loop structure described above.
3465  BasicBlock *VecBody =
3466  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
3467  BasicBlock *MiddleBlock =
3468  VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
3469  BasicBlock *ScalarPH =
3470  MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
3471 
3472  // Create and register the new vector loop.
3473  Loop *Lp = LI->AllocateLoop();
3474  Loop *ParentLoop = OrigLoop->getParentLoop();
3475 
3476  // Insert the new loop into the loop nest and register the new basic blocks
3477  // before calling any utilities such as SCEV that require valid LoopInfo.
3478  if (ParentLoop) {
3479  ParentLoop->addChildLoop(Lp);
3480  ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
3481  ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
3482  } else {
3483  LI->addTopLevelLoop(Lp);
3484  }
3485  Lp->addBasicBlockToLoop(VecBody, *LI);
3486 
3487  // Find the loop boundaries.
3488  Value *Count = getOrCreateTripCount(Lp);
3489 
3490  Value *StartIdx = ConstantInt::get(IdxTy, 0);
3491 
3492  // Now, compare the new count to zero. If it is zero skip the vector loop and
3493  // jump to the scalar loop. This check also covers the case where the
3494  // backedge-taken count is uint##_max: adding one to it will overflow leading
3495  // to an incorrect trip count of zero. In this (rare) case we will also jump
3496  // to the scalar loop.
3497  emitMinimumIterationCountCheck(Lp, ScalarPH);
3498 
3499  // Generate the code to check any assumptions that we've made for SCEV
3500  // expressions.
3501  emitSCEVChecks(Lp, ScalarPH);
3502 
3503  // Generate the code that checks at runtime whether arrays overlap. We put the
3504  // checks into a separate block to make the more common case of few elements
3505  // faster.
3506  emitMemRuntimeChecks(Lp, ScalarPH);
3507 
3508  // Generate the induction variable.
3509  // The loop step is equal to the vectorization factor (num of SIMD elements)
3510  // times the unroll factor (num of SIMD instructions).
3511  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3512  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3513  Induction =
3514  createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3516 
3517  // We are going to resume the execution of the scalar loop.
3518  // Go over all of the induction variables that we found and fix the
3519  // PHIs that are left in the scalar version of the loop.
3520  // The starting values of PHI nodes depend on the counter of the last
3521  // iteration in the vectorized loop.
3522  // If we come from a bypass edge then we need to start from the original
3523  // start value.
3524 
3525  // This variable saves the new starting index for the scalar loop. It is used
3526  // to test if there are any tail iterations left once the vector loop has
3527  // completed.
3529  for (auto &InductionEntry : *List) {
3530  PHINode *OrigPhi = InductionEntry.first;
3531  InductionDescriptor II = InductionEntry.second;
3532 
3533  // Create phi nodes to merge from the backedge-taken check block.
3534  PHINode *BCResumeVal = PHINode::Create(
3535  OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
3536  Value *&EndValue = IVEndValues[OrigPhi];
3537  if (OrigPhi == OldInduction) {
3538  // We know what the end value is.
3539  EndValue = CountRoundDown;
3540  } else {
3542  Type *StepType = II.getStep()->getType();
3543  Instruction::CastOps CastOp =
3544  CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3545  Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3546  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3547  EndValue = II.transform(B, CRD, PSE.getSE(), DL);
3548  EndValue->setName("ind.end");
3549  }
3550 
3551  // The new PHI merges the original incoming value, in case of a bypass,
3552  // or the value at the end of the vectorized loop.
3553  BCResumeVal->addIncoming(EndValue, MiddleBlock);
3554 
3555  // Fix the scalar body counter (PHI node).
3556  unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
3557 
3558  // The old induction's phi node in the scalar body needs the truncated
3559  // value.
3560  for (BasicBlock *BB : LoopBypassBlocks)
3561  BCResumeVal->addIncoming(II.getStartValue(), BB);
3562  OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
3563  }
3564 
3565  // Add a check in the middle block to see if we have completed
3566  // all of the iterations in the first vector loop.
3567  // If (N - N%VF) == N, then we *don't* need to run the remainder.
3568  Value *CmpN =
3569  CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3570  CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3571  ReplaceInstWithInst(MiddleBlock->getTerminator(),
3572  BranchInst::Create(ExitBlock, ScalarPH, CmpN));
3573 
3574  // Get ready to start creating new instructions into the vectorized body.
3576 
3577  // Save the state.
3579  LoopScalarPreHeader = ScalarPH;
3580  LoopMiddleBlock = MiddleBlock;
3581  LoopExitBlock = ExitBlock;
3582  LoopVectorBody = VecBody;
3583  LoopScalarBody = OldBasicBlock;
3584 
3585  // Keep all loop hints from the original loop on the vector loop (we'll
3586  // replace the vectorizer-specific hints below).
3587  if (MDNode *LID = OrigLoop->getLoopID())
3588  Lp->setLoopID(LID);
3589 
3590  LoopVectorizeHints Hints(Lp, true, *ORE);
3591  Hints.setAlreadyVectorized();
3592 
3593  return LoopVectorPreHeader;
3594 }
3595 
3596 // Fix up external users of the induction variable. At this point, we are
3597 // in LCSSA form, with all external PHIs that use the IV having one input value,
3598 // coming from the remainder loop. We need those PHIs to also have a correct
3599 // value for the IV when arriving directly from the middle block.
3601  const InductionDescriptor &II,
3602  Value *CountRoundDown, Value *EndValue,
3603  BasicBlock *MiddleBlock) {
3604  // There are two kinds of external IV usages - those that use the value
3605  // computed in the last iteration (the PHI) and those that use the penultimate
3606  // value (the value that feeds into the phi from the loop latch).
3607  // We allow both, but they, obviously, have different values.
3608 
3609  assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3610 
3611  DenseMap<Value *, Value *> MissingVals;
3612 
3613  // An external user of the last iteration's value should see the value that
3614  // the remainder loop uses to initialize its own IV.
3616  for (User *U : PostInc->users()) {
3617  Instruction *UI = cast<Instruction>(U);
3618  if (!OrigLoop->contains(UI)) {
3619  assert(isa<PHINode>(UI) && "Expected LCSSA form");
3620  MissingVals[UI] = EndValue;
3621  }
3622  }
3623 
3624  // An external user of the penultimate value needs to see EndValue - Step.
3625  // The simplest way to get this is to recompute it from the constituent SCEVs,
3626  // that is Start + (Step * (CRD - 1)).
3627  for (User *U : OrigPhi->users()) {
3628  auto *UI = cast<Instruction>(U);
3629  if (!OrigLoop->contains(UI)) {
3630  const DataLayout &DL =
3632  assert(isa<PHINode>(UI) && "Expected LCSSA form");
3633 
3634  IRBuilder<> B(MiddleBlock->getTerminator());
3635  Value *CountMinusOne = B.CreateSub(
3636  CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3637  Value *CMO =
3638  !II.getStep()->getType()->isIntegerTy()
3639  ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3640  II.getStep()->getType())
3641  : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3642  CMO->setName("cast.cmo");
3643  Value *Escape = II.transform(B, CMO, PSE.getSE(), DL);
3644  Escape->setName("ind.escape");
3645  MissingVals[UI] = Escape;
3646  }
3647  }
3648 
3649  for (auto &I : MissingVals) {
3650  PHINode *PHI = cast<PHINode>(I.first);
3651  // One corner case we have to handle is two IVs "chasing" each other,
3652  // that is %IV2 = phi [...], [ %IV1, %latch ]
3653  // In this case, if IV1 has an external use, we need to avoid adding both
3654  // "last value of IV1" and "penultimate value of IV2". So, verify that we
3655  // don't already have an incoming value for the middle block.
3656  if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3657  PHI->addIncoming(I.second, MiddleBlock);
3658  }
3659 }
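// Editorial note (not part of the original source): as a concrete instance of
// the Start + (Step * (CRD - 1)) formula above, an IV with Start = 0 and
// Step = 2 and a vector trip count CRD = 8 gives an escape value of
// 0 + 2 * (8 - 1) = 14, the IV value of the last iteration executed by the
// vector loop, which is what an external user of the pre-increment PHI
// expects.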
3660 
3661 namespace {
3662 
3663 struct CSEDenseMapInfo {
3664  static bool canHandle(const Instruction *I) {
3665  return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3666  isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3667  }
3668 
3669  static inline Instruction *getEmptyKey() {
3670  return DenseMapInfo<Instruction *>::getEmptyKey();
3671  }
3672 
3673  static inline Instruction *getTombstoneKey() {
3674  return DenseMapInfo<Instruction *>::getTombstoneKey();
3675  }
3676 
3677  static unsigned getHashValue(const Instruction *I) {
3678  assert(canHandle(I) && "Unknown instruction!");
3679  return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3680  I->value_op_end()));
3681  }
3682 
3683  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3684  if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3685  LHS == getTombstoneKey() || RHS == getTombstoneKey())
3686  return LHS == RHS;
3687  return LHS->isIdenticalTo(RHS);
3688  }
3689 };
3690 
3691 } // end anonymous namespace
3692 
3693 ///\brief Perform CSE of induction variable instructions.
3694 static void cse(BasicBlock *BB) {
3695  // Perform simple cse.
3697  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3698  Instruction *In = &*I++;
3699 
3700  if (!CSEDenseMapInfo::canHandle(In))
3701  continue;
3702 
3703  // Check if we can replace this instruction with any of the
3704  // visited instructions.
3705  if (Instruction *V = CSEMap.lookup(In)) {
3706  In->replaceAllUsesWith(V);
3707  In->eraseFromParent();
3708  continue;
3709  }
3710 
3711  CSEMap[In] = In;
3712  }
3713 }
3714 
3715 /// \brief Estimate the overhead of scalarizing an instruction. This is a
3716 /// convenience wrapper for the type-based getScalarizationOverhead API.
3717 static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
3718  const TargetTransformInfo &TTI) {
3719  if (VF == 1)
3720  return 0;
3721 
3722  unsigned Cost = 0;
3723  Type *RetTy = ToVectorTy(I->getType(), VF);
3724  if (!RetTy->isVoidTy() &&
3725  (!isa<LoadInst>(I) ||
3726  !TTI.supportsEfficientVectorElementLoadStore()))
3727  Cost += TTI.getScalarizationOverhead(RetTy, true, false);
3728 
3729  if (CallInst *CI = dyn_cast<CallInst>(I)) {
3730  SmallVector<const Value *, 4> Operands(CI->arg_operands());
3731  Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
3732  }
3733  else if (!isa<StoreInst>(I) ||
3734  !TTI.supportsEfficientVectorElementLoadStore()) {
3735  SmallVector<const Value *, 4> Operands(I->operand_values());
3736  Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
3737  }
3738 
3739  return Cost;
3740 }
3741 
3742 // Estimate cost of a call instruction CI if it were vectorized with factor VF.
3743 // Return the cost of the instruction, including scalarization overhead if it's
3744 // needed. The flag NeedToScalarize shows if the call needs to be scalarized -
3745 // i.e. either a vector version isn't available, or it is too expensive.
3746 static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
3747  const TargetTransformInfo &TTI,
3748  const TargetLibraryInfo *TLI,
3749  bool &NeedToScalarize) {
3750  Function *F = CI->getCalledFunction();
3751  StringRef FnName = CI->getCalledFunction()->getName();
3752  Type *ScalarRetTy = CI->getType();
3753  SmallVector<Type *, 4> Tys, ScalarTys;
3754  for (auto &ArgOp : CI->arg_operands())
3755  ScalarTys.push_back(ArgOp->getType());
3756 
3757  // Estimate cost of scalarized vector call. The source operands are assumed
3758 // to be vectors, so we need to extract individual elements from them,
3759  // execute VF scalar calls, and then gather the result into the vector return
3760  // value.
3761  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3762  if (VF == 1)
3763  return ScalarCallCost;
3764 
3765  // Compute corresponding vector type for return value and arguments.
3766  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3767  for (Type *ScalarTy : ScalarTys)
3768  Tys.push_back(ToVectorTy(ScalarTy, VF));
3769 
3770  // Compute costs of unpacking argument values for the scalar calls and
3771  // packing the return values to a vector.
3772  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
3773 
3774  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3775 
3776  // If we can't emit a vector call for this function, then the currently found
3777  // cost is the cost we need to return.
3778  NeedToScalarize = true;
3779  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3780  return Cost;
3781 
3782  // If the corresponding vector cost is cheaper, return its cost.
3783  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3784  if (VectorCallCost < Cost) {
3785  NeedToScalarize = false;
3786  return VectorCallCost;
3787  }
3788  return Cost;
3789 }
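// Editorial note (not part of the original source): as a rough example, if
// the scalar call costs 10, VF = 4, and the scalarization overhead
// (extracting arguments and inserting results) is 6, the scalarized cost
// computed above is 10 * 4 + 6 = 46. If TLI reports a vectorized variant
// whose call cost is, say, 18, that cheaper cost is returned and
// NeedToScalarize is cleared.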
3790 
3791 // Estimate cost of an intrinsic call instruction CI if it were vectorized with
3792 // factor VF. Return the cost of the instruction, including scalarization
3793 // overhead if it's needed.
3794 static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
3795  const TargetTransformInfo &TTI,
3796  const TargetLibraryInfo *TLI) {
3798  assert(ID && "Expected intrinsic call!");
3799 
3800  FastMathFlags FMF;
3801  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3802  FMF = FPMO->getFastMathFlags();
3803 
3804  SmallVector<Value *, 4> Operands(CI->arg_operands());
3805  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3806 }
3807 
3809  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3810  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3811  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3812 }
3814  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3815  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3816  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3817 }
3818 
3820  // For every instruction `I` in MinBWs, truncate the operands, create a
3821  // truncated version of `I` and reextend its result. InstCombine runs
3822  // later and will remove any ext/trunc pairs.
3823  SmallPtrSet<Value *, 4> Erased;
3824  for (const auto &KV : Cost->getMinimalBitwidths()) {
3825  // If the value wasn't vectorized, we must maintain the original scalar
3826  // type. The absence of the value from VectorLoopValueMap indicates that it
3827  // wasn't vectorized.
3828  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3829  continue;
3830  for (unsigned Part = 0; Part < UF; ++Part) {
3831  Value *I = getOrCreateVectorValue(KV.first, Part);
3832  if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3833  continue;
3834  Type *OriginalTy = I->getType();
3835  Type *ScalarTruncatedTy =
3836  IntegerType::get(OriginalTy->getContext(), KV.second);
3837  Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3838  OriginalTy->getVectorNumElements());
3839  if (TruncatedTy == OriginalTy)
3840  continue;
3841 
3842  IRBuilder<> B(cast<Instruction>(I));
3843  auto ShrinkOperand = [&](Value *V) -> Value * {
3844  if (auto *ZI = dyn_cast<ZExtInst>(V))
3845  if (ZI->getSrcTy() == TruncatedTy)
3846  return ZI->getOperand(0);
3847  return B.CreateZExtOrTrunc(V, TruncatedTy);
3848  };
3849 
3850  // The actual instruction modification depends on the instruction type,
3851  // unfortunately.
3852  Value *NewI = nullptr;
3853  if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3854  NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3855  ShrinkOperand(BO->getOperand(1)));
3856 
3857  // Any wrapping introduced by shrinking this operation shouldn't be
3858  // considered undefined behavior. So, we can't unconditionally copy
3859  // arithmetic wrapping flags to NewI.
3860  cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3861  } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3862  NewI =
3863  B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3864  ShrinkOperand(CI->getOperand(1)));
3865  } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3866  NewI = B.CreateSelect(SI->getCondition(),
3867  ShrinkOperand(SI->getTrueValue()),
3868  ShrinkOperand(SI->getFalseValue()));
3869  } else if (auto *CI = dyn_cast<CastInst>(I)) {
3870  switch (CI->getOpcode()) {
3871  default:
3872  llvm_unreachable("Unhandled cast!");
3873  case Instruction::Trunc:
3874  NewI = ShrinkOperand(CI->getOperand(0));
3875  break;
3876  case Instruction::SExt:
3877  NewI = B.CreateSExtOrTrunc(
3878  CI->getOperand(0),
3879  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3880  break;
3881  case Instruction::ZExt:
3882  NewI = B.CreateZExtOrTrunc(
3883  CI->getOperand(0),
3884  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3885  break;
3886  }
3887  } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3888  auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3889  auto *O0 = B.CreateZExtOrTrunc(
3890  SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3891  auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3892  auto *O1 = B.CreateZExtOrTrunc(
3893  SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3894 
3895  NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3896  } else if (isa<LoadInst>(I)) {
3897  // Don't do anything with the operands, just extend the result.
3898  continue;
3899  } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3900  auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3901  auto *O0 = B.CreateZExtOrTrunc(
3902  IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3903  auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3904  NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3905  } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3906  auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3907  auto *O0 = B.CreateZExtOrTrunc(
3908  EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3909  NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3910  } else {
3911  llvm_unreachable("Unhandled instruction type!");
3912  }
3913 
3914  // Lastly, extend the result.
3915  NewI->takeName(cast<Instruction>(I));
3916  Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3917  I->replaceAllUsesWith(Res);
3918  cast<Instruction>(I)->eraseFromParent();
3919  Erased.insert(I);
3920  VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3921  }
3922  }
3923 
3924  // We'll have created a bunch of ZExts that are now parentless. Clean up.
3925  for (const auto &KV : Cost->getMinimalBitwidths()) {
3926  // If the value wasn't vectorized, we must maintain the original scalar
3927  // type. The absence of the value from VectorLoopValueMap indicates that it
3928  // wasn't vectorized.
3929  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3930  continue;
3931  for (unsigned Part = 0; Part < UF; ++Part) {
3932  Value *I = getOrCreateVectorValue(KV.first, Part);
3933  ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3934  if (Inst && Inst->use_empty()) {
3935  Value *NewI = Inst->getOperand(0);
3936  Inst->eraseFromParent();
3937  VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3938  }
3939  }
3940  }
3941 }
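// Editorial sketch (not part of the original source): if the cost model has
// proved that an i32 add only carries 8 meaningful bits, the loop above
// rewrites
//   %a = add <4 x i32> %x, %y
// into roughly
//   %x.tr = trunc <4 x i32> %x to <4 x i8>
//   %y.tr = trunc <4 x i32> %y to <4 x i8>
//   %a.tr = add <4 x i8> %x.tr, %y.tr
//   %a    = zext <4 x i8> %a.tr to <4 x i32>
// leaving InstCombine to remove any redundant ext/trunc pairs.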
3942 
3944  // Insert truncates and extends for any truncated instructions as hints to
3945  // InstCombine.
3946  if (VF > 1)
3948 
3949  // At this point every instruction in the original loop is widened to a
3950  // vector form. Now we need to fix the recurrences in the loop. These PHI
3951  // nodes are currently empty because we did not want to introduce cycles.
3952  // This is the second stage of vectorizing recurrences.
3954 
3955  // Update the dominator tree.
3956  //
3957  // FIXME: After creating the structure of the new loop, the dominator tree is
3958  // no longer up-to-date, and it remains that way until we update it
3959  // here. An out-of-date dominator tree is problematic for SCEV,
3960  // because SCEVExpander uses it to guide code generation. The
3961  // vectorizer use SCEVExpanders in several places. Instead, we should
3962  // keep the dominator tree up-to-date as we go.
3963  updateAnalysis();
3964 
3965  // Fix-up external users of the induction variables.
3966  for (auto &Entry : *Legal->getInductionVars())
3967  fixupIVUsers(Entry.first, Entry.second,
3969  IVEndValues[Entry.first], LoopMiddleBlock);
3970 
3971  fixLCSSAPHIs();
3973  sinkScalarOperands(&*PI);
3974 
3975  // Remove redundant induction instructions.
3977 }
3978 
3980  // In order to support recurrences we need to be able to vectorize Phi nodes.
3981  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3982  // stage #2: We now need to fix the recurrences by adding incoming edges to
3983  // the currently empty PHI nodes. At this point every instruction in the
3984  // original loop is widened to a vector form so we can use them to construct
3985  // the incoming edges.
3986  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3987  // Handle first-order recurrences and reductions that need to be fixed.
3988  if (Legal->isFirstOrderRecurrence(&Phi))
3990  else if (Legal->isReductionVariable(&Phi))
3991  fixReduction(&Phi);
3992  }
3993 }
3994 
3996  // This is the second phase of vectorizing first-order recurrences. An
3997  // overview of the transformation is described below. Suppose we have the
3998  // following loop.
3999  //
4000  // for (int i = 0; i < n; ++i)
4001  // b[i] = a[i] - a[i - 1];
4002  //
4003  // There is a first-order recurrence on "a". For this loop, the shorthand
4004  // scalar IR looks like:
4005  //
4006  // scalar.ph:
4007  // s_init = a[-1]
4008  // br scalar.body
4009  //
4010  // scalar.body:
4011  // i = phi [0, scalar.ph], [i+1, scalar.body]
4012  // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4013  // s2 = a[i]
4014  // b[i] = s2 - s1
4015  // br cond, scalar.body, ...
4016  //
4017  // In this example, s1 is a recurrence because its value depends on the
4018  // previous iteration. In the first phase of vectorization, we created a
4019  // temporary value for s1. We now complete the vectorization and produce the
4020  // shorthand vector IR shown below (for VF = 4, UF = 1).
4021  //
4022  // vector.ph:
4023  // v_init = vector(..., ..., ..., a[-1])
4024  // br vector.body
4025  //
4026  // vector.body
4027  // i = phi [0, vector.ph], [i+4, vector.body]
4028  // v1 = phi [v_init, vector.ph], [v2, vector.body]
4029  // v2 = a[i, i+1, i+2, i+3];
4030  // v3 = vector(v1(3), v2(0, 1, 2))
4031  // b[i, i+1, i+2, i+3] = v2 - v3
4032  // br cond, vector.body, middle.block
4033  //
4034  // middle.block:
4035  // x = v2(3)
4036  // br scalar.ph
4037  //
4038  // scalar.ph:
4039  // s_init = phi [x, middle.block], [a[-1], otherwise]
4040  // br scalar.body
4041  //
4042  // After execution completes the vector loop, we extract the next value of
4043  // the recurrence (x) to use as the initial value in the scalar loop.
4044 
4045  // Get the original loop preheader and single loop latch.
4046  auto *Preheader = OrigLoop->getLoopPreheader();
4047  auto *Latch = OrigLoop->getLoopLatch();
4048 
4049  // Get the initial and previous values of the scalar recurrence.
4050  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4051  auto *Previous = Phi->getIncomingValueForBlock(Latch);
4052 
4053  // Create a vector from the initial value.
4054  auto *VectorInit = ScalarInit;
4055  if (VF > 1) {
4057  VectorInit = Builder.CreateInsertElement(
4058  UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
4059  Builder.getInt32(VF - 1), "vector.recur.init");
4060  }
4061 
4062  // We constructed a temporary phi node in the first phase of vectorization.
4063  // This phi node will eventually be deleted.
4065  cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
4066 
4067  // Create a phi node for the new recurrence. The current value will either be
4068  // the initial value inserted into a vector or loop-varying vector value.
4069  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4070  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4071 
4072  // Get the vectorized previous value of the last part UF - 1. It appears last
4073  // among all unrolled iterations, due to the order of their construction.
4074  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
4075 
4076  // Set the insertion point after the previous value if it is an instruction.
4077  // Note that the previous value may have been constant-folded so it is not
4078  // guaranteed to be an instruction in the vector loop. Also, if the previous
4079  // value is a phi node, we should insert after all the phi nodes to avoid
4080  // breaking basic block verification.
4081  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
4082  isa<PHINode>(PreviousLastPart))
4084  else
4086  &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
4087 
4088  // We will construct a vector for the recurrence by combining the values for
4089  // the current and previous iterations. This is the required shuffle mask.
4090  SmallVector<Constant *, 8> ShuffleMask(VF);
4091  ShuffleMask[0] = Builder.getInt32(VF - 1);
4092  for (unsigned I = 1; I < VF; ++I)
4093  ShuffleMask[I] = Builder.getInt32(I + VF - 1);
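// Editorial note (not part of the original source): for VF = 4 this mask is
// <3, 4, 5, 6>, so the shuffle below takes the last element of the previous
// vector (v1(3) in the example above) followed by the first three elements
// of the current vector (v2(0, 1, 2)).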
4094 
4095  // The vector from which to take the initial value for the current iteration
4096  // (actual or unrolled). Initially, this is the vector phi node.
4097  Value *Incoming = VecPhi;
4098 
4099  // Shuffle the current and previous vector and update the vector parts.
4100  for (unsigned Part = 0; Part < UF; ++Part) {
4101  Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
4102  Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
4103  auto *Shuffle =
4104  VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
4105  ConstantVector::get(ShuffleMask))
4106  : Incoming;
4107  PhiPart->replaceAllUsesWith(Shuffle);
4108  cast<Instruction>(PhiPart)->eraseFromParent();
4109  VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
4110  Incoming = PreviousPart;
4111  }
4112 
4113  // Fix the latch value of the new recurrence in the vector loop.
4114  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4115 
4116  // Extract the last vector element in the middle block. This will be the
4117  // initial value for the recurrence when jumping to the scalar loop.
4118  auto *ExtractForScalar = Incoming;
4119  if (VF > 1) {
4121  ExtractForScalar = Builder.CreateExtractElement(
4122  ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
4123  }
4124  // Extract the second-to-last element in the middle block if the
4125  // Phi is used outside the loop. We need to extract the phi itself
4126  // and not the last element (the phi update in the current iteration). This
4127  // will be the value when jumping to the exit block from the LoopMiddleBlock,
4128  // when the scalar loop is not run at all.
4129  Value *ExtractForPhiUsedOutsideLoop = nullptr;
4130  if (VF > 1)
4131  ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4132  Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
4133  // When the loop is unrolled without vectorizing, initialize
4134  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value
4135  // of `Incoming`. This is analogous to the vectorized case above: extracting the
4136  // second-to-last element when VF > 1.
4137  else if (UF > 1)
4138  ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
4139 
4140  // Fix the initial value of the original recurrence in the scalar loop.
4142  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4143  for (auto *BB : predecessors(LoopScalarPreHeader)) {
4144  auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4145  Start->addIncoming(Incoming, BB);
4146  }
4147 
4149  Phi->setName("scalar.recur");
4150 
4151  // Finally, fix users of the recurrence outside the loop. The users will need
4152  // either the last value of the scalar recurrence or the last value of the
4153  // vector recurrence we extracted in the middle block. Since the loop is in
4154  // LCSSA form, we just need to find the phi node for the original scalar
4155  // recurrence in the exit block, and then add an edge for the middle block.
4156  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4157  if (LCSSAPhi.getIncomingValue(0) == Phi) {
4158  LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4159  break;
4160  }
4161  }
4162 }
4163 
4165  Constant *Zero = Builder.getInt32(0);
4166 
4167  // Get its reduction variable descriptor.
4169  "Unable to find the reduction variable");
4170  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
4171 
4173  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4174  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4176  RdxDesc.getMinMaxRecurrenceKind();
4177  setDebugLocFromInst(Builder, ReductionStartValue);
4178 
4179  // We need to generate a reduction vector from the incoming scalar.
4180  // To do so, we need to generate the 'identity' vector and override
4181  // one of the elements with the incoming scalar reduction. We need
4182  // to do it in the vector-loop preheader.
4184 
4185  // This is the vector-clone of the value that leaves the loop.
4186  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
4187 
4188  // Find the reduction identity variable. Zero for addition, OR and XOR,
4189  // one for multiplication, -1 for AND.
4190  Value *Identity;
4191  Value *VectorStart;
4194  // MinMax reductions have the start value as their identity.
4195  if (VF == 1) {
4196  VectorStart = Identity = ReductionStartValue;
4197  } else {
4198  VectorStart = Identity =
4199  Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
4200  }
4201  } else {
4202  // Handle other reduction kinds:
4204  RK, VecTy->getScalarType());
4205  if (VF == 1) {
4206  Identity = Iden;
4207  // This vector is the Identity vector where the first element is the
4208  // incoming scalar reduction.
4209  VectorStart = ReductionStartValue;
4210  } else {
4211  Identity = ConstantVector::getSplat(VF, Iden);
4212 
4213  // This vector is the Identity vector where the first element is the
4214  // incoming scalar reduction.
4215  VectorStart =
4216  Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
4217  }
4218  }
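// Editorial note (not part of the original source): e.g. for an integer add
// reduction with a start value of 5 and VF = 4, Identity is <0, 0, 0, 0> and
// VectorStart is <5, 0, 0, 0>; for a min/max reduction both are simply a
// splat of the start value, since min/max(x, x) == x.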
4219 
4220  // Fix the vector-loop phi.
4221 
4222  // Reductions do not have to start at zero. They can start with
4223  // any loop invariant values.
4224  BasicBlock *Latch = OrigLoop->getLoopLatch();
4225  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4226  for (unsigned Part = 0; Part < UF; ++Part) {
4227  Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
4228  Value *Val = getOrCreateVectorValue(LoopVal, Part);
4229  // Make sure to add the reduction start value only to the
4230  // first unroll part.
4231  Value *StartVal = (Part == 0) ? VectorStart : Identity;
4232  cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
4233  cast<PHINode>(VecRdxPhi)
4234  ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4235  }
4236 
4237  // Before each round, move the insertion point right between
4238  // the PHIs and the values we are going to write.
4239  // This allows us to write both PHINodes and the extractelement
4240  // instructions.
4242 
4243  setDebugLocFromInst(Builder, LoopExitInst);
4244 
4245  // If the vector reduction can be performed in a smaller type, we truncate
4246  // then extend the loop exit value to enable InstCombine to evaluate the
4247  // entire expression in the smaller type.
4248  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
4249  Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4252  VectorParts RdxParts(UF);
4253  for (unsigned Part = 0; Part < UF; ++Part) {
4254  RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4255  Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4256  Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4257  : Builder.CreateZExt(Trunc, VecTy);
4258  for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4259  UI != RdxParts[Part]->user_end();)
4260  if (*UI != Trunc) {
4261  (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4262  RdxParts[Part] = Extnd;
4263  } else {
4264  ++UI;
4265  }
4266  }
4268  for (unsigned Part = 0; Part < UF; ++Part) {
4269  RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4270  VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4271  }
4272  }
4273 
4274  // Reduce all of the unrolled parts into a single vector.
4275  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4277  setDebugLocFromInst(Builder, ReducedPartRdx);
4278  for (unsigned Part = 1; Part < UF; ++Part) {
4279  Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4280  if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4281  // Floating point operations had to be 'fast' to enable the reduction.
4282  ReducedPartRdx = addFastMathFlag(
4284  ReducedPartRdx, "bin.rdx"));
4285  else
4286  ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
4287  Builder, MinMaxKind, ReducedPartRdx, RdxPart);
4288  }
4289 
4290  if (VF > 1) {
4291  bool NoNaN = Legal->hasFunNoNaNAttr();
4292  ReducedPartRdx =
4293  createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4294  // If the reduction can be performed in a smaller type, we need to extend
4295  // the reduction to the wider type before we branch to the original loop.
4296  if (Phi->getType() != RdxDesc.getRecurrenceType())
4297  ReducedPartRdx =
4298  RdxDesc.isSigned()
4299  ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4300  : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4301  }
4302 
4303  // Create a phi node that merges control-flow from the backedge-taken check
4304  // block and the middle block.
4305  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4307  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4308  BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4309  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4310 
4311  // Now, we need to fix the users of the reduction variable
4312  // inside and outside of the scalar remainder loop.
4313  // We know that the loop is in LCSSA form. We need to update the
4314  // PHI nodes in the exit blocks.
4315  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4316  // All PHINodes need to have a single entry edge, or two if
4317  // we already fixed them.
4318  assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4319 
4320  // We found a reduction value exit-PHI. Update it with the
4321  // incoming bypass edge.
4322  if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4323  LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4324  } // end of the LCSSA phi scan.
4325 
4326  // Fix the scalar loop reduction variable with the incoming reduction sum
4327  // from the vector body and from the backedge value.
4328  int IncomingEdgeBlockIdx =
4330  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4331  // Pick the other block.
4332  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4333  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4334  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4335 }
4336 
4338  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4339  if (LCSSAPhi.getNumIncomingValues() == 1) {
4340  assert(OrigLoop->isLoopInvariant(LCSSAPhi.getIncomingValue(0)) &&
4341  "Incoming value isn't loop invariant");
4342  LCSSAPhi.addIncoming(LCSSAPhi.getIncomingValue(0), LoopMiddleBlock);
4343  }
4344  }
4345 }
4346 
4348  // The basic block and loop containing the predicated instruction.
4349  auto *PredBB = PredInst->getParent();
4350  auto *VectorLoop = LI->getLoopFor(PredBB);
4351 
4352  // Initialize a worklist with the operands of the predicated instruction.
4353  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4354 
4355  // Holds instructions that we need to analyze again. An instruction may be
4356  // reanalyzed if we don't yet know if we can sink it or not.
4357  SmallVector<Instruction *, 8> InstsToReanalyze;
4358 
4359  // Returns true if a given use occurs in the predicated block. Phi nodes use
4360  // their operands in their corresponding predecessor blocks.
4361  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4362  auto *I = cast<Instruction>(U.getUser());
4363  BasicBlock *BB = I->getParent();
4364  if (auto *Phi = dyn_cast<PHINode>(I))
4365  BB = Phi->getIncomingBlock(
4366  PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4367  return BB == PredBB;
4368  };
4369 
4370  // Iteratively sink the scalarized operands of the predicated instruction
4371  // into the block we created for it. When an instruction is sunk, its
4372  // operands are then added to the worklist. The algorithm ends when one pass
4373  // through the worklist fails to sink a single instruction.
4374  bool Changed;
4375  do {
4376  // Add the instructions that need to be reanalyzed to the worklist, and
4377  // reset the changed indicator.
4378  Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4379  InstsToReanalyze.clear();
4380  Changed = false;
4381 
4382  while (!Worklist.empty()) {
4383  auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4384 
4385  // We can't sink an instruction if it is a phi node, is already in the
4386  // predicated block, is not in the loop, or may have side effects.
4387  if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4388  !VectorLoop->contains(I) || I->mayHaveSideEffects())
4389  continue;
4390 
4391  // It's legal to sink the instruction if all its uses occur in the
4392  // predicated block. Otherwise, there's nothing to do yet, and we may
4393  // need to reanalyze the instruction.
4394  if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4395  InstsToReanalyze.push_back(I);
4396  continue;
4397  }
4398 
4399  // Move the instruction to the beginning of the predicated block, and add
4400  // its operands to the worklist.
4401  I->moveBefore(&*PredBB->getFirstInsertionPt());
4402  Worklist.insert(I->op_begin(), I->op_end());
4403 
4404  // The sinking may have enabled other instructions to be sunk, so we will
4405  // need to iterate.
4406  Changed = true;
4407  }
4408  } while (Changed);
4409 }
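// Editor's note (illustrative, not from the original source): this is a simple
// fixed-point worklist algorithm. For example, if an add feeds only a
// scalarized predicated store, the add is sunk into the predicated block; its
// operands are then re-queued and may themselves be sunk on a later pass,
// until an entire pass sinks nothing.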
4410 
4411 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4412  unsigned VF) {
4413  assert(PN->getParent() == OrigLoop->getHeader() &&
4414  "Non-header phis should have been handled elsewhere");
4415 
4416  PHINode *P = cast<PHINode>(PN);
4417  // In order to support recurrences we need to be able to vectorize Phi nodes.
4418  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4419  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4420  // this value when we vectorize all of the instructions that use the PHI.
4421  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4422  for (unsigned Part = 0; Part < UF; ++Part) {
4423  // This is phase one of vectorizing PHIs.
4424  Type *VecTy =
4425  (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4426  Value *EntryPart = PHINode::Create(
4427  VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4428  VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4429  }
4430  return;
4431  }
4432 
4433  setDebugLocFromInst(Builder, P);
4434 
4435  // This PHINode must be an induction variable.
4436  // Make sure that we know about it.
4437  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
4438 
4439  InductionDescriptor II = Legal->getInductionVars()->lookup(P);
4440  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4441 
4442  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4443  // which can be found from the original scalar operations.
4444  switch (II.getKind()) {
4445  case InductionDescriptor::IK_NoInduction:
4446  llvm_unreachable("Unknown induction");
4447  case InductionDescriptor::IK_IntInduction:
4448  case InductionDescriptor::IK_FpInduction:
4449  llvm_unreachable("Integer/fp induction is handled elsewhere.");
4450  case InductionDescriptor::IK_PtrInduction: {
4451  // Handle the pointer induction variable case.
4452  assert(P->getType()->isPointerTy() && "Unexpected type.");
4453  // This is the normalized GEP that starts counting at zero.
4454  Value *PtrInd = Induction;
4455  PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4456  // Determine the number of scalars we need to generate for each unroll
4457  // iteration. If the instruction is uniform, we only need to generate the
4458  // first lane. Otherwise, we generate all VF values.
4459  unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4460  // These are the scalar results. Notice that we don't generate vector GEPs
4461  // because scalar GEPs result in better code.
4462  for (unsigned Part = 0; Part < UF; ++Part) {
4463  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4464  Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4465  Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4466  Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
4467  SclrGep->setName("next.gep");
4468  VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4469  }
4470  }
4471  return;
4472  }
4473  }
4474 }
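// Editor's note (hypothetical example, names invented): for a pointer
// induction that is not uniform after vectorization, with VF = 4 and UF = 1,
// the IK_PtrInduction case above emits four scalar "next.gep" GEPs (lane
// indices 0..3 added to the normalized induction) rather than a single
// vector-of-pointers GEP.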
4475 
4476 /// A helper function for checking whether an integer division-related
4477 /// instruction may divide by zero (in which case it must be predicated if
4478 /// executed conditionally in the scalar code).
4479 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4480 /// Non-zero divisors that are not compile-time constants will not be
4481 /// converted into multiplication, so we will still end up scalarizing
4482 /// the division, but can do so w/o predication.
4483 static bool mayDivideByZero(Instruction &I) {
4484  assert((I.getOpcode() == Instruction::UDiv ||
4485  I.getOpcode() == Instruction::SDiv ||
4486  I.getOpcode() == Instruction::URem ||
4487  I.getOpcode() == Instruction::SRem) &&
4488  "Unexpected instruction");
4489  Value *Divisor = I.getOperand(1);
4490  auto *CInt = dyn_cast<ConstantInt>(Divisor);
4491  return !CInt || CInt->isZero();
4492 }
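// Editor's note (illustrative): "udiv i32 %x, %n" may divide by zero and must
// be predicated if it executes only conditionally in the scalar loop, whereas
// the check above returns false for "udiv i32 %x, 7", whose divisor is a
// non-zero compile-time constant.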
4493 
4494 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4495  switch (I.getOpcode()) {
4496  case Instruction::Br:
4497  case Instruction::PHI:
4498  llvm_unreachable("This instruction is handled by a different recipe.");
4499  case Instruction::GetElementPtr: {
4500  // Construct a vector GEP by widening the operands of the scalar GEP as
4501  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4502  // results in a vector of pointers when at least one operand of the GEP
4503  // is vector-typed. Thus, to keep the representation compact, we only use
4504  // vector-typed operands for loop-varying values.
4505  auto *GEP = cast<GetElementPtrInst>(&I);
4506 
4507  if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
4508  // If we are vectorizing, but the GEP has only loop-invariant operands,
4509  // the GEP we build (by only using vector-typed operands for
4510  // loop-varying values) would be a scalar pointer. Thus, to ensure we
4511  // produce a vector of pointers, we need to either arbitrarily pick an
4512  // operand to broadcast, or broadcast a clone of the original GEP.
4513  // Here, we broadcast a clone of the original.
4514  //
4515  // TODO: If at some point we decide to scalarize instructions having
4516  // loop-invariant operands, this special case will no longer be
4517  // required. We would add the scalarization decision to
4518  // collectLoopScalars() and teach getVectorValue() to broadcast
4519  // the lane-zero scalar value.
4520  auto *Clone = Builder.Insert(GEP->clone());
4521  for (unsigned Part = 0; Part < UF; ++Part) {
4522  Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4523  VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
4524  addMetadata(EntryPart, GEP);
4525  }
4526  } else {
4527  // If the GEP has at least one loop-varying operand, we are sure to
4528  // produce a vector of pointers. But if we are only unrolling, we want
4529  // to produce a scalar GEP for each unroll part. Thus, the GEP we
4530  // produce with the code below will be scalar (if VF == 1) or vector
4531  // (otherwise). Note that for the unroll-only case, we still maintain
4532  // values in the vector mapping with setVectorValue, as we do for other
4533  // instructions.
4534  for (unsigned Part = 0; Part < UF; ++Part) {
4535  // The pointer operand of the new GEP. If it's loop-invariant, we
4536  // won't broadcast it.
4537  auto *Ptr =
4538  OrigLoop->isLoopInvariant(GEP->getPointerOperand())
4539  ? GEP->getPointerOperand()
4540  : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4541 
4542  // Collect all the indices for the new GEP. If any index is
4543  // loop-invariant, we won't broadcast it.
4544  SmallVector<Value *, 4> Indices;
4545  for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
4546  if (OrigLoop->isLoopInvariant(U.get()))
4547  Indices.push_back(U.get());
4548  else
4549  Indices.push_back(getOrCreateVectorValue(U.get(), Part));
4550  }
4551 
4552  // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4553  // but it should be a vector, otherwise.
4554  auto *NewGEP = GEP->isInBounds()
4555  ? Builder.CreateInBoundsGEP(Ptr, Indices)
4556  : Builder.CreateGEP(Ptr, Indices);
4557  assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4558  "NewGEP is not a pointer vector");
4559  VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
4560  addMetadata(NewGEP, GEP);
4561  }
4562  }
4563 
4564  break;
4565  }
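// Editor's note (illustrative, assuming a loop-invariant base pointer %p and a
// loop-varying index %i): the code above keeps %p scalar and widens only %i,
// so the new GEP yields a vector of pointers when VF > 1 while staying a
// compact scalar GEP in the unroll-only (VF == 1) case.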
4566  case Instruction::UDiv:
4567  case Instruction::SDiv:
4568  case Instruction::SRem:
4569  case Instruction::URem:
4570  case Instruction::Add:
4571  case Instruction::FAdd:
4572  case Instruction::Sub:
4573  case Instruction::FSub:
4574  case Instruction::Mul:
4575  case Instruction::FMul:
4576  case Instruction::FDiv:
4577  case Instruction::FRem:
4578  case Instruction::Shl:
4579  case Instruction::LShr:
4580  case Instruction::AShr:
4581  case Instruction::And:
4582  case Instruction::Or:
4583  case Instruction::Xor: {
4584  // Just widen binops.
4585  auto *BinOp = cast<BinaryOperator>(&I);
4586  setDebugLocFromInst(Builder, BinOp);
4587 
4588  for (unsigned Part = 0; Part < UF; ++Part) {
4589  Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part);
4590  Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part);
4591  Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
4592 
4593  if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
4594  VecOp->copyIRFlags(BinOp);
4595 
4596  // Use this vector value for all users of the original instruction.
4597  VectorLoopValueMap.setVectorValue(&I, Part, V);
4598  addMetadata(V, BinOp);
4599  }
4600 
4601  break;
4602  }
4603  case Instruction::Select: {
4604  // Widen selects.
4605  // If the selector is loop invariant we can create a select
4606  // instruction with a scalar condition. Otherwise, use vector-select.
4607  auto *SE = PSE.getSE();
4608  bool InvariantCond =
4609  SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4610  setDebugLocFromInst(Builder, &I);
4611 
4612  // The condition can be loop invariant but still defined inside the
4613  // loop. This means that we can't just use the original 'cond' value.
4614  // We have to take the 'vectorized' value and pick the first lane.
4615  // Instcombine will make this a no-op.
4616 
4617  auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4618 
4619  for (unsigned Part = 0; Part < UF; ++Part) {
4620  Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4621  Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4622  Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4623  Value *Sel =
4624  Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4625  VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4626  addMetadata(Sel, &I);
4627  }
4628 
4629  break;
4630  }
4631 
4632  case Instruction::ICmp:
4633  case Instruction::FCmp: {
4634  // Widen compares. Generate vector compares.
4635  bool FCmp = (I.getOpcode() == Instruction::FCmp);
4636  auto *Cmp = dyn_cast<CmpInst>(&I);
4637  setDebugLocFromInst(Builder, Cmp);
4638  for (unsigned Part = 0; Part < UF; ++Part) {
4639  Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4640  Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4641  Value *C = nullptr;
4642  if (FCmp) {
4643  // Propagate fast math flags.
4644  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4645  Builder.setFastMathFlags(Cmp->getFastMathFlags());
4646  C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4647  } else {
4648  C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4649  }
4650  VectorLoopValueMap.setVectorValue(&I, Part, C);
4651  addMetadata(C, &I);
4652  }
4653 
4654  break;
4655  }
4656 
4657  case Instruction::ZExt:
4658  case Instruction::SExt:
4659  case Instruction::FPToUI:
4660  case Instruction::FPToSI:
4661  case Instruction::FPExt:
4662  case Instruction::PtrToInt:
4663  case Instruction::IntToPtr:
4664  case Instruction::SIToFP:
4665  case Instruction::UIToFP:
4666  case Instruction::Trunc:
4667  case Instruction::FPTrunc:
4668  case Instruction::BitCast: {
4669  auto *CI = dyn_cast<CastInst>(&I);
4670  setDebugLocFromInst(Builder, CI);
4671 
4672  /// Vectorize casts.
4673  Type *DestTy =
4674  (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4675 
4676  for (unsigned Part = 0; Part < UF; ++Part) {
4677  Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4678  Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4679  VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4680  addMetadata(Cast, &I);
4681  }
4682  break;
4683  }
4684 
4685  case Instruction::Call: {
4686  // Ignore dbg intrinsics.
4687  if (isa<DbgInfoIntrinsic>(I))
4688  break;
4689  setDebugLocFromInst(Builder, &I);
4690 
4691  Module *M = I.getParent()->getParent()->getParent();
4692  auto *CI = cast<CallInst>(&I);
4693 
4694  StringRef FnName = CI->getCalledFunction()->getName();
4695  Function *F = CI->getCalledFunction();
4696  Type *RetTy = ToVectorTy(CI->getType(), VF);
4697  SmallVector<Type *, 4> Tys;
4698  for (Value *ArgOperand : CI->arg_operands())
4699  Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4700 
4701  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4702 
4703  // The flag indicates whether we use an intrinsic or an ordinary call for
4704  // the vectorized version of the instruction, i.e. whether the intrinsic
4705  // call is more profitable than the library call.
4706  bool NeedToScalarize;
4707  unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
4708  bool UseVectorIntrinsic =
4709  ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
4710  assert((UseVectorIntrinsic || !NeedToScalarize) &&
4711  "Instruction should be scalarized elsewhere.");
4712 
4713  for (unsigned Part = 0; Part < UF; ++Part) {
4714  SmallVector<Value *, 4> Args;
4715  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4716  Value *Arg = CI->getArgOperand(i);
4717  // Some intrinsics have a scalar argument - don't replace it with a
4718  // vector.
4719  if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4720  Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4721  Args.push_back(Arg);
4722  }
4723 
4724  Function *VectorF;
4725  if (UseVectorIntrinsic) {
4726  // Use vector version of the intrinsic.
4727  Type *TysForDecl[] = {CI->getType()};
4728  if (VF > 1)
4729  TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4730  VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4731  } else {
4732  // Use vector version of the library call.
4733  StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4734  assert(!VFnName.empty() && "Vector function name is empty.");
4735  VectorF = M->getFunction(VFnName);
4736  if (!VectorF) {
4737  // Generate a declaration
4738  FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4739  VectorF =
4740  Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4741  VectorF->copyAttributesFrom(F);
4742  }
4743  }
4744  assert(VectorF && "Can't create vector function.");
4745 
4746  SmallVector<OperandBundleDef, 1> OpBundles;
4747  CI->getOperandBundlesAsDefs(OpBundles);
4748  CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4749 
4750  if (isa<FPMathOperator>(V))
4751  V->copyFastMathFlags(CI);
4752 
4753  VectorLoopValueMap.setVectorValue(&I, Part, V);
4754  addMetadata(V, &I);
4755  }
4756 
4757  break;
4758  }
4759 
4760  default:
4761  // This instruction is not vectorized by simple widening.
4762  DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4763  llvm_unreachable("Unhandled instruction!");
4764  } // end of switch.
4765 }
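// Editor's note (illustrative): for a scalar "add i32 %a, %b" with UF = 2, the
// binary-operator case above emits two vector adds, one per unroll part, each
// operating on the Part'th vector value of %a and %b and inheriting the IR
// flags (e.g. nsw/nuw) of the original instruction via copyIRFlags.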
4766 
4767 void InnerLoopVectorizer::updateAnalysis() {
4768  // Forget the original loop.
4769  PSE.getSE()->forgetLoop(OrigLoop);
4770 
4771  // Update the dominator tree information.
4772  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4773  "Entry does not dominate exit.");
4774 
4775  DT->addNewBlock(LoopMiddleBlock,
4776  LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4777  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4778  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4779  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4780  DEBUG(DT->verifyDomTree());
4781 }
4782 
4783 /// \brief Check whether it is safe to if-convert this phi node.
4784 ///
4785 /// Phi nodes with constant expressions that can trap are not safe to if
4786 /// convert.
4787 static bool canIfConvertPHINodes(BasicBlock *BB) {
4788  for (PHINode &Phi : BB->phis()) {
4789  for (Value *V : Phi.incoming_values())
4790  if (auto *C = dyn_cast<Constant>(V))
4791  if (C->canTrap())
4792  return false;
4793  }
4794  return true;
4795 }
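// Editor's note (illustrative): a PHI incoming value that is a trapping
// constant expression (e.g. a constant division whose divisor is not known to
// be non-zero) would be speculated by if-conversion, so the check above
// reports such blocks as not if-convertible.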
4796 
4797 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
4798  if (!EnableIfConversion) {
4799  ORE->emit(createMissedAnalysis("IfConversionDisabled")
4800  << "if-conversion is disabled");
4801  return false;
4802  }
4803 
4804  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
4805 
4806  // A list of pointers that we can safely read and write to.
4807  SmallPtrSet<Value *, 8> SafePointes;
4808 
4809  // Collect safe addresses.
4810  for (BasicBlock *BB : TheLoop->blocks()) {
4811  if (blockNeedsPredication(BB))
4812  continue;
4813 
4814  for (Instruction &I : *BB)
4815  if (auto *Ptr = getPointerOperand(&I))
4816  SafePointes.insert(Ptr);
4817  }
4818 
4819  // Collect the blocks that need predication.
4820  BasicBlock *Header = TheLoop->getHeader();
4821  for (BasicBlock *BB : TheLoop->blocks()) {
4822  // We don't support switch statements inside loops.
4823  if (!isa<BranchInst>(BB->getTerminator())) {
4824  ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator())
4825  << "loop contains a switch statement");
4826  return false;
4827  }
4828 
4829  // We must be able to predicate all blocks that need to be predicated.
4830  if (blockNeedsPredication(BB)) {
4831  if (!blockCanBePredicated(BB, SafePointes)) {
4832  ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
4833  << "control flow cannot be substituted for a select");
4834  return false;
4835  }
4836  } else if (BB != Header && !canIfConvertPHINodes(BB)) {
4837  ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
4838  << "control flow cannot be substituted for a select");
4839  return false;
4840  }
4841  }
4842 
4843  // We can if-convert this loop.
4844  return true;
4845 }
4846 
4847 bool LoopVectorizationLegality::canVectorize() {
4848  // Store the result and return it at the end instead of exiting early, in case
4849  // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
4850  bool Result = true;
4851 
4852  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
4853  // We must have a loop in canonical form. Loops with indirectbr in them cannot
4854  // be canonicalized.
4855  if (!TheLoop->getLoopPreheader()) {
4856  DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n");
4857  ORE->emit(createMissedAnalysis("CFGNotUnderstood")
4858  << "loop control flow is not understood by vectorizer");
4859  if (DoExtraAnalysis)
4860  Result = false;
4861  else
4862  return false;
4863  }
4864 
4865  // FIXME: The code is currently dead, since any loop that gets sent to
4866  // LoopVectorizationLegality is already an innermost loop.
4867  //
4868  // We can only vectorize innermost loops.
4869  if (!TheLoop->empty()) {
4870  ORE->emit(createMissedAnalysis("NotInnermostLoop")
4871  << "loop is not the innermost loop");
4872  if (DoExtraAnalysis)
4873  Result = false;
4874  else
4875  return false;
4876  }
4877 
4878  // We must have a single backedge.
4879  if (TheLoop->getNumBackEdges() != 1) {
4880  ORE->emit(createMissedAnalysis("CFGNotUnderstood")
4881  << "loop control flow is not understood by vectorizer");
4882  if (DoExtraAnalysis)
4883  Result = false;
4884  else
4885  return false;
4886  }
4887 
4888  // We must have a single exiting block.
4889  if (!TheLoop->getExitingBlock()) {
4890  ORE->emit(createMissedAnalysis("CFGNotUnderstood")
4891  << "loop control flow is not understood by vectorizer");
4892  if (DoExtraAnalysis)
4893  Result = false;
4894  else
4895  return false;
4896  }
4897 
4898  // We only handle bottom-tested loops, i.e. loops in which the condition is
4899  // checked at the end of each iteration. With that we can assume that all
4900  // instructions in the loop are executed the same number of times.
4901  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4902  ORE->emit(createMissedAnalysis("CFGNotUnderstood")
4903  << "loop control flow is not understood by vectorizer");
4904  if (DoExtraAnalysis)
4905  Result = false;
4906  else
4907  return false;
4908  }
4909 
4910  // We need to have a loop header.
4911  DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
4912  << '\n');
4913 
4914  // Check if we can if-convert non-single-bb loops.
4915  unsigned NumBlocks = TheLoop->getNumBlocks();
4916  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
4917  DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
4918  if (DoExtraAnalysis)
4919  Result = false;
4920  else
4921  return false;
4922  }
4923 
4924  // Check if we can vectorize the instructions and CFG in this loop.
4925  if (!canVectorizeInstrs()) {
4926  DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
4927  if (DoExtraAnalysis)
4928  Result = false;
4929  else
4930  return false;
4931  }
4932 
4933  // Go over each instruction and look at memory deps.
4934  if (!canVectorizeMemory()) {
4935  DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
4936  if (DoExtraAnalysis)
4937  Result = false;
4938  else
4939  return false;
4940  }
4941 
4942  DEBUG(dbgs() << "LV: We can vectorize this loop"
4943  << (LAI->getRuntimePointerChecking()->Need
4944  ? " (with a runtime bound check)"
4945  : "")
4946  << "!\n");
4947 
4948  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
4949 
4950  // If an override option has been passed in for interleaved accesses, use it.
4951  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
4952  UseInterleaved = EnableInterleavedMemAccesses;
4953 
4954  // Analyze interleaved memory accesses.
4955  if (UseInterleaved)
4956  InterleaveInfo.analyzeInterleaving(*getSymbolicStrides());
4957 
4958  unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
4959  if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
4960  SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
4961 
4962  if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
4963  ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks")
4964  << "Too many SCEV assumptions need to be made and checked "
4965  << "at runtime");
4966  DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
4967  if (DoExtraAnalysis)
4968  Result = false;
4969  else
4970  return false;
4971  }
4972 
4973  // Okay! We've done all the tests. If any have failed, return false. Otherwise
4974  // we can vectorize, and at this point we don't have any other mem analysis
4975  // which may limit our maximum vectorization factor, so just return true with
4976  // no restrictions.
4977  return Result;
4978 }
4979 
4980 static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
4981  if (Ty->isPointerTy())
4982  return DL.getIntPtrType(Ty);
4983 
4984  // It is possible that chars or shorts overflow when we ask for the loop's
4985  // trip count; work around this by widening the type.
4986  if (Ty->getScalarSizeInBits() < 32)
4987  return Type::getInt32Ty(Ty->getContext());
4988 
4989  return Ty;
4990 }
4991 
4992 static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
4993  Ty0 = convertPointerToIntegerType(DL, Ty0);
4994  Ty1 = convertPointerToIntegerType(DL, Ty1);
4995  if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
4996  return Ty0;
4997  return Ty1;
4998 }
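// Editor's note (illustrative, assuming a 64-bit DataLayout):
// convertPointerToIntegerType maps an i8* to i64 and widens sub-32-bit
// integers to i32, so getWiderType(DL, i8*, i16) would return i64.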
4999 
5000 /// \brief Check that the instruction has outside loop users and is not an
5001 /// identified reduction variable.
5002 static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
5003  SmallPtrSetImpl<Value *> &AllowedExit) {
5004  // Reduction and Induction instructions are allowed to have exit users. All
5005  // other instructions must not have external users.
5006  if (!AllowedExit.count(Inst))
5007  // Check that all of the users of the instruction are inside the loop.
5008  for (User *U : Inst->users()) {
5009  Instruction *UI = cast<Instruction>(U);
5010  // This user may be a reduction exit value.
5011  if (!TheLoop->contains(UI)) {
5012  DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
5013  return true;
5014  }
5015  }
5016  return false;
5017 }
5018 
5019 void LoopVectorizationLegality::addInductionPhi(
5020  PHINode *Phi, const InductionDescriptor &ID,
5021  SmallPtrSetImpl<Value *> &AllowedExit) {
5022  Inductions[Phi] = ID;
5023 
5024  // In case this induction also comes with casts that we know we can ignore
5025  // in the vectorized loop body, record them here. All casts could be recorded
5026  // here for ignoring, but it suffices to record only the first (as it is the
5027  // only one that may be used outside the cast sequence).
5028  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
5029  if (!Casts.empty())
5030  InductionCastsToIgnore.insert(*Casts.begin());
5031 
5032  Type *PhiTy = Phi->getType();