1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
11 // and generates target-independent LLVM-IR.
12 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
13 // of instructions in order to estimate the profitability of vectorization.
14 //
15 // The loop vectorizer combines consecutive loop iterations into a single
16 // 'wide' iteration. After this transformation the index is incremented
17 // by the SIMD vector width, and not by one.
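//
// For example (an illustrative sketch, with VF = 4), the scalar loop
//
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + 42;
//
// is conceptually executed as
//
//   for (i = 0; i < n; i += 4)
//     a[i:i+3] = b[i:i+3] + <42, 42, 42, 42>;
//
// with a scalar epilogue loop handling any remaining iterations.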
18 //
19 // This pass has four parts:
20 // 1. The main loop pass that drives the different parts.
21 // 2. LoopVectorizationLegality - A unit that checks for the legality
22 // of the vectorization.
23 // 3. InnerLoopVectorizer - A unit that performs the actual
24 // widening of instructions.
25 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
26 // of vectorization. It decides on the optimal vector width, which
27 // can be one, if vectorization is not profitable.
28 //
29 // There is a development effort going on to migrate the loop vectorizer to the
30 // VPlan infrastructure and to introduce outer loop vectorization support (see
31 // docs/Proposal/VectorizationPlan.rst and
32 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
33 // purpose, we temporarily introduced the VPlan-native vectorization path: an
34 // alternative vectorization path that is natively implemented on top of the
35 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
36 //
37 //===----------------------------------------------------------------------===//
38 //
39 // The reduction-variable vectorization is based on the paper:
40 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
41 //
42 // Variable uniformity checks are inspired by:
43 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
44 //
45 // The interleaved access vectorization is based on the paper:
46 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
47 // Data for SIMD
48 //
49 // Other ideas/concepts are from:
50 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
51 //
52 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
53 // Vectorizing Compilers.
54 //
55 //===----------------------------------------------------------------------===//
56 
59 #include "VPRecipeBuilder.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanHCFGTransforms.h"
62 #include "llvm/ADT/APInt.h"
63 #include "llvm/ADT/ArrayRef.h"
64 #include "llvm/ADT/DenseMap.h"
65 #include "llvm/ADT/DenseMapInfo.h"
66 #include "llvm/ADT/Hashing.h"
67 #include "llvm/ADT/MapVector.h"
68 #include "llvm/ADT/None.h"
69 #include "llvm/ADT/Optional.h"
70 #include "llvm/ADT/STLExtras.h"
71 #include "llvm/ADT/SetVector.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallVector.h"
74 #include "llvm/ADT/Statistic.h"
75 #include "llvm/ADT/StringRef.h"
76 #include "llvm/ADT/Twine.h"
81 #include "llvm/Analysis/CFG.h"
87 #include "llvm/Analysis/LoopInfo.h"
96 #include "llvm/IR/Attributes.h"
97 #include "llvm/IR/BasicBlock.h"
98 #include "llvm/IR/CFG.h"
99 #include "llvm/IR/Constant.h"
100 #include "llvm/IR/Constants.h"
101 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugLoc.h"
104 #include "llvm/IR/DerivedTypes.h"
105 #include "llvm/IR/DiagnosticInfo.h"
106 #include "llvm/IR/Dominators.h"
107 #include "llvm/IR/Function.h"
108 #include "llvm/IR/IRBuilder.h"
109 #include "llvm/IR/InstrTypes.h"
110 #include "llvm/IR/Instruction.h"
111 #include "llvm/IR/Instructions.h"
112 #include "llvm/IR/IntrinsicInst.h"
113 #include "llvm/IR/Intrinsics.h"
114 #include "llvm/IR/LLVMContext.h"
115 #include "llvm/IR/Metadata.h"
116 #include "llvm/IR/Module.h"
117 #include "llvm/IR/Operator.h"
118 #include "llvm/IR/Type.h"
119 #include "llvm/IR/Use.h"
120 #include "llvm/IR/User.h"
121 #include "llvm/IR/Value.h"
122 #include "llvm/IR/ValueHandle.h"
123 #include "llvm/IR/Verifier.h"
124 #include "llvm/Pass.h"
125 #include "llvm/Support/Casting.h"
127 #include "llvm/Support/Compiler.h"
128 #include "llvm/Support/Debug.h"
130 #include "llvm/Support/MathExtras.h"
137 #include <algorithm>
138 #include <cassert>
139 #include <cstdint>
140 #include <cstdlib>
141 #include <functional>
142 #include <iterator>
143 #include <limits>
144 #include <memory>
145 #include <string>
146 #include <tuple>
147 #include <utility>
148 #include <vector>
149 
150 using namespace llvm;
151 
152 #define LV_NAME "loop-vectorize"
153 #define DEBUG_TYPE LV_NAME
154 
155 STATISTIC(LoopsVectorized, "Number of loops vectorized");
156 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
157 
158 /// Loops with a known constant trip count below this number are vectorized only
159 /// if no scalar iteration overheads are incurred.
161  "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
162  cl::desc("Loops with a constant trip count that is smaller than this "
163  "value are vectorized only if no scalar iteration overheads "
164  "are incurred."));
165 
167  "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
168  cl::desc("Maximize bandwidth when selecting vectorization factor which "
169  "will be determined by the smallest type in loop."));
170 
172  "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
173  cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
174 
175 /// An interleave-group may need masking if it resides in a block that needs
176 /// predication, or in order to mask away gaps.
178  "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
179  cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
180 
181 /// We don't interleave loops with a known constant trip count below this
182 /// number.
183 static const unsigned TinyTripCountInterleaveThreshold = 128;
184 
186  "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
187  cl::desc("A flag that overrides the target's number of scalar registers."));
188 
190  "force-target-num-vector-regs", cl::init(0), cl::Hidden,
191  cl::desc("A flag that overrides the target's number of vector registers."));
192 
194  "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
195  cl::desc("A flag that overrides the target's max interleave factor for "
196  "scalar loops."));
197 
199  "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
200  cl::desc("A flag that overrides the target's max interleave factor for "
201  "vectorized loops."));
202 
204  "force-target-instruction-cost", cl::init(0), cl::Hidden,
205  cl::desc("A flag that overrides the target's expected cost for "
206  "an instruction to a single constant value. Mostly "
207  "useful for getting consistent testing."));
208 
210  "small-loop-cost", cl::init(20), cl::Hidden,
211  cl::desc(
212  "The cost of a loop that is considered 'small' by the interleaver."));
213 
215  "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
216  cl::desc("Enable the use of the block frequency analysis to access PGO "
217  "heuristics minimizing code growth in cold regions and being more "
218  "aggressive in hot regions."));
219 
220 // Runtime interleave loops for load/store throughput.
222  "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
223  cl::desc(
224  "Enable runtime interleaving until load/store ports are saturated"));
225 
226 /// The number of stores in a loop that are allowed to need predication.
228  "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
229  cl::desc("Max number of stores to be predicated behind an if."));
230 
232  "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
233  cl::desc("Count the induction variable only once when interleaving"));
234 
236  "enable-cond-stores-vec", cl::init(true), cl::Hidden,
237  cl::desc("Enable if predication of stores during vectorization."));
238 
240  "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
241  cl::desc("The maximum interleave count to use when interleaving a scalar "
242  "reduction in a nested loop."));
243 
245  "enable-vplan-native-path", cl::init(false), cl::Hidden,
246  cl::desc("Enable VPlan-native vectorization path with "
247  "support for outer loop vectorization."));
248 
249 // This flag enables the stress testing of the VPlan H-CFG construction in the
250 // VPlan-native vectorization path. It must be used in conjunction with
251 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
252 // verification of the H-CFGs built.
254  "vplan-build-stress-test", cl::init(false), cl::Hidden,
255  cl::desc(
256  "Build VPlan for every supported loop nest in the function and bail "
257  "out right after the build (stress test the VPlan H-CFG construction "
258  "in the VPlan-native vectorization path)."));
259 
260 /// A helper function for converting Scalar types to vector types.
261 /// If the incoming type is void, we return void. If the VF is 1, we return
262 /// the scalar type.
263 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
264  if (Scalar->isVoidTy() || VF == 1)
265  return Scalar;
266  return VectorType::get(Scalar, VF);
267 }
268 
269 /// A helper function that returns the type of a loaded or stored value.
271  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
272  "Expected Load or Store instruction");
273  if (auto *LI = dyn_cast<LoadInst>(I))
274  return LI->getType();
275  return cast<StoreInst>(I)->getValueOperand()->getType();
276 }
277 
278 /// A helper function that returns true if the given type is irregular. The
279 /// type is irregular if its allocated size doesn't equal the store size of an
280 /// element of the corresponding vector type at the given vectorization factor.
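/// For example (an illustrative case, assuming a typical data layout): i24 has
/// a store size of 3 bytes but an alloc size of 4 bytes, so for VF == 4 an
/// array of four i24 values occupies 16 bytes while <4 x i24> stores only 12
/// bytes; such a type is irregular and its scalar accesses cannot simply be
/// widened into a single vector access.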
281 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
282  // Determine if an array of VF elements of type Ty is "bitcast compatible"
283  // with a <VF x Ty> vector.
284  if (VF > 1) {
285  auto *VectorTy = VectorType::get(Ty, VF);
286  return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
287  }
288 
289  // If the vectorization factor is one, we just check if an array of type Ty
290  // requires padding between elements.
291  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
292 }
293 
294 /// A helper function that returns the reciprocal of the block probability of
295 /// predicated blocks. If we return X, we are assuming the predicated block
296 /// will execute once for every X iterations of the loop header.
297 ///
298 /// TODO: We should use actual block probability here, if available. Currently,
299 /// we always assume predicated blocks have a 50% chance of executing.
300 static unsigned getReciprocalPredBlockProb() { return 2; }
301 
302 /// A helper function that adds a 'fast' flag to floating-point operations.
304  if (isa<FPMathOperator>(V)) {
305  FastMathFlags Flags;
306  Flags.setFast();
307  cast<Instruction>(V)->setFastMathFlags(Flags);
308  }
309  return V;
310 }
311 
312 /// A helper function that returns an integer or floating-point constant with
313 /// value C.
314 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
315  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
316  : ConstantFP::get(Ty, C);
317 }
318 
319 namespace llvm {
320 
321 /// InnerLoopVectorizer vectorizes loops which contain only one basic
322 /// block to a specified vectorization factor (VF).
323 /// This class performs the widening of scalars into vectors, or multiple
324 /// scalars. This class also implements the following features:
325 /// * It inserts an epilogue loop for handling loops that don't have iteration
326 /// counts that are known to be a multiple of the vectorization factor.
327 /// * It handles the code generation for reduction variables.
328 /// * Scalarization (implementation using scalars) of un-vectorizable
329 /// instructions.
330 /// InnerLoopVectorizer does not perform any vectorization-legality
331 /// checks, and relies on the caller to check for the different legality
332 /// aspects. The InnerLoopVectorizer relies on the
333 /// LoopVectorizationLegality class to provide information about the induction
334 /// and reduction variables that were found for a given vectorization factor.
336 public:
339  const TargetLibraryInfo *TLI,
341  OptimizationRemarkEmitter *ORE, unsigned VecWidth,
342  unsigned UnrollFactor, LoopVectorizationLegality *LVL,
344  : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
345  AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
346  Builder(PSE.getSE()->getContext()),
347  VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
348  virtual ~InnerLoopVectorizer() = default;
349 
350  /// Create a new empty loop. Unlink the old loop and connect the new one.
351  /// Return the pre-header block of the new loop.
353 
354  /// Widen a single instruction within the innermost loop.
356 
357  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
358  void fixVectorizedLoop();
359 
360  // Return true if any runtime check is added.
362 
363  /// A type for vectorized values in the new loop. Each value from the
364  /// original loop, when vectorized, is represented by UF vector values in the
365  /// new unrolled loop, where UF is the unroll factor.
367 
368  /// Vectorize a single PHINode in a block. This method handles the induction
369  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
370  /// arbitrary length vectors.
371  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
372 
373  /// A helper function to scalarize a single Instruction in the innermost loop.
374  /// Generates a single scalar instance of the instruction for the vector lane
375  /// and unroll part given by \p Instance, predicating it if
376  /// \p IfPredicateInstr is set.
377  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
378  bool IfPredicateInstr);
379 
380  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
381  /// is provided, the integer induction variable will first be truncated to
382  /// the corresponding type.
383  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
384 
385  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
386  /// vector or scalar value on-demand if one is not yet available. When
387  /// vectorizing a loop, we visit the definition of an instruction before its
388  /// uses. When visiting the definition, we either vectorize or scalarize the
389  /// instruction, creating an entry for it in the corresponding map. (In some
390  /// cases, such as induction variables, we will create both vector and scalar
391  /// entries.) Then, as we encounter uses of the definition, we derive values
392  /// for each scalar or vector use unless such a value is already available.
393  /// For example, if we scalarize a definition and one of its uses is vector,
394  /// we build the required vector on-demand with an insertelement sequence
395  /// when visiting the use. Otherwise, if the use is scalar, we can use the
396  /// existing scalar definition.
397  ///
398  /// Return a value in the new loop corresponding to \p V from the original
399  /// loop at unroll index \p Part. If the value has already been vectorized,
400  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
401  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
402  /// a new vector value on-demand by inserting the scalar values into a vector
403  /// with an insertelement sequence. If the value has been neither vectorized
404  /// nor scalarized, it must be loop invariant, so we simply broadcast the
405  /// value into a vector.
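  ///
  /// For example (illustrative IR only, assuming VF == 4 and i32 scalars),
  /// packing four scalar parts %s0..%s3 uses an insertelement sequence such as:
  ///   %p0 = insertelement <4 x i32> undef, i32 %s0, i32 0
  ///   %p1 = insertelement <4 x i32> %p0, i32 %s1, i32 1
  ///   %p2 = insertelement <4 x i32> %p1, i32 %s2, i32 2
  ///   %p3 = insertelement <4 x i32> %p2, i32 %s3, i32 3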
406  Value *getOrCreateVectorValue(Value *V, unsigned Part);
407 
408  /// Return a value in the new loop corresponding to \p V from the original
409  /// loop at unroll and vector indices \p Instance. If the value has been
410  /// vectorized but not scalarized, the necessary extractelement instruction
411  /// will be generated.
412  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
413 
414  /// Construct the vector value of a scalarized value \p V one lane at a time.
415  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
416 
417  /// Try to vectorize the interleaved access group that \p Instr belongs to,
418  /// optionally masking the vector operations if \p BlockInMask is non-null.
420  VectorParts *BlockInMask = nullptr);
421 
422  /// Vectorize Load and Store instructions, optionally masking the vector
423  /// operations if \p BlockInMask is non-null.
425  VectorParts *BlockInMask = nullptr);
426 
427  /// Set the debug location in the builder using the debug location in
428  /// the instruction.
429  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
430 
431  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
432  void fixNonInductionPHIs(void);
433 
434 protected:
436 
437  /// A small list of PHINodes.
439 
440  /// A type for scalarized values in the new loop. Each value from the
441  /// original loop, when scalarized, is represented by UF x VF scalar values
442  /// in the new unrolled loop, where UF is the unroll factor and VF is the
443  /// vectorization factor.
445 
446  /// Set up the values of the IVs correctly when exiting the vector loop.
447  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
448  Value *CountRoundDown, Value *EndValue,
449  BasicBlock *MiddleBlock);
450 
451  /// Create a new induction variable inside L.
452  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
453  Value *Step, Instruction *DL);
454 
455  /// Handle all cross-iteration phis in the header.
456  void fixCrossIterationPHIs();
457 
458  /// Fix a first-order recurrence. This is the second phase of vectorizing
459  /// this phi node.
460  void fixFirstOrderRecurrence(PHINode *Phi);
461 
462  /// Fix a reduction cross-iteration phi. This is the second phase of
463  /// vectorizing this phi node.
464  void fixReduction(PHINode *Phi);
465 
466  /// The Loop exit block may have single value PHI nodes with some
467  /// incoming value. While vectorizing we only handled real values
468  /// that were defined inside the loop, and we should have one value for
469  /// each predecessor of its parent basic block. See PR14725.
470  void fixLCSSAPHIs();
471 
472  /// Iteratively sink the scalarized operands of a predicated instruction into
473  /// the block that was created for it.
474  void sinkScalarOperands(Instruction *PredInst);
475 
476  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
477  /// represented as.
479 
480  /// Insert the new loop into the loop hierarchy and pass manager
481  /// and update the analysis passes.
482  void updateAnalysis();
483 
484  /// Create a broadcast instruction. This method generates a broadcast
485  /// instruction (shuffle) for loop invariant values and for the induction
486  /// value. If this is the induction variable then we extend it to N, N+1, ...;
487  /// this is needed because each iteration in the loop corresponds to a SIMD
488  /// element.
489  virtual Value *getBroadcastInstrs(Value *V);
490 
491  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
492  /// to each vector element of Val. The sequence starts at StartIdx.
493  /// \p Opcode is relevant for FP induction variables.
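  /// For example (an illustrative sketch), with VF == 4, StartIdx == 0 and
  /// Step == 1, a splat of the scalar value %iv becomes
  /// <%iv + 0, %iv + 1, %iv + 2, %iv + 3>.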
494  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
495  Instruction::BinaryOps Opcode =
496  Instruction::BinaryOpsEnd);
497 
498  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
499  /// variable on which to base the steps, \p Step is the size of the step, and
500  /// \p EntryVal is the value from the original loop that maps to the steps.
501  /// Note that \p EntryVal doesn't have to be an induction variable - it
502  /// can also be a truncate instruction.
503  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
504  const InductionDescriptor &ID);
505 
506  /// Create a vector induction phi node based on an existing scalar one. \p
507  /// EntryVal is the value from the original loop that maps to the vector phi
508  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
509  /// truncate instruction, instead of widening the original IV, we widen a
510  /// version of the IV truncated to \p EntryVal's type.
512  Value *Step, Instruction *EntryVal);
513 
514  /// Returns true if an instruction \p I should be scalarized instead of
515  /// vectorized for the chosen vectorization factor.
517 
518  /// Returns true if we should generate a scalar version of \p IV.
519  bool needsScalarInduction(Instruction *IV) const;
520 
521  /// If there is a cast involved in the induction variable \p ID, which should
522  /// be ignored in the vectorized loop body, this function records the
523  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
524  /// cast. We had already proved that the casted Phi is equal to the uncasted
525  /// Phi in the vectorized loop (under a runtime guard), and therefore
526  /// there is no need to vectorize the cast - the same value can be used in the
527  /// vector loop for both the Phi and the cast.
528  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
529  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
530  ///
531  /// \p EntryVal is the value from the original loop that maps to the vector
532  /// phi node and is used to distinguish what is the IV currently being
533  /// processed - original one (if \p EntryVal is a phi corresponding to the
534  /// original IV) or the "newly-created" one based on the proof mentioned above
535  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
536  /// latter case \p EntryVal is a TruncInst and we must not record anything for
537  /// that IV, but it's error-prone to expect callers of this routine to care
538  /// about that, hence this explicit parameter.
540  const Instruction *EntryVal,
541  Value *VectorLoopValue,
542  unsigned Part,
543  unsigned Lane = UINT_MAX);
544 
545  /// Generate a shuffle sequence that will reverse the vector Vec.
546  virtual Value *reverseVector(Value *Vec);
547 
548  /// Returns (and creates if needed) the original loop trip count.
549  Value *getOrCreateTripCount(Loop *NewLoop);
550 
551  /// Returns (and creates if needed) the trip count of the widened loop.
553 
554  /// Returns a bitcasted value to the requested vector type.
555  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
557  const DataLayout &DL);
558 
559  /// Emit a bypass check to see if the vector trip count is zero, including if
560  /// it overflows.
562 
563  /// Emit a bypass check to see if all of the SCEV assumptions we've
564  /// had to make are correct.
565  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
566 
567  /// Emit bypass checks to check any memory assumptions we may have made.
568  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
569 
570  /// Compute the transformed value of Index at offset StartValue using step
571  /// StepValue.
572  /// For integer induction, returns StartValue + Index * StepValue.
573  /// For pointer induction, returns StartValue[Index * StepValue].
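  /// For example (an illustrative case): with StartValue == 10 and
  /// StepValue == 3, Index == 4 yields 22 for an integer induction, and the
  /// address of StartValue[12] for a pointer induction.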
574  /// FIXME: The newly created binary instructions should contain nsw/nuw
575  /// flags, which can be found from the original scalar operations.
577  const DataLayout &DL,
578  const InductionDescriptor &ID) const;
579 
580  /// Add additional metadata to \p To that was not present on \p Orig.
581  ///
582  /// Currently this is used to add the noalias annotations based on the
583  /// inserted memchecks. Use this for instructions that are *cloned* into the
584  /// vector loop.
585  void addNewMetadata(Instruction *To, const Instruction *Orig);
586 
587  /// Add metadata from one instruction to another.
588  ///
589  /// This includes both the original MDs from \p From and additional ones (\see
590  /// addNewMetadata). Use this for *newly created* instructions in the vector
591  /// loop.
593 
594  /// Similar to the previous function but it adds the metadata to a
595  /// vector of instructions.
597 
598  /// The original loop.
600 
601  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
602  /// dynamic knowledge to simplify SCEV expressions and converts them to a
603  /// more usable form.
605 
606  /// Loop Info.
608 
609  /// Dominator Tree.
611 
612  /// Alias Analysis.
614 
615  /// Target Library Info.
617 
618  /// Target Transform Info.
620 
621  /// Assumption Cache.
623 
624  /// Interface to emit optimization remarks.
626 
627  /// LoopVersioning. It's only set up (non-null) if memchecks were
628  /// used.
629  ///
630  /// This is currently only used to add no-alias metadata based on the
631  /// memchecks. The actual versioning is performed manually.
632  std::unique_ptr<LoopVersioning> LVer;
633 
634  /// The vectorization SIMD factor to use. Each vector will have this many
635  /// vector elements.
636  unsigned VF;
637 
638  /// The vectorization unroll factor to use. Each scalar is vectorized to this
639  /// many different vector instructions.
640  unsigned UF;
641 
642  /// The builder that we use
644 
645  // --- Vectorization state ---
646 
647  /// The vector-loop preheader.
649 
650  /// The scalar-loop preheader.
652 
653  /// Middle Block between the vector and the scalar.
655 
656  /// The ExitBlock of the scalar loop.
658 
659  /// The vector loop body.
661 
662  /// The scalar loop body.
664 
665  /// A list of all bypass blocks. The first block is the entry of the loop.
667 
668  /// The new Induction variable which was added to the new block.
669  PHINode *Induction = nullptr;
670 
671  /// The induction variable of the old basic block.
672  PHINode *OldInduction = nullptr;
673 
674  /// Maps values from the original loop to their corresponding values in the
675  /// vectorized loop. A key value can map to either vector values, scalar
676  /// values or both kinds of values, depending on whether the key was
677  /// vectorized and scalarized.
679 
680  /// Store instructions that were predicated.
682 
683  /// Trip count of the original loop.
684  Value *TripCount = nullptr;
685 
686  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
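  /// For example (illustrative values), with TripCount == 100, VF == 4 and
  /// UF == 2 this is 100 - 100 % 8 == 96.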
687  Value *VectorTripCount = nullptr;
688 
689  /// The legality analysis.
691 
692  /// The profitability analysis.
694 
695  // Record whether runtime checks are added.
696  bool AddedSafetyChecks = false;
697 
698  // Holds the end values for each induction variable. We save the end values
699  // so we can later fix up the external users of the induction variables.
701 
702  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
703  // fixed up at the end of vector code generation.
705 };
706 
708 public:
711  const TargetLibraryInfo *TLI,
713  OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
716  : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
717  UnrollFactor, LVL, CM) {}
718 
719 private:
720  Value *getBroadcastInstrs(Value *V) override;
721  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
722  Instruction::BinaryOps Opcode =
723  Instruction::BinaryOpsEnd) override;
724  Value *reverseVector(Value *Vec) override;
725 };
726 
727 } // end namespace llvm
728 
729 /// Look for a meaningful debug location on the instruction or its
730 /// operands.
732  if (!I)
733  return I;
734 
735  DebugLoc Empty;
736  if (I->getDebugLoc() != Empty)
737  return I;
738 
739  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
740  if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
741  if (OpInst->getDebugLoc() != Empty)
742  return OpInst;
743  }
744 
745  return I;
746 }
747 
749  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
750  const DILocation *DIL = Inst->getDebugLoc();
751  if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
752  !isa<DbgInfoIntrinsic>(Inst))
754  else
756  } else
758 }
759 
760 #ifndef NDEBUG
761 /// \return string containing a file name and a line # for the given loop.
762 static std::string getDebugLocString(const Loop *L) {
763  std::string Result;
764  if (L) {
765  raw_string_ostream OS(Result);
766  if (const DebugLoc LoopDbgLoc = L->getStartLoc())
767  LoopDbgLoc.print(OS);
768  else
769  // Just print the module name.
770  OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
771  OS.flush();
772  }
773  return Result;
774 }
775 #endif
776 
778  const Instruction *Orig) {
779  // If the loop was versioned with memchecks, add the corresponding no-alias
780  // metadata.
781  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
782  LVer->annotateInstWithNoAlias(To, Orig);
783 }
784 
786  Instruction *From) {
787  propagateMetadata(To, From);
788  addNewMetadata(To, From);
789 }
790 
792  Instruction *From) {
793  for (Value *V : To) {
794  if (Instruction *I = dyn_cast<Instruction>(V))
795  addMetadata(I, From);
796  }
797 }
798 
799 static void emitMissedWarning(Function *F, Loop *L,
800  const LoopVectorizeHints &LH,
802  LH.emitRemarkWithHints();
803 
805  if (LH.getWidth() != 1)
807  DEBUG_TYPE, "FailedRequestedVectorization",
808  L->getStartLoc(), L->getHeader())
809  << "loop not vectorized: "
810  << "failed explicitly specified loop vectorization");
811  else if (LH.getInterleave() != 1)
813  DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(),
814  L->getHeader())
815  << "loop not interleaved: "
816  << "failed explicitly specified loop interleaving");
817  }
818 }
819 
820 namespace llvm {
821 
822 /// LoopVectorizationCostModel - estimates the expected speedups due to
823 /// vectorization.
824 /// In many cases vectorization is not profitable. This can happen because of
825 /// a number of reasons. In this class we mainly attempt to predict the
826 /// expected speedup/slowdowns due to the supported instruction set. We use the
827 /// TargetTransformInfo to query the different backends for the cost of
828 /// different operations.
830 public:
833  const TargetTransformInfo &TTI,
834  const TargetLibraryInfo *TLI, DemandedBits *DB,
837  const LoopVectorizeHints *Hints,
839  : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
840  AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
841 
842  /// \return An upper bound for the vectorization factor, or None if
843  /// vectorization should be avoided up front.
844  Optional<unsigned> computeMaxVF(bool OptForSize);
845 
846  /// \return The most profitable vectorization factor and the cost of that VF.
847  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
848  /// then this vectorization factor will be selected if vectorization is
849  /// possible.
850  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
851 
852  /// Set up cost-based decisions for the user vectorization factor.
853  void selectUserVectorizationFactor(unsigned UserVF) {
854  collectUniformsAndScalars(UserVF);
855  collectInstsToScalarize(UserVF);
856  }
857 
858  /// \return The size (in bits) of the smallest and widest types in the code
859  /// that needs to be vectorized. We ignore values that remain scalar such as
860  /// 64 bit loop indices.
861  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
862 
863  /// \return The desired interleave count.
864  /// If interleave count has been specified by metadata it will be returned.
865  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
866  /// are the selected vectorization factor and the cost of the selected VF.
867  unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
868  unsigned LoopCost);
869 
870  /// A memory access instruction may be vectorized in more than one way. The
871  /// form of the instruction after vectorization depends on its cost. This
872  /// function takes cost-based decisions for Load/Store instructions and
873  /// collects them in a map. This decision map is used for building the lists
874  /// of loop-uniform and loop-scalar instructions. The calculated cost is
875  /// saved with the widening decision in order to avoid redundant
876  /// calculations.
877  void setCostBasedWideningDecision(unsigned VF);
878 
879  /// A struct that represents some properties of the register usage
880  /// of a loop.
881  struct RegisterUsage {
882  /// Holds the number of loop invariant values that are used in the loop.
884 
885  /// Holds the maximum number of concurrent live intervals in the loop.
886  unsigned MaxLocalUsers;
887  };
888 
889  /// \return Returns information about the register usages of the loop for the
890  /// given vectorization factors.
891  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
892 
893  /// Collect values we want to ignore in the cost model.
894  void collectValuesToIgnore();
895 
896  /// \returns The smallest bitwidth each instruction can be represented with.
897  /// The vector equivalents of these instructions should be truncated to this
898  /// type.
900  return MinBWs;
901  }
902 
903  /// \returns True if it is more profitable to scalarize instruction \p I for
904  /// vectorization factor \p VF.
905  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
906  assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
907 
908  // Cost model is not run in the VPlan-native path - return conservative
909  // result until this changes.
911  return false;
912 
913  auto Scalars = InstsToScalarize.find(VF);
914  assert(Scalars != InstsToScalarize.end() &&
915  "VF not yet analyzed for scalarization profitability");
916  return Scalars->second.find(I) != Scalars->second.end();
917  }
918 
919  /// Returns true if \p I is known to be uniform after vectorization.
920  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
921  if (VF == 1)
922  return true;
923 
924  // Cost model is not run in the VPlan-native path - return conservative
925  // result until this changes.
927  return false;
928 
929  auto UniformsPerVF = Uniforms.find(VF);
930  assert(UniformsPerVF != Uniforms.end() &&
931  "VF not yet analyzed for uniformity");
932  return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
933  }
934 
935  /// Returns true if \p I is known to be scalar after vectorization.
936  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
937  if (VF == 1)
938  return true;
939 
940  // Cost model is not run in the VPlan-native path - return conservative
941  // result until this changes.
943  return false;
944 
945  auto ScalarsPerVF = Scalars.find(VF);
946  assert(ScalarsPerVF != Scalars.end() &&
947  "Scalar values are not calculated for VF");
948  return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
949  }
950 
951  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
952  /// for vectorization factor \p VF.
953  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
954  return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
955  !isProfitableToScalarize(I, VF) &&
956  !isScalarAfterVectorization(I, VF);
957  }
958 
959  /// Decision that was taken during cost calculation for memory instruction.
962  CM_Widen, // For consecutive accesses with stride +1.
963  CM_Widen_Reverse, // For consecutive accesses with stride -1.
966  CM_Scalarize
967  };
968 
969  /// Save vectorization decision \p W and \p Cost taken by the cost model for
970  /// instruction \p I and vector width \p VF.
972  unsigned Cost) {
973  assert(VF >= 2 && "Expected VF >=2");
974  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
975  }
976 
977  /// Save vectorization decision \p W and \p Cost taken by the cost model for
978  /// interleaving group \p Grp and vector width \p VF.
979  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
980  InstWidening W, unsigned Cost) {
981  assert(VF >= 2 && "Expected VF >=2");
982  /// Broadcast this decision to all instructions inside the group.
983  /// But the cost will be assigned to one instruction only.
984  for (unsigned i = 0; i < Grp->getFactor(); ++i) {
985  if (auto *I = Grp->getMember(i)) {
986  if (Grp->getInsertPos() == I)
987  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
988  else
989  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
990  }
991  }
992  }
993 
994  /// Return the cost model decision for the given instruction \p I and vector
995  /// width \p VF. Return CM_Unknown if this instruction did not pass
996  /// through the cost modeling.
998  assert(VF >= 2 && "Expected VF >=2");
999 
1000  // Cost model is not run in the VPlan-native path - return conservative
1001  // result until this changes.
1003  return CM_GatherScatter;
1004 
1005  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1006  auto Itr = WideningDecisions.find(InstOnVF);
1007  if (Itr == WideningDecisions.end())
1008  return CM_Unknown;
1009  return Itr->second.first;
1010  }
1011 
1012  /// Return the vectorization cost for the given instruction \p I and vector
1013  /// width \p VF.
1014  unsigned getWideningCost(Instruction *I, unsigned VF) {
1015  assert(VF >= 2 && "Expected VF >=2");
1016  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1017  assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1018  "The cost is not calculated");
1019  return WideningDecisions[InstOnVF].second;
1020  }
1021 
1022  /// Return True if instruction \p I is an optimizable truncate whose operand
1023  /// is an induction variable. Such a truncate will be removed by adding a new
1024  /// induction variable with the destination type.
1025  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1026  // If the instruction is not a truncate, return false.
1027  auto *Trunc = dyn_cast<TruncInst>(I);
1028  if (!Trunc)
1029  return false;
1030 
1031  // Get the source and destination types of the truncate.
1032  Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1033  Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1034 
1035  // If the truncate is free for the given types, return false. Replacing a
1036  // free truncate with an induction variable would add an induction variable
1037  // update instruction to each iteration of the loop. We exclude from this
1038  // check the primary induction variable since it will need an update
1039  // instruction regardless.
1040  Value *Op = Trunc->getOperand(0);
1041  if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1042  return false;
1043 
1044  // If the truncated value is not an induction variable, return false.
1045  return Legal->isInductionPhi(Op);
1046  }
1047 
1048  /// Collects the instructions to scalarize for each predicated instruction in
1049  /// the loop.
1050  void collectInstsToScalarize(unsigned VF);
1051 
1052  /// Collect Uniform and Scalar values for the given \p VF.
1053  /// The sets depend on CM decision for Load/Store instructions
1054  /// that may be vectorized as interleave, gather-scatter or scalarized.
1055  void collectUniformsAndScalars(unsigned VF) {
1056  // Do the analysis once.
1057  if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1058  return;
1059  setCostBasedWideningDecision(VF);
1060  collectLoopUniforms(VF);
1061  collectLoopScalars(VF);
1062  }
1063 
1064  /// Returns true if the target machine supports masked store operation
1065  /// for the given \p DataType and kind of access to \p Ptr.
1067  return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
1068  }
1069 
1070  /// Returns true if the target machine supports masked load operation
1071  /// for the given \p DataType and kind of access to \p Ptr.
1073  return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
1074  }
1075 
1076  /// Returns true if the target machine supports masked scatter operation
1077  /// for the given \p DataType.
1079  return TTI.isLegalMaskedScatter(DataType);
1080  }
1081 
1082  /// Returns true if the target machine supports masked gather operation
1083  /// for the given \p DataType.
1085  return TTI.isLegalMaskedGather(DataType);
1086  }
1087 
1088  /// Returns true if the target machine can represent \p V as a masked gather
1089  /// or scatter operation.
1091  bool LI = isa<LoadInst>(V);
1092  bool SI = isa<StoreInst>(V);
1093  if (!LI && !SI)
1094  return false;
1095  auto *Ty = getMemInstValueType(V);
1096  return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1097  }
1098 
1099  /// Returns true if \p I is an instruction that will be scalarized with
1100  /// predication. Such instructions include conditional stores and
1101  /// instructions that may divide by zero.
1102  /// If a non-zero VF has been calculated, we check if I will be scalarized
1103  /// If a non-zero VF has been calculated, we check if I will be scalarized
1104  /// with predication for that VF.
1105 
1106  // Returns true if \p I is an instruction that will be predicated either
1107  // through scalar predication or masked load/store or masked gather/scatter.
1108  // Superset of instructions that return true for isScalarWithPredication.
1110  if (!blockNeedsPredication(I->getParent()))
1111  return false;
1112  // Loads and stores that need some form of masked operation are predicated
1113  // instructions.
1114  if (isa<LoadInst>(I) || isa<StoreInst>(I))
1115  return Legal->isMaskRequired(I);
1116  return isScalarWithPredication(I);
1117  }
1118 
1119  /// Returns true if \p I is a memory instruction with consecutive memory
1120  /// access that can be widened.
1121  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1122 
1123  /// Returns true if \p I is a memory instruction in an interleaved-group
1124  /// of memory accesses that can be vectorized with wide vector loads/stores
1125  /// and shuffles.
1126  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1127 
1128  /// Check if \p Instr belongs to any interleaved access group.
1130  return InterleaveInfo.isInterleaved(Instr);
1131  }
1132 
1133  /// Get the interleaved access group that \p Instr belongs to.
1136  return InterleaveInfo.getInterleaveGroup(Instr);
1137  }
1138 
1139  /// Returns true if an interleaved group requires a scalar iteration
1140  /// to handle accesses with gaps, and there is nothing preventing us from
1141  /// creating a scalar epilogue.
1142  bool requiresScalarEpilogue() const {
1143  return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue();
1144  }
1145 
1146  /// Returns true if a scalar epilogue is not allowed due to optsize.
1147  bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; }
1148 
1149  /// Returns true if all loop blocks should be masked to fold tail loop.
1150  bool foldTailByMasking() const { return FoldTailByMasking; }
1151 
1153  return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1154  }
1155 
1156 private:
1157  unsigned NumPredStores = 0;
1158 
1159  /// \return An upper bound for the vectorization factor, larger than zero.
1160  /// One is returned if vectorization should best be avoided due to cost.
1161  unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
1162 
1163  /// The vectorization cost is a combination of the cost itself and a boolean
1164  /// indicating whether any of the contributing operations will actually
1165  /// operate on vector values after type legalization in the backend. If this
1166  /// latter value is false, then all operations will be scalarized (i.e. no
1167  /// vectorization has actually taken place).
1170  using VectorizationCostTy = std::pair<unsigned, bool>;
1171 
1172  /// Returns the expected execution cost. The unit of the cost does
1173  /// not matter because we use the 'cost' units to compare different
1174  /// vector widths. The cost that is returned is *not* normalized by
1175  /// the factor width.
1176  VectorizationCostTy expectedCost(unsigned VF);
1177 
1178  /// Returns the execution time cost of an instruction for a given vector
1179  /// width. Vector width of one means scalar.
1180  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1181 
1182  /// The cost-computation logic from getInstructionCost which provides
1183  /// the vector type as an output parameter.
1184  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1185 
1186  /// Calculate vectorization cost of memory instruction \p I.
1187  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1188 
1189  /// The cost computation for scalarized memory instruction.
1190  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1191 
1192  /// The cost computation for interleaving group of memory instructions.
1193  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1194 
1195  /// The cost computation for Gather/Scatter instruction.
1196  unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1197 
1198  /// The cost computation for widening instruction \p I with consecutive
1199  /// memory access.
1200  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1201 
1202  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1203  /// Load: scalar load + broadcast.
1204  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1205  /// element)
1206  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1207 
1208  /// Returns whether the instruction is a load or store and will be emitted
1209  /// as a vector operation.
1210  bool isConsecutiveLoadOrStore(Instruction *I);
1211 
1212  /// Returns true if an artificially high cost for emulated masked memrefs
1213  /// should be used.
1214  bool useEmulatedMaskMemRefHack(Instruction *I);
1215 
1216  /// Create an analysis remark that explains why vectorization failed
1217  ///
1218  /// \p RemarkName is the identifier for the remark. \return the remark object
1219  /// that can be streamed to.
1220  OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
1221  return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(),
1222  RemarkName, TheLoop);
1223  }
1224 
1225  /// Map of scalar integer values to the smallest bitwidth they can be legally
1226  /// represented as. The vector equivalents of these values should be truncated
1227  /// to this type.
1229 
1230  /// A type representing the costs for instructions if they were to be
1231  /// scalarized rather than vectorized. The entries are Instruction-Cost
1232  /// pairs.
1234 
1235  /// A set containing all BasicBlocks that are known to be present after
1236  /// vectorization as predicated blocks.
1237  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1238 
1239  /// Records whether it is allowed to have the original scalar loop execute at
1240  /// least once. This may be needed as a fallback loop in case runtime
1241  /// aliasing/dependence checks fail, or to handle the tail/remainder
1242  /// iterations when the trip count is unknown or doesn't divide by the VF,
1243  /// or as a peel-loop to handle gaps in interleave-groups.
1244  /// Under optsize and when the trip count is very small we don't allow any
1245  /// iterations to execute in the scalar loop.
1246  bool IsScalarEpilogueAllowed = true;
1247 
1248  /// All blocks of loop are to be masked to fold tail of scalar iterations.
1249  bool FoldTailByMasking = false;
1250 
1251  /// A map holding scalar costs for different vectorization factors. The
1252  /// presence of a cost for an instruction in the mapping indicates that the
1253  /// instruction will be scalarized when vectorizing with the associated
1254  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1255  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1256 
1257  /// Holds the instructions known to be uniform after vectorization.
1258  /// The data is collected per VF.
1260 
1261  /// Holds the instructions known to be scalar after vectorization.
1262  /// The data is collected per VF.
1264 
1265  /// Holds the instructions (address computations) that are forced to be
1266  /// scalarized.
1268 
1269  /// Returns the expected difference in cost from scalarizing the expression
1270  /// feeding a predicated instruction \p PredInst. The instructions to
1271  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1272  /// non-negative return value implies the expression will be scalarized.
1273  /// Currently, only single-use chains are considered for scalarization.
1274  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1275  unsigned VF);
1276 
1277  /// Collect the instructions that are uniform after vectorization. An
1278  /// instruction is uniform if we represent it with a single scalar value in
1279  /// the vectorized loop corresponding to each vector iteration. Examples of
1280  /// uniform instructions include pointer operands of consecutive or
1281  /// interleaved memory accesses. Note that although uniformity implies an
1282  /// instruction will be scalar, the reverse is not true. In general, a
1283  /// scalarized instruction will be represented by VF scalar values in the
1284  /// vectorized loop, each corresponding to an iteration of the original
1285  /// scalar loop.
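  ///
  /// For example (an illustrative case): when a consecutive store a[i] = x is
  /// widened, only the address of the first vector lane is needed per vector
  /// iteration, so the GEP feeding the store remains a single scalar value and
  /// is uniform after vectorization, even though the store itself is widened.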
1286  void collectLoopUniforms(unsigned VF);
1287 
1288  /// Collect the instructions that are scalar after vectorization. An
1289  /// instruction is scalar if it is known to be uniform or will be scalarized
1290  /// during vectorization. Non-uniform scalarized instructions will be
1291  /// represented by VF values in the vectorized loop, each corresponding to an
1292  /// iteration of the original scalar loop.
1293  void collectLoopScalars(unsigned VF);
1294 
1295  /// Keeps cost model vectorization decision and cost for instructions.
1296  /// Right now it is used for memory instructions only.
1298  std::pair<InstWidening, unsigned>>;
1299 
1300  DecisionList WideningDecisions;
1301 
1302 public:
1303  /// The loop that we evaluate.
1305 
1306  /// Predicated scalar evolution analysis.
1308 
1309  /// Loop Info analysis.
1311 
1312  /// Vectorization legality.
1314 
1315  /// Vector target information.
1317 
1318  /// Target Library Info.
1320 
1321  /// Demanded bits analysis.
1323 
1324  /// Assumption cache.
1326 
1327  /// Interface to emit optimization remarks.
1329 
1331 
1332  /// Loop Vectorize Hint.
1334 
1335  /// The interleave access information contains groups of interleaved accesses
1336  /// with the same stride and close to each other.
1338 
1339  /// Values to ignore in the cost model.
1341 
1342  /// Values to ignore in the cost model when VF > 1.
1344 };
1345 
1346 } // end namespace llvm
1347 
1348 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1349 // vectorization. The loop needs to be annotated with #pragma omp simd
1350 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1351 // vector length information is not provided, vectorization is not considered
1352 // explicit. Interleave hints are not allowed either. These limitations will be
1353 // relaxed in the future.
1354 // Please note that we are currently forced to abuse the pragma 'clang
1355 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1356 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1357 // provides *explicit vectorization hints* (LV can bypass legal checks and
1358 // assume that vectorization is legal). However, both hints are implemented
1359 // using the same metadata (llvm.loop.vectorize, processed by
1360 // LoopVectorizeHints). This will be fixed in the future when the native IR
1361 // representation for pragma 'omp simd' is introduced.
1362 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1364  assert(!OuterLp->empty() && "This is not an outer loop");
1365  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1366 
1367  // Only outer loops with an explicit vectorization hint are supported.
1368  // Unannotated outer loops are ignored.
1369  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1370  return false;
1371 
1372  Function *Fn = OuterLp->getHeader()->getParent();
1373  if (!Hints.allowVectorization(Fn, OuterLp, false /*AlwaysVectorize*/)) {
1374  LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1375  return false;
1376  }
1377 
1378  if (!Hints.getWidth()) {
1379  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n");
1380  emitMissedWarning(Fn, OuterLp, Hints, ORE);
1381  return false;
1382  }
1383 
1384  if (Hints.getInterleave() > 1) {
1385  // TODO: Interleave support is future work.
1386  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1387  "outer loops.\n");
1388  emitMissedWarning(Fn, OuterLp, Hints, ORE);
1389  return false;
1390  }
1391 
1392  return true;
1393 }
1394 
1398  // Collect inner loops and outer loops without irreducible control flow. For
1399  // now, only collect outer loops that have explicit vectorization hints. If we
1400  // are stress testing the VPlan H-CFG construction, we collect the outermost
1401  // loop of every loop nest.
1402  if (L.empty() || VPlanBuildStressTest ||
1404  LoopBlocksRPO RPOT(&L);
1405  RPOT.perform(LI);
1406  if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1407  V.push_back(&L);
1408  // TODO: Collect inner loops inside marked outer loops in case
1409  // vectorization fails for the outer loop. Do not invoke
1410  // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1411  // already known to be reducible. We can use an inherited attribute for
1412  // that.
1413  return;
1414  }
1415  }
1416  for (Loop *InnerL : L)
1417  collectSupportedLoops(*InnerL, LI, ORE, V);
1418 }
1419 
1420 namespace {
1421 
1422 /// The LoopVectorize Pass.
1423 struct LoopVectorize : public FunctionPass {
1424  /// Pass identification, replacement for typeid
1425  static char ID;
1426 
1427  LoopVectorizePass Impl;
1428 
1429  explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
1430  : FunctionPass(ID) {
1431  Impl.DisableUnrolling = NoUnrolling;
1432  Impl.AlwaysVectorize = AlwaysVectorize;
1434  }
1435 
1436  bool runOnFunction(Function &F) override {
1437  if (skipFunction(F))
1438  return false;
1439 
1440  auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1441  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1442  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1443  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1444  auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1445  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1446  auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
1447  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1448  auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1449  auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1450  auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1451  auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1452 
1453  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1454  [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1455 
1456  return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1457  GetLAA, *ORE);
1458  }
1459 
1460  void getAnalysisUsage(AnalysisUsage &AU) const override {
1471 
1472  // We currently do not preserve loopinfo/dominator analyses with outer loop
1473  // vectorization. Until this is addressed, mark these analyses as preserved
1474  // only for non-VPlan-native path.
1475  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1476  if (!EnableVPlanNativePath) {
1479  }
1480 
1483  }
1484 };
1485 
1486 } // end anonymous namespace
1487 
1488 //===----------------------------------------------------------------------===//
1489 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1490 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1491 //===----------------------------------------------------------------------===//
1492 
1493 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1494  // We need to place the broadcast of invariant variables outside the loop,
1495  // but only if it's proven safe to do so. Else, broadcast will be inside
1496  // vector loop body.
1497  Instruction *Instr = dyn_cast<Instruction>(V);
1498  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1499  (!Instr ||
1500  DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1501  // Place the code for broadcasting invariant variables in the new preheader.
1502  IRBuilder<>::InsertPointGuard Guard(Builder);
1503  if (SafeToHoist)
1504  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1505 
1506  // Broadcast the scalar into all locations in the vector.
1507  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1508 
1509  return Shuf;
1510 }
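// Illustrative sketch (assuming VF = 4 and a scalar i32 %x): the vector splat
// created above typically looks like
//   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> undef, <4 x i32> zeroinitializer
// and is emitted in the vector preheader when SafeToHoist is true, otherwise
// inside the vector loop body.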
1511 
1512 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1513  const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1514  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1515  "Expected either an induction phi-node or a truncate of it!");
1516  Value *Start = II.getStartValue();
1517 
1518  // Construct the initial value of the vector IV in the vector loop preheader
1519  auto CurrIP = Builder.saveIP();
1520  Builder.SetInsertPoint(&*LoopVectorPreHeader->getFirstInsertionPt());
1521  if (isa<TruncInst>(EntryVal)) {
1522  assert(Start->getType()->isIntegerTy() &&
1523  "Truncation requires an integer type");
1524  auto *TruncType = cast<IntegerType>(EntryVal->getType());
1525  Step = Builder.CreateTrunc(Step, TruncType);
1526  Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1527  }
1528  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1529  Value *SteppedStart =
1530  getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1531 
1532  // We create vector phi nodes for both integer and floating-point induction
1533  // variables. Here, we determine the kind of arithmetic we will perform.
1534  Instruction::BinaryOps AddOp;
1535  Instruction::BinaryOps MulOp;
1536  if (Step->getType()->isIntegerTy()) {
1537  AddOp = Instruction::Add;
1538  MulOp = Instruction::Mul;
1539  } else {
1540  AddOp = II.getInductionOpcode();
1541  MulOp = Instruction::FMul;
1542  }
1543 
1544  // Multiply the vectorization factor by the step using integer or
1545  // floating-point arithmetic as appropriate.
1546  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1547  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1548 
1549  // Create a vector splat to use in the induction update.
1550  //
1551  // FIXME: If the step is non-constant, we create the vector splat with
1552  // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1553  // handle a constant vector splat.
1554  Value *SplatVF = isa<Constant>(Mul)
1555  ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1556  : Builder.CreateVectorSplat(VF, Mul);
1557  Builder.restoreIP(CurrIP);
1558 
1559  // We may need to add the step a number of times, depending on the unroll
1560  // factor. The last of those goes into the PHI.
1561  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1562  &*LoopVectorBody->getFirstInsertionPt());
1563  VecInd->setDebugLoc(EntryVal->getDebugLoc());
1564  Instruction *LastInduction = VecInd;
1565  for (unsigned Part = 0; Part < UF; ++Part) {
1566  VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1567 
1568  if (isa<TruncInst>(EntryVal))
1569  addMetadata(LastInduction, EntryVal);
1570  recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1571 
1572  LastInduction = cast<Instruction>(addFastMathFlag(
1573  Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1574  LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1575  }
1576 
1577  // Move the last step to the end of the latch block. This ensures consistent
1578  // placement of all induction updates.
1579  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1580  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1581  auto *ICmp = cast<Instruction>(Br->getCondition());
1582  LastInduction->moveBefore(ICmp);
1583  LastInduction->setName("vec.ind.next");
1584 
1585  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1586  VecInd->addIncoming(LastInduction, LoopVectorLatch);
1587 }
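// Illustrative example (assuming an i32 induction with start 0, step 1,
// VF = 4 and UF = 2): SteppedStart is <0, 1, 2, 3>, SplatVF is <4, 4, 4, 4>,
// and the code above produces roughly
//   %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
//                                 [ %vec.ind.next, %vector.body ]
//   %step.add     = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
//   %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>
// with the final update moved to sit just before the latch compare.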
1588 
1589 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1590  return Cost->isScalarAfterVectorization(I, VF) ||
1591  Cost->isProfitableToScalarize(I, VF);
1592 }
1593 
1594 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1595  if (shouldScalarizeInstruction(IV))
1596  return true;
1597  auto isScalarInst = [&](User *U) -> bool {
1598  auto *I = cast<Instruction>(U);
1599  return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1600  };
1601  return llvm::any_of(IV->users(), isScalarInst);
1602 }
1603 
1604 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1605  const InductionDescriptor &ID, const Instruction *EntryVal,
1606  Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1607  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1608  "Expected either an induction phi-node or a truncate of it!");
1609 
1610  // This induction variable is not the phi from the original loop but the
1611  // newly-created IV based on the proof that casted Phi is equal to the
1612  // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
1613  // re-uses the same InductionDescriptor that original IV uses but we don't
1614  // have to do any recording in this case - that is done when original IV is
1615  // processed.
1616  if (isa<TruncInst>(EntryVal))
1617  return;
1618 
1619  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1620  if (Casts.empty())
1621  return;
1622  // Only the first Cast instruction in the Casts vector is of interest.
1623  // The rest of the Casts (if exist) have no uses outside the
1624  // induction update chain itself.
1625  Instruction *CastInst = *Casts.begin();
1626  if (Lane < UINT_MAX)
1627  VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1628  else
1629  VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1630 }
1631 
1632 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1633  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1634  "Primary induction variable must have an integer type");
1635 
1636  auto II = Legal->getInductionVars()->find(IV);
1637  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1638 
1639  auto ID = II->second;
1640  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1641 
1642  // The scalar value to broadcast. This will be derived from the canonical
1643  // induction variable.
1644  Value *ScalarIV = nullptr;
1645 
1646  // The value from the original loop to which we are mapping the new induction
1647  // variable.
1648  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1649 
1650  // True if we have vectorized the induction variable.
1651  auto VectorizedIV = false;
1652 
1653  // Determine if we want a scalar version of the induction variable. This is
1654  // true if the induction variable itself is not widened, or if it has at
1655  // least one user in the loop that is not widened.
1656  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1657 
1658  // Generate code for the induction step. Note that induction steps are
1659  // required to be loop-invariant
1660  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1661  "Induction step should be loop invariant");
1662  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1663  Value *Step = nullptr;
1664  if (PSE.getSE()->isSCEVable(IV->getType())) {
1665  SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1666  Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1667  &*Builder.GetInsertPoint());
1668  } else {
1669  Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1670  }
1671 
1672  // Try to create a new independent vector induction variable. If we can't
1673  // create the phi node, we will splat the scalar induction variable in each
1674  // loop iteration.
1675  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1676  createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1677  VectorizedIV = true;
1678  }
1679 
1680  // If we haven't yet vectorized the induction variable, or if we will create
1681  // a scalar one, we need to define the scalar induction variable and step
1682  // values. If we were given a truncation type, truncate the canonical
1683  // induction variable and step. Otherwise, derive these values from the
1684  // induction descriptor.
1685  if (!VectorizedIV || NeedsScalarIV) {
1686  ScalarIV = Induction;
1687  if (IV != OldInduction) {
1688  ScalarIV = IV->getType()->isIntegerTy()
1689  ? Induction
1690  : Builder.CreateCast(Instruction::SIToFP, Induction,
1691  IV->getType());
1692  ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1693  ScalarIV->setName("offset.idx");
1694  }
1695  if (Trunc) {
1696  auto *TruncType = cast<IntegerType>(Trunc->getType());
1697  assert(Step->getType()->isIntegerTy() &&
1698  "Truncation requires an integer step");
1699  ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1700  Step = Builder.CreateTrunc(Step, TruncType);
1701  }
1702  }
1703 
1704  // If we haven't yet vectorized the induction variable, splat the scalar
1705  // induction variable, and build the necessary step vectors.
1706  // TODO: Don't do it unless the vectorized IV is really required.
1707  if (!VectorizedIV) {
1708  Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1709  for (unsigned Part = 0; Part < UF; ++Part) {
1710  Value *EntryPart =
1711  getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1712  VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1713  if (Trunc)
1714  addMetadata(EntryPart, Trunc);
1715  recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1716  }
1717  }
1718 
1719  // If an induction variable is only used for counting loop iterations or
1720  // calculating addresses, it doesn't need to be widened. Create scalar steps
1721  // that can be used by instructions we will later scalarize. Note that the
1722  // addition of the scalar steps will not increase the number of instructions
1723  // in the loop in the common case prior to InstCombine. We will be trading
1724  // one vector extract for each scalar step.
1725  if (NeedsScalarIV)
1726  buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1727 }
1728 
1729 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1730  Instruction::BinaryOps BinOp) {
1731  // Create and check the types.
1732  assert(Val->getType()->isVectorTy() && "Must be a vector");
1733  int VLen = Val->getType()->getVectorNumElements();
1734 
1735  Type *STy = Val->getType()->getScalarType();
1736  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1737  "Induction Step must be an integer or FP");
1738  assert(Step->getType() == STy && "Step has wrong type");
1739 
1740  SmallVector<Constant *, 8> Indices;
1741 
1742  if (STy->isIntegerTy()) {
1743  // Create a vector of consecutive numbers from zero to VF.
1744  for (int i = 0; i < VLen; ++i)
1745  Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1746 
1747  // Add the consecutive indices to the vector value.
1748  Constant *Cv = ConstantVector::get(Indices);
1749  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1750  Step = Builder.CreateVectorSplat(VLen, Step);
1751  assert(Step->getType() == Val->getType() && "Invalid step vec");
1752  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1753  // which can be found from the original scalar operations.
1754  Step = Builder.CreateMul(Cv, Step);
1755  return Builder.CreateAdd(Val, Step, "induction");
1756  }
1757 
1758  // Floating point induction.
1759  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1760  "Binary Opcode should be specified for FP induction");
1761  // Create a vector of consecutive numbers from zero to VF.
1762  for (int i = 0; i < VLen; ++i)
1763  Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1764 
1765  // Add the consecutive indices to the vector value.
1766  Constant *Cv = ConstantVector::get(Indices);
1767 
1768  Step = Builder.CreateVectorSplat(VLen, Step);
1769 
1770  // Floating point operations had to be 'fast' to enable the induction.
1771  FastMathFlags Flags;
1772  Flags.setFast();
1773 
1774  Value *MulOp = Builder.CreateFMul(Cv, Step);
1775  if (isa<Instruction>(MulOp))
1776  // Have to check, MulOp may be a constant
1777  cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1778 
1779  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1780  if (isa<Instruction>(BOp))
1781  cast<Instruction>(BOp)->setFastMathFlags(Flags);
1782  return BOp;
1783 }
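// Illustrative example (assuming Val is a splat of a scalar %s, StartIdx = 0,
// an integer Step of 2 and VF = 4): the integer path above computes
// <0, 1, 2, 3> * <2, 2, 2, 2> = <0, 2, 4, 6> and returns
// <%s, %s+2, %s+4, %s+6> as the "induction" value.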
1784 
1785 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1786  Instruction *EntryVal,
1787  const InductionDescriptor &ID) {
1788  // We shouldn't have to build scalar steps if we aren't vectorizing.
1789  assert(VF > 1 && "VF should be greater than one");
1790 
1791  // Get the value type and ensure it and the step have the same integer type.
1792  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1793  assert(ScalarIVTy == Step->getType() &&
1794  "Val and Step should have the same type");
1795 
1796  // We build scalar steps for both integer and floating-point induction
1797  // variables. Here, we determine the kind of arithmetic we will perform.
1798  Instruction::BinaryOps AddOp;
1799  Instruction::BinaryOps MulOp;
1800  if (ScalarIVTy->isIntegerTy()) {
1801  AddOp = Instruction::Add;
1802  MulOp = Instruction::Mul;
1803  } else {
1804  AddOp = ID.getInductionOpcode();
1805  MulOp = Instruction::FMul;
1806  }
1807 
1808  // Determine the number of scalars we need to generate for each unroll
1809  // iteration. If EntryVal is uniform, we only need to generate the first
1810  // lane. Otherwise, we generate all VF values.
1811  unsigned Lanes =
1812  Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1813  : VF;
1814  // Compute the scalar steps and save the results in VectorLoopValueMap.
1815  for (unsigned Part = 0; Part < UF; ++Part) {
1816  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1817  auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1818  auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1819  auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1820  VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1821  recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1822  }
1823  }
1824 }
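// Illustrative example (assuming UF = 2, VF = 4 and a non-uniform EntryVal):
// the loops above produce eight scalars ScalarIV + (VF * Part + Lane) * Step,
// i.e. lanes 0..3 of part 0 get offsets 0..3 * Step and lanes 0..3 of part 1
// get offsets 4..7 * Step; a uniform EntryVal only gets lane 0 of each part
// (offsets 0 and 4 * Step).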
1825 
1826 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1827  assert(V != Induction && "The new induction variable should not be used.");
1828  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1829  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1830 
1831  // If we have a stride that is replaced by one, do it here. Defer this for
1832  // the VPlan-native path until we start running Legal checks in that path.
1833  if (!EnableVPlanNativePath && Legal->hasStride(V))
1834  V = ConstantInt::get(V->getType(), 1);
1835 
1836  // If we have a vector mapped to this value, return it.
1837  if (VectorLoopValueMap.hasVectorValue(V, Part))
1838  return VectorLoopValueMap.getVectorValue(V, Part);
1839 
1840  // If the value has not been vectorized, check if it has been scalarized
1841  // instead. If it has been scalarized, and we actually need the value in
1842  // vector form, we will construct the vector values on demand.
1843  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
1844  Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
1845 
1846  // If we've scalarized a value, that value should be an instruction.
1847  auto *I = cast<Instruction>(V);
1848 
1849  // If we aren't vectorizing, we can just copy the scalar map values over to
1850  // the vector map.
1851  if (VF == 1) {
1852  VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
1853  return ScalarValue;
1854  }
1855 
1856  // Get the last scalar instruction we generated for V and Part. If the value
1857  // is known to be uniform after vectorization, this corresponds to lane zero
1858  // of the Part unroll iteration. Otherwise, the last instruction is the one
1859  // we created for the last vector lane of the Part unroll iteration.
1860  unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
1861  auto *LastInst = cast<Instruction>(
1862  VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
1863 
1864  // Set the insert point after the last scalarized instruction. This ensures
1865  // the insertelement sequence will directly follow the scalar definitions.
1866  auto OldIP = Builder.saveIP();
1867  auto NewIP = std::next(BasicBlock::iterator(LastInst));
1868  Builder.SetInsertPoint(&*NewIP);
1869 
1870  // However, if we are vectorizing, we need to construct the vector values.
1871  // If the value is known to be uniform after vectorization, we can just
1872  // broadcast the scalar value corresponding to lane zero for each unroll
1873  // iteration. Otherwise, we construct the vector values using insertelement
1874  // instructions. Since the resulting vectors are stored in
1875  // VectorLoopValueMap, we will only generate the insertelements once.
1876  Value *VectorValue = nullptr;
1877  if (Cost->isUniformAfterVectorization(I, VF)) {
1878  VectorValue = getBroadcastInstrs(ScalarValue);
1879  VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
1880  } else {
1881  // Initialize packing with insertelements to start from undef.
1882  Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
1883  VectorLoopValueMap.setVectorValue(V, Part, Undef);
1884  for (unsigned Lane = 0; Lane < VF; ++Lane)
1885  packScalarIntoVectorValue(V, {Part, Lane});
1886  VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
1887  }
1888  Builder.restoreIP(OldIP);
1889  return VectorValue;
1890  }
1891 
1892  // If this scalar is unknown, assume that it is a constant or that it is
1893  // loop invariant. Broadcast V and save the value for future uses.
1894  Value *B = getBroadcastInstrs(V);
1895  VectorLoopValueMap.setVectorValue(V, Part, B);
1896  return B;
1897 }
1898 
1899 Value *
1900 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
1901  const VPIteration &Instance) {
1902  // If the value is not an instruction contained in the loop, it should
1903  // already be scalar.
1904  if (OrigLoop->isLoopInvariant(V))
1905  return V;
1906 
1907  assert(Instance.Lane > 0
1908  ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
1909  : true && "Uniform values only have lane zero");
1910 
1911  // If the value from the original loop has not been vectorized, it is
1912  // represented by UF x VF scalar values in the new loop. Return the requested
1913  // scalar value.
1914  if (VectorLoopValueMap.hasScalarValue(V, Instance))
1915  return VectorLoopValueMap.getScalarValue(V, Instance);
1916 
1917  // If the value has not been scalarized, get its entry in VectorLoopValueMap
1918  // for the given unroll part. If this entry is not a vector type (i.e., the
1919  // vectorization factor is one), there is no need to generate an
1920  // extractelement instruction.
1921  auto *U = getOrCreateVectorValue(V, Instance.Part);
1922  if (!U->getType()->isVectorTy()) {
1923  assert(VF == 1 && "Value not scalarized has non-vector type");
1924  return U;
1925  }
1926 
1927  // Otherwise, the value from the original loop has been vectorized and is
1928  // represented by UF vector values. Extract and return the requested scalar
1929  // value from the appropriate vector lane.
1930  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
1931 }
1932 
1933 void InnerLoopVectorizer::packScalarIntoVectorValue(
1934  Value *V, const VPIteration &Instance) {
1935  assert(V != Induction && "The new induction variable should not be used.");
1936  assert(!V->getType()->isVectorTy() && "Can't pack a vector");
1937  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1938 
1939  Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
1940  Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
1941  VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
1942  Builder.getInt32(Instance.Lane));
1943  VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
1944 }
1945 
1946 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
1947  assert(Vec->getType()->isVectorTy() && "Invalid type");
1948  SmallVector<Constant *, 8> ShuffleMask;
1949  for (unsigned i = 0; i < VF; ++i)
1950  ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
1951 
1952  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
1953  ConstantVector::get(ShuffleMask),
1954  "reverse");
1955 }
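// Illustrative example (assuming VF = 4): the shuffle mask built above is
// <3, 2, 1, 0>, so reverseVector(<a, b, c, d>) yields <d, c, b, a>.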
1956 
1957 // Return whether we allow using masked interleave-groups (for dealing with
1958 // strided loads/stores that reside in predicated blocks, or for dealing
1959 // with gaps).
1960 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
1961  // If an override option has been passed in for interleaved accesses, use it.
1962  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
1963  return EnableMaskedInterleavedMemAccesses;
1964 
1965  return TTI.enableMaskedInterleavedAccessVectorization();
1966 }
1967 
1968 // Try to vectorize the interleave group that \p Instr belongs to.
1969 //
1970 // E.g. Translate following interleaved load group (factor = 3):
1971 // for (i = 0; i < N; i+=3) {
1972 // R = Pic[i]; // Member of index 0
1973 // G = Pic[i+1]; // Member of index 1
1974 // B = Pic[i+2]; // Member of index 2
1975 // ... // do something to R, G, B
1976 // }
1977 // To:
1978 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
1979 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
1980 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
1981 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
1982 //
1983 // Or translate following interleaved store group (factor = 3):
1984 // for (i = 0; i < N; i+=3) {
1985 // ... do something to R, G, B
1986 // Pic[i] = R; // Member of index 0
1987 // Pic[i+1] = G; // Member of index 1
1988 // Pic[i+2] = B; // Member of index 2
1989 // }
1990 // To:
1991 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
1992 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
1993 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
1994 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
1995 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
1996 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
1997  VectorParts *BlockInMask) {
1998  const InterleaveGroup<Instruction> *Group =
1999  Cost->getInterleavedAccessGroup(Instr);
2000  assert(Group && "Fail to get an interleaved access group.");
2001 
2002  // Skip if current instruction is not the insert position.
2003  if (Instr != Group->getInsertPos())
2004  return;
2005 
2006  const DataLayout &DL = Instr->getModule()->getDataLayout();
2007  Value *Ptr = getLoadStorePointerOperand(Instr);
2008 
2009  // Prepare for the vector type of the interleaved load/store.
2010  Type *ScalarTy = getMemInstValueType(Instr);
2011  unsigned InterleaveFactor = Group->getFactor();
2012  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2013  Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2014 
2015  // Prepare for the new pointers.
2016  setDebugLocFromInst(Builder, Ptr);
2017  SmallVector<Value *, 2> NewPtrs;
2018  unsigned Index = Group->getIndex(Instr);
2019 
2020  VectorParts Mask;
2021  bool IsMaskForCondRequired = BlockInMask;
2022  if (IsMaskForCondRequired) {
2023  Mask = *BlockInMask;
2024  // TODO: extend the masked interleaved-group support to reversed access.
2025  assert(!Group->isReverse() && "Reversed masked interleave-group "
2026  "not supported.");
2027  }
2028 
2029  // If the group is reverse, adjust the index to refer to the last vector lane
2030  // instead of the first. We adjust the index from the first vector lane,
2031  // rather than directly getting the pointer for lane VF - 1, because the
2032  // pointer operand of the interleaved access is supposed to be uniform. For
2033  // uniform instructions, we're only required to generate a value for the
2034  // first vector lane in each unroll iteration.
2035  if (Group->isReverse())
2036  Index += (VF - 1) * Group->getFactor();
2037 
2038  bool InBounds = false;
2039  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2040  InBounds = gep->isInBounds();
2041 
2042  for (unsigned Part = 0; Part < UF; Part++) {
2043  Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2044 
2045  // Notice current instruction could be any index. Need to adjust the address
2046  // to the member of index 0.
2047  //
2048  // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2049  // b = A[i]; // Member of index 0
2050  // Current pointer is pointed to A[i+1], adjust it to A[i].
2051  //
2052  // E.g. A[i+1] = a; // Member of index 1
2053  // A[i] = b; // Member of index 0
2054  // A[i+2] = c; // Member of index 2 (Current instruction)
2055  // Current pointer is pointed to A[i+2], adjust it to A[i].
2056  NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
2057  if (InBounds)
2058  cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2059 
2060  // Cast to the vector pointer type.
2061  NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2062  }
2063 
2064  setDebugLocFromInst(Builder, Instr);
2065  Value *UndefVec = UndefValue::get(VecTy);
2066 
2067  Value *MaskForGaps = nullptr;
2068  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2069  MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2070  assert(MaskForGaps && "Mask for Gaps is required but it is null");
2071  }
2072 
2073  // Vectorize the interleaved load group.
2074  if (isa<LoadInst>(Instr)) {
2075  // For each unroll part, create a wide load for the group.
2076  SmallVector<Value *, 2> NewLoads;
2077  for (unsigned Part = 0; Part < UF; Part++) {
2078  Instruction *NewLoad;
2079  if (IsMaskForCondRequired || MaskForGaps) {
2080  assert(useMaskedInterleavedAccesses(*TTI) &&
2081  "masked interleaved groups are not allowed.");
2082  Value *GroupMask = MaskForGaps;
2083  if (IsMaskForCondRequired) {
2084  auto *Undefs = UndefValue::get(Mask[Part]->getType());
2085  auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2086  Value *ShuffledMask = Builder.CreateShuffleVector(
2087  Mask[Part], Undefs, RepMask, "interleaved.mask");
2088  GroupMask = MaskForGaps
2089  ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2090  MaskForGaps)
2091  : ShuffledMask;
2092  }
2093  NewLoad =
2094  Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2095  GroupMask, UndefVec, "wide.masked.vec");
2096  }
2097  else
2098  NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part],
2099  Group->getAlignment(), "wide.vec");
2100  Group->addMetadata(NewLoad);
2101  NewLoads.push_back(NewLoad);
2102  }
2103 
2104  // For each member in the group, shuffle out the appropriate data from the
2105  // wide loads.
2106  for (unsigned I = 0; I < InterleaveFactor; ++I) {
2107  Instruction *Member = Group->getMember(I);
2108 
2109  // Skip the gaps in the group.
2110  if (!Member)
2111  continue;
2112 
2113  Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2114  for (unsigned Part = 0; Part < UF; Part++) {
2115  Value *StridedVec = Builder.CreateShuffleVector(
2116  NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2117 
2118  // If this member has different type, cast the result type.
2119  if (Member->getType() != ScalarTy) {
2120  VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2121  StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2122  }
2123 
2124  if (Group->isReverse())
2125  StridedVec = reverseVector(StridedVec);
2126 
2127  VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2128  }
2129  }
2130  return;
2131  }
2132 
2133  // The sub vector type for current instruction.
2134  VectorType *SubVT = VectorType::get(ScalarTy, VF);
2135 
2136  // Vectorize the interleaved store group.
2137  for (unsigned Part = 0; Part < UF; Part++) {
2138  // Collect the stored vector from each member.
2139  SmallVector<Value *, 4> StoredVecs;
2140  for (unsigned i = 0; i < InterleaveFactor; i++) {
2141  // Interleaved store group doesn't allow a gap, so each index has a member
2142  Instruction *Member = Group->getMember(i);
2143  assert(Member && "Fail to get a member from an interleaved store group");
2144 
2145  Value *StoredVec = getOrCreateVectorValue(
2146  cast<StoreInst>(Member)->getValueOperand(), Part);
2147  if (Group->isReverse())
2148  StoredVec = reverseVector(StoredVec);
2149 
2150  // If this member has different type, cast it to a unified type.
2151 
2152  if (StoredVec->getType() != SubVT)
2153  StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2154 
2155  StoredVecs.push_back(StoredVec);
2156  }
2157 
2158  // Concatenate all vectors into a wide vector.
2159  Value *WideVec = concatenateVectors(Builder, StoredVecs);
2160 
2161  // Interleave the elements in the wide vector.
2162  Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2163  Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2164  "interleaved.vec");
2165 
2166  Instruction *NewStoreInstr;
2167  if (IsMaskForCondRequired) {
2168  auto *Undefs = UndefValue::get(Mask[Part]->getType());
2169  auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2170  Value *ShuffledMask = Builder.CreateShuffleVector(
2171  Mask[Part], Undefs, RepMask, "interleaved.mask");
2172  NewStoreInstr = Builder.CreateMaskedStore(
2173  IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2174  }
2175  else
2176  NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2177  Group->getAlignment());
2178 
2179  Group->addMetadata(NewStoreInstr);
2180  }
2181 }
2182 
2183 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2184  VectorParts *BlockInMask) {
2185  // Attempt to issue a wide load.
2186  LoadInst *LI = dyn_cast<LoadInst>(Instr);
2187  StoreInst *SI = dyn_cast<StoreInst>(Instr);
2188 
2189  assert((LI || SI) && "Invalid Load/Store instruction");
2190 
2191  LoopVectorizationCostModel::InstWidening Decision =
2192  Cost->getWideningDecision(Instr, VF);
2193  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2194  "CM decision should be taken at this point");
2195  if (Decision == LoopVectorizationCostModel::CM_Interleave)
2196  return vectorizeInterleaveGroup(Instr);
2197 
2198  Type *ScalarDataTy = getMemInstValueType(Instr);
2199  Type *DataTy = VectorType::get(ScalarDataTy, VF);
2200  Value *Ptr = getLoadStorePointerOperand(Instr);
2201  unsigned Alignment = getLoadStoreAlignment(Instr);
2202  // An alignment of 0 means target abi alignment. We need to use the scalar's
2203  // target abi alignment in such a case.
2204  const DataLayout &DL = Instr->getModule()->getDataLayout();
2205  if (!Alignment)
2206  Alignment = DL.getABITypeAlignment(ScalarDataTy);
2207  unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2208 
2209  // Determine if the pointer operand of the access is either consecutive or
2210  // reverse consecutive.
2211  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2212  bool ConsecutiveStride =
2213  Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2214  bool CreateGatherScatter =
2215  (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2216 
2217  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2218  // gather/scatter. Otherwise Decision should have been to Scalarize.
2219  assert((ConsecutiveStride || CreateGatherScatter) &&
2220  "The instruction should be scalarized");
2221 
2222  // Handle consecutive loads/stores.
2223  if (ConsecutiveStride)
2224  Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2225 
2226  VectorParts Mask;
2227  bool isMaskRequired = BlockInMask;
2228  if (isMaskRequired)
2229  Mask = *BlockInMask;
2230 
2231  bool InBounds = false;
2232  if (auto *gep = dyn_cast<GetElementPtrInst>(
2233  getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2234  InBounds = gep->isInBounds();
2235 
2236  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2237  // Calculate the pointer for the specific unroll-part.
2238  GetElementPtrInst *PartPtr = nullptr;
2239 
2240  if (Reverse) {
2241  // If the address is consecutive but reversed, then the
2242  // wide store needs to start at the last vector element.
2243  PartPtr = cast<GetElementPtrInst>(
2244  Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)));
2245  PartPtr->setIsInBounds(InBounds);
2246  PartPtr = cast<GetElementPtrInst>(
2247  Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)));
2248  PartPtr->setIsInBounds(InBounds);
2249  if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2250  Mask[Part] = reverseVector(Mask[Part]);
2251  } else {
2252  PartPtr = cast<GetElementPtrInst>(
2253  Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)));
2254  PartPtr->setIsInBounds(InBounds);
2255  }
2256 
2257  return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2258  };
2259 
2260  // Handle Stores:
2261  if (SI) {
2262  setDebugLocFromInst(Builder, SI);
2263 
2264  for (unsigned Part = 0; Part < UF; ++Part) {
2265  Instruction *NewSI = nullptr;
2266  Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2267  if (CreateGatherScatter) {
2268  Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2269  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2270  NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2271  MaskPart);
2272  } else {
2273  if (Reverse) {
2274  // If we store to reverse consecutive memory locations, then we need
2275  // to reverse the order of elements in the stored value.
2276  StoredVal = reverseVector(StoredVal);
2277  // We don't want to update the value in the map as it might be used in
2278  // another expression. So don't call resetVectorValue(StoredVal).
2279  }
2280  auto *VecPtr = CreateVecPtr(Part, Ptr);
2281  if (isMaskRequired)
2282  NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2283  Mask[Part]);
2284  else
2285  NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2286  }
2287  addMetadata(NewSI, SI);
2288  }
2289  return;
2290  }
2291 
2292  // Handle loads.
2293  assert(LI && "Must have a load instruction");
2294  setDebugLocFromInst(Builder, LI);
2295  for (unsigned Part = 0; Part < UF; ++Part) {
2296  Value *NewLI;
2297  if (CreateGatherScatter) {
2298  Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2299  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2300  NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2301  nullptr, "wide.masked.gather");
2302  addMetadata(NewLI, LI);
2303  } else {
2304  auto *VecPtr = CreateVecPtr(Part, Ptr);
2305  if (isMaskRequired)
2306  NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2307  UndefValue::get(DataTy),
2308  "wide.masked.load");
2309  else
2310  NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
2311 
2312  // Add metadata to the load, but setVectorValue to the reverse shuffle.
2313  addMetadata(NewLI, LI);
2314  if (Reverse)
2315  NewLI = reverseVector(NewLI);
2316  }
2317  VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2318  }
2319 }
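// Illustrative example (assuming a reverse consecutive load with VF = 4 and
// UF = 2): CreateVecPtr above offsets the scalar pointer by -Part * VF and
// then by 1 - VF, so part 0 loads roughly the four elements ending at the
// original address (Ptr[-3..0]) and part 1 the four elements before them
// (Ptr[-7..-4]); the loaded vectors are then reversed back into source order.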
2320 
2321 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2322  const VPIteration &Instance,
2323  bool IfPredicateInstr) {
2324  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2325 
2326  setDebugLocFromInst(Builder, Instr);
2327 
2328  // Does this instruction return a value ?
2329  bool IsVoidRetTy = Instr->getType()->isVoidTy();
2330 
2331  Instruction *Cloned = Instr->clone();
2332  if (!IsVoidRetTy)
2333  Cloned->setName(Instr->getName() + ".cloned");
2334 
2335  // Replace the operands of the cloned instructions with their scalar
2336  // equivalents in the new loop.
2337  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2338  auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2339  Cloned->setOperand(op, NewOp);
2340  }
2341  addNewMetadata(Cloned, Instr);
2342 
2343  // Place the cloned scalar in the new loop.
2344  Builder.Insert(Cloned);
2345 
2346  // Add the cloned scalar to the scalar map entry.
2347  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2348 
2349  // If we just cloned a new assumption, add it the assumption cache.
2350  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2351  if (II->getIntrinsicID() == Intrinsic::assume)
2352  AC->registerAssumption(II);
2353 
2354  // End if-block.
2355  if (IfPredicateInstr)
2356  PredicatedInstructions.push_back(Cloned);
2357 }
2358 
2359 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2360  Value *End, Value *Step,
2361  Instruction *DL) {
2362  BasicBlock *Header = L->getHeader();
2363  BasicBlock *Latch = L->getLoopLatch();
2364  // As we're just creating this loop, it's possible no latch exists
2365  // yet. If so, use the header as this will be a single block loop.
2366  if (!Latch)
2367  Latch = Header;
2368 
2369  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2370  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2371  setDebugLocFromInst(Builder, OldInst);
2372  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2373 
2374  Builder.SetInsertPoint(Latch->getTerminator());
2375  setDebugLocFromInst(Builder, OldInst);
2376 
2377  // Create i+1 and fill the PHINode.
2378  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2379  Induction->addIncoming(Start, L->getLoopPreheader());
2380  Induction->addIncoming(Next, Latch);
2381  // Create the compare.
2382  Value *ICmp = Builder.CreateICmpEQ(Next, End);
2383  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2384 
2385  // Now we have two terminators. Remove the old one from the block.
2386  Latch->getTerminator()->eraseFromParent();
2387 
2388  return Induction;
2389 }
2390 
2391 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2392  if (TripCount)
2393  return TripCount;
2394 
2395  assert(L && "Create Trip Count for null loop.");
2396  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2397  // Find the loop boundaries.
2398  ScalarEvolution *SE = PSE.getSE();
2399  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2400  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2401  "Invalid loop count");
2402 
2403  Type *IdxTy = Legal->getWidestInductionType();
2404  assert(IdxTy && "No type for induction");
2405 
2406  // The exit count might have the type of i64 while the phi is i32. This can
2407  // happen if we have an induction variable that is sign extended before the
2408  // compare. The only way that we get a backedge taken count is that the
2409  // induction variable was signed and as such will not overflow. In such a case
2410  // truncation is legal.
2411  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2412  IdxTy->getPrimitiveSizeInBits())
2413  BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2414  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2415 
2416  // Get the total trip count from the count by adding 1.
2417  const SCEV *ExitCount = SE->getAddExpr(
2418  BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2419 
2420  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2421 
2422  // Expand the trip count and place the new instructions in the preheader.
2423  // Notice that the pre-header does not change, only the loop body.
2424  SCEVExpander Exp(*SE, DL, "induction");
2425 
2426  // Count holds the overall loop count (N).
2427  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2428  L->getLoopPreheader()->getTerminator());
2429 
2430  if (TripCount->getType()->isPointerTy())
2431  TripCount =
2432  CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2433  L->getLoopPreheader()->getTerminator());
2434 
2435  return TripCount;
2436 }
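// Illustrative example (assuming a backedge-taken count of 99): the expanded
// trip count is 99 + 1 = 100, zero-extended to IdxTy (or truncated first when
// the backedge-taken count is wider than the widest induction type).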
2437 
2438 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2439  if (VectorTripCount)
2440  return VectorTripCount;
2441 
2442  Value *TC = getOrCreateTripCount(L);
2443  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2444 
2445  Type *Ty = TC->getType();
2446  Constant *Step = ConstantInt::get(Ty, VF * UF);
2447 
2448  // If the tail is to be folded by masking, round the number of iterations N
2449  // up to a multiple of Step instead of rounding down. This is done by first
2450  // adding Step-1 and then rounding down. Note that it's ok if this addition
2451  // overflows: the vector induction variable will eventually wrap to zero given
2452  // that it starts at zero and its Step is a power of two; the loop will then
2453  // exit, with the last early-exit vector comparison also producing all-true.
2454  if (Cost->foldTailByMasking()) {
2455  assert(isPowerOf2_32(VF * UF) &&
2456  "VF*UF must be a power of 2 when folding tail by masking");
2457  TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2458  }
2459 
2460  // Now we need to generate the expression for the part of the loop that the
2461  // vectorized body will execute. This is equal to N - (N % Step) if scalar
2462  // iterations are not required for correctness, or N - Step, otherwise. Step
2463  // is equal to the vectorization factor (number of SIMD elements) times the
2464  // unroll factor (number of SIMD instructions).
2465  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2466 
2467  // If there is a non-reversed interleaved group that may speculatively access
2468  // memory out-of-bounds, we need to ensure that there will be at least one
2469  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2470  // the trip count, we set the remainder to be equal to the step. If the step
2471  // does not evenly divide the trip count, no adjustment is necessary since
2472  // there will already be scalar iterations. Note that the minimum iterations
2473  // check ensures that N >= Step.
2474  if (VF > 1 && Cost->requiresScalarEpilogue()) {
2475  auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2476  R = Builder.CreateSelect(IsZero, Step, R);
2477  }
2478 
2479  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2480 
2481  return VectorTripCount;
2482 }
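// Illustrative example (assuming a trip count of 17, VF = 4 and UF = 2):
// Step is 8, so without tail folding R = 17 % 8 = 1 and n.vec = 16, leaving
// one scalar remainder iteration; with tail folding the count is first rounded
// up to 17 + 7 = 24, giving n.vec = 24 and no remainder. If a scalar epilogue
// is required and R would be 0, R is bumped to Step instead.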
2483 
2484 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2485  const DataLayout &DL) {
2486  // Verify that V is a vector type with same number of elements as DstVTy.
2487  unsigned VF = DstVTy->getNumElements();
2488  VectorType *SrcVecTy = cast<VectorType>(V->getType());
2489  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2490  Type *SrcElemTy = SrcVecTy->getElementType();
2491  Type *DstElemTy = DstVTy->getElementType();
2492  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2493  "Vector elements must have same size");
2494 
2495  // Do a direct cast if element types are castable.
2496  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2497  return Builder.CreateBitOrPointerCast(V, DstVTy);
2498  }
2499  // V cannot be directly casted to desired vector type.
2500  // May happen when V is a floating point vector but DstVTy is a vector of
2501  // pointers or vice-versa. Handle this using a two-step bitcast using an
2502  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2503  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2504  "Only one type should be a pointer type");
2505  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2506  "Only one type should be a floating point type");
2507  Type *IntTy =
2508  IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2509  VectorType *VecIntTy = VectorType::get(IntTy, VF);
2510  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2511  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2512 }
2513 
2514 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2515  BasicBlock *Bypass) {
2516  Value *Count = getOrCreateTripCount(L);
2517  BasicBlock *BB = L->getLoopPreheader();
2518  IRBuilder<> Builder(BB->getTerminator());
2519 
2520  // Generate code to check if the loop's trip count is less than VF * UF, or
2521  // equal to it in case a scalar epilogue is required; this implies that the
2522  // vector trip count is zero. This check also covers the case where adding one
2523  // to the backedge-taken count overflowed leading to an incorrect trip count
2524  // of zero. In this case we will also jump to the scalar loop.
2525  auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2526  : ICmpInst::ICMP_ULT;
2527 
2528  // If tail is to be folded, vector loop takes care of all iterations.
2529  Value *CheckMinIters = Builder.getFalse();
2530  if (!Cost->foldTailByMasking())
2531  CheckMinIters = Builder.CreateICmp(
2532  P, Count, ConstantInt::get(Count->getType(), VF * UF),
2533  "min.iters.check");
2534 
2535  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2536  // Update dominator tree immediately if the generated block is a
2537  // LoopBypassBlock because SCEV expansions to generate loop bypass
2538  // checks may query it before the current function is finished.
2539  DT->addNewBlock(NewBB, BB);
2540  if (L->getParentLoop())
2541  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2542  ReplaceInstWithInst(BB->getTerminator(),
2543  BranchInst::Create(Bypass, NewBB, CheckMinIters));
2544  LoopBypassBlocks.push_back(BB);
2545 }
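// Illustrative sketch (assuming VF = 4, UF = 2, an i64 trip count and no tail
// folding): the check emitted above is roughly
//   %min.iters.check = icmp ult i64 %count, 8
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
// using ule instead of ult when a scalar epilogue is required.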
2546 
2547 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2548  BasicBlock *BB = L->getLoopPreheader();
2549 
2550  // Generate the code to check that the SCEV assumptions that we made.
2551  // We want the new basic block to start at the first instruction in a
2552  // sequence of instructions that form a check.
2553  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2554  "scev.check");
2555  Value *SCEVCheck =
2556  Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2557 
2558  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2559  if (C->isZero())
2560  return;
2561 
2562  assert(!Cost->foldTailByMasking() &&
2563  "Cannot SCEV check stride or overflow when folding tail");
2564  // Create a new block containing the stride check.
2565  BB->setName("vector.scevcheck");
2566  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2567  // Update dominator tree immediately if the generated block is a
2568  // LoopBypassBlock because SCEV expansions to generate loop bypass
2569  // checks may query it before the current function is finished.
2570  DT->addNewBlock(NewBB, BB);
2571  if (L->getParentLoop())
2572  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2573  ReplaceInstWithInst(BB->getTerminator(),
2574  BranchInst::Create(Bypass, NewBB, SCEVCheck));
2575  LoopBypassBlocks.push_back(BB);
2576  AddedSafetyChecks = true;
2577 }
2578 
2579 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2580  // VPlan-native path does not do any analysis for runtime checks currently.
2581  if (EnableVPlanNativePath)
2582  return;
2583 
2584  BasicBlock *BB = L->getLoopPreheader();
2585 
2586  // Generate the code that checks in runtime if arrays overlap. We put the
2587  // checks into a separate block to make the more common case of few elements
2588  // faster.
2589  Instruction *FirstCheckInst;
2590  Instruction *MemRuntimeCheck;
2591  std::tie(FirstCheckInst, MemRuntimeCheck) =
2592  Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2593  if (!MemRuntimeCheck)
2594  return;
2595 
2596  assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail");
2597  // Create a new block containing the memory check.
2598  BB->setName("vector.memcheck");
2599  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2600  // Update dominator tree immediately if the generated block is a
2601  // LoopBypassBlock because SCEV expansions to generate loop bypass
2602  // checks may query it before the current function is finished.
2603  DT->addNewBlock(NewBB, BB);
2604  if (L->getParentLoop())
2605  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2606  ReplaceInstWithInst(BB->getTerminator(),
2607  BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2608  LoopBypassBlocks.push_back(BB);
2609  AddedSafetyChecks = true;
2610 
2611  // We currently don't use LoopVersioning for the actual loop cloning but we
2612  // still use it to add the noalias metadata.
2613  LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2614  PSE.getSE());
2615  LVer->prepareNoAliasMetadata();
2616 }
2617 
2618 Value *InnerLoopVectorizer::emitTransformedIndex(
2619  IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2620  const InductionDescriptor &ID) const {
2621 
2622  SCEVExpander Exp(*SE, DL, "induction");
2623  auto Step = ID.getStep();
2624  auto StartValue = ID.getStartValue();
2625  assert(Index->getType() == Step->getType() &&
2626  "Index type does not match StepValue type");
2627 
2628  // Note: the IR at this point is broken. We cannot use SE to create any new
2629  // SCEV and then expand it, hoping that SCEV's simplification will give us
2630  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2631  // lead to various SCEV crashes. So all we can do is to use builder and rely
2632  // on InstCombine for future simplifications. Here we handle some trivial
2633  // cases only.
2634  auto CreateAdd = [&B](Value *X, Value *Y) {
2635  assert(X->getType() == Y->getType() && "Types don't match!");
2636  if (auto *CX = dyn_cast<ConstantInt>(X))
2637  if (CX->isZero())
2638  return Y;
2639  if (auto *CY = dyn_cast<ConstantInt>(Y))
2640  if (CY->isZero())
2641  return X;
2642  return B.CreateAdd(X, Y);
2643  };
2644 
2645  auto CreateMul = [&B](Value *X, Value *Y) {
2646  assert(X->getType() == Y->getType() && "Types don't match!");
2647  if (auto *CX = dyn_cast<ConstantInt>(X))
2648  if (CX->isOne())
2649  return Y;
2650  if (auto *CY = dyn_cast<ConstantInt>(Y))
2651  if (CY->isOne())
2652  return X;
2653  return B.CreateMul(X, Y);
2654  };
2655 
2656  switch (ID.getKind()) {
2657  case InductionDescriptor::IK_IntInduction: {
2658  assert(Index->getType() == StartValue->getType() &&
2659  "Index type does not match StartValue type");
2660  if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2661  return B.CreateSub(StartValue, Index);
2662  auto *Offset = CreateMul(
2663  Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2664  return CreateAdd(StartValue, Offset);
2665  }
2666  case InductionDescriptor::IK_PtrInduction: {
2667  assert(isa<SCEVConstant>(Step) &&
2668  "Expected constant step for pointer induction");
2669  return B.CreateGEP(
2670  nullptr, StartValue,
2671  CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2672  &*B.GetInsertPoint())));
2673  }
2674  case InductionDescriptor::IK_FpInduction: {
2675  assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2676  auto InductionBinOp = ID.getInductionBinOp();
2677  assert(InductionBinOp &&
2678  (InductionBinOp->getOpcode() == Instruction::FAdd ||
2679  InductionBinOp->getOpcode() == Instruction::FSub) &&
2680  "Original bin op should be defined for FP induction");
2681 
2682  Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2683 
2684  // Floating point operations had to be 'fast' to enable the induction.
2685  FastMathFlags Flags;
2686  Flags.setFast();
2687 
2688  Value *MulExp = B.CreateFMul(StepValue, Index);
2689  if (isa<Instruction>(MulExp))
2690  // We have to check, the MulExp may be a constant.
2691  cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2692 
2693  Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2694  "induction");
2695  if (isa<Instruction>(BOp))
2696  cast<Instruction>(BOp)->setFastMathFlags(Flags);
2697 
2698  return BOp;
2699  }
2700  case InductionDescriptor::IK_NoInduction:
2701  return nullptr;
2702  }
2703  llvm_unreachable("invalid enum");
2704 }
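// Illustrative example (assumed values): for an integer induction with start
// %a and constant step 2, an index %i is transformed into %a + %i * 2 by the
// IK_IntInduction case above (or into %a - %i when the step is -1); a pointer
// induction instead becomes a GEP off the start value scaled by the step.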
2705 
2706 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2707  /*
2708  In this function we generate a new loop. The new loop will contain
2709  the vectorized instructions while the old loop will continue to run the
2710  scalar remainder.
2711 
2712  [ ] <-- loop iteration number check.
2713  / |
2714  / v
2715  | [ ] <-- vector loop bypass (may consist of multiple blocks).
2716  | / |
2717  | / v
2718  || [ ] <-- vector pre header.
2719  |/ |
2720  | v
2721  | [ ] \
2722  | [ ]_| <-- vector loop.
2723  | |
2724  | v
2725  | -[ ] <--- middle-block.
2726  | / |
2727  | / v
2728  -|- >[ ] <--- new preheader.
2729  | |
2730  | v
2731  | [ ] \
2732  | [ ]_| <-- old scalar loop to handle remainder.
2733  \ |
2734  \ v
2735  >[ ] <-- exit block.
2736  ...
2737  */
2738 
2739  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2740  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2741  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2742  assert(VectorPH && "Invalid loop structure");
2743  assert(ExitBlock && "Must have an exit block");
2744 
2745  // Some loops have a single integer induction variable, while other loops
2746  // don't. One example is c++ iterators that often have multiple pointer
2747  // induction variables. In the code below we also support a case where we
2748  // don't have a single induction variable.
2749  //
2750  // We try to obtain an induction variable from the original loop as hard
2751  // as possible. However if we don't find one that:
2752  // - is an integer
2753  // - counts from zero, stepping by one
2754  // - is the size of the widest induction variable type
2755  // then we create a new one.
2756  OldInduction = Legal->getPrimaryInduction();
2757  Type *IdxTy = Legal->getWidestInductionType();
2758 
2759  // Split the single block loop into the two loop structure described above.
2760  BasicBlock *VecBody =
2761  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2762  BasicBlock *MiddleBlock =
2763  VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2764  BasicBlock *ScalarPH =
2765  MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2766 
2767  // Create and register the new vector loop.
2768  Loop *Lp = LI->AllocateLoop();
2769  Loop *ParentLoop = OrigLoop->getParentLoop();
2770 
2771  // Insert the new loop into the loop nest and register the new basic blocks
2772  // before calling any utilities such as SCEV that require valid LoopInfo.
2773  if (ParentLoop) {
2774  ParentLoop->addChildLoop(Lp);
2775  ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2776  ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2777  } else {
2778  LI->addTopLevelLoop(Lp);
2779  }
2780  Lp->addBasicBlockToLoop(VecBody, *LI);
2781 
2782  // Find the loop boundaries.
2783  Value *Count = getOrCreateTripCount(Lp);
2784 
2785  Value *StartIdx = ConstantInt::get(IdxTy, 0);
2786 
2787  // Now, compare the new count to zero. If it is zero skip the vector loop and
2788  // jump to the scalar loop. This check also covers the case where the
2789  // backedge-taken count is uint##_max: adding one to it will overflow leading
2790  // to an incorrect trip count of zero. In this (rare) case we will also jump
2791  // to the scalar loop.
2792  emitMinimumIterationCountCheck(Lp, ScalarPH);
2793 
2794  // Generate the code to check any assumptions that we've made for SCEV
2795  // expressions.
2796  emitSCEVChecks(Lp, ScalarPH);
2797 
2798  // Generate the code that checks in runtime if arrays overlap. We put the
2799  // checks into a separate block to make the more common case of few elements
2800  // faster.
2801  emitMemRuntimeChecks(Lp, ScalarPH);
2802 
2803  // Generate the induction variable.
2804  // The loop step is equal to the vectorization factor (num of SIMD elements)
2805  // times the unroll factor (num of SIMD instructions).
2806  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2807  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2808  Induction =
2809  createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2810  getDebugLocFromInstOrOperands(OldInduction));
2811 
2812  // We are going to resume the execution of the scalar loop.
2813  // Go over all of the induction variables that we found and fix the
2814  // PHIs that are left in the scalar version of the loop.
2815  // The starting values of PHI nodes depend on the counter of the last
2816  // iteration in the vectorized loop.
2817  // If we come from a bypass edge then we need to start from the original
2818  // start value.
2819 
2820  // This variable saves the new starting index for the scalar loop. It is used
2821  // to test if there are any tail iterations left once the vector loop has
2822  // completed.
2823  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
2824  for (auto &InductionEntry : *List) {
2825  PHINode *OrigPhi = InductionEntry.first;
2826  InductionDescriptor II = InductionEntry.second;
2827 
2828  // Create phi nodes to merge from the backedge-taken check block.
2829  PHINode *BCResumeVal = PHINode::Create(
2830  OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
2831  // Copy original phi DL over to the new one.
2832  BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2833  Value *&EndValue = IVEndValues[OrigPhi];
2834  if (OrigPhi == OldInduction) {
2835  // We know what the end value is.
2836  EndValue = CountRoundDown;
2837  } else {
2838  IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
2839  Type *StepType = II.getStep()->getType();
2840  Instruction::CastOps CastOp =
2841  CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
2842  Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
2843  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2844  EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
2845  EndValue->setName("ind.end");
2846  }
2847 
2848  // The new PHI merges the original incoming value, in case of a bypass,
2849  // or the value at the end of the vectorized loop.
2850  BCResumeVal->addIncoming(EndValue, MiddleBlock);
2851 
2852  // Fix the scalar body counter (PHI node).
2853  unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
2854 
2855  // The old induction's phi node in the scalar body needs the truncated
2856  // value.
2857  for (BasicBlock *BB : LoopBypassBlocks)
2858  BCResumeVal->addIncoming(II.getStartValue(), BB);
2859  OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
2860  }
2861 
2862  // Add a check in the middle block to see if we have completed
2863  // all of the iterations in the first vector loop.
2864  // If (N - N%VF) == N, then we *don't* need to run the remainder.
2865  // If tail is to be folded, we know we don't need to run the remainder.
2866  Value *CmpN = Builder.getTrue();
2867  if (!Cost->foldTailByMasking())
2868  CmpN =
2869  CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
2870  CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
2871  ReplaceInstWithInst(MiddleBlock->getTerminator(),
2872  BranchInst::Create(ExitBlock, ScalarPH, CmpN));
2873 
2874  // Get ready to start creating new instructions into the vectorized body.
2875  Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
2876 
2877  // Save the state.
2878  LoopVectorPreHeader = Lp->getLoopPreheader();
2879  LoopScalarPreHeader = ScalarPH;
2880  LoopMiddleBlock = MiddleBlock;
2881  LoopExitBlock = ExitBlock;
2882  LoopVectorBody = VecBody;
2883  LoopScalarBody = OldBasicBlock;
2884 
2885  // Keep all loop hints from the original loop on the vector loop (we'll
2886  // replace the vectorizer-specific hints below).
2887  if (MDNode *LID = OrigLoop->getLoopID())
2888  Lp->setLoopID(LID);
2889 
2890  LoopVectorizeHints Hints(Lp, true, *ORE);
2891  Hints.setAlreadyVectorized();
2892 
2893  return LoopVectorPreHeader;
2894 }
2895 
2896 // Fix up external users of the induction variable. At this point, we are
2897 // in LCSSA form, with all external PHIs that use the IV having one input value,
2898 // coming from the remainder loop. We need those PHIs to also have a correct
2899 // value for the IV when arriving directly from the middle block.
2900 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
2901  const InductionDescriptor &II,
2902  Value *CountRoundDown, Value *EndValue,
2903  BasicBlock *MiddleBlock) {
2904  // There are two kinds of external IV usages - those that use the value
2905  // computed in the last iteration (the PHI) and those that use the penultimate
2906  // value (the value that feeds into the phi from the loop latch).
2907  // We allow both, but they, obviously, have different values.
2908 
2909  assert(OrigLoop->getExitBlock() && "Expected a single exit block");
2910 
2911  DenseMap<Value *, Value *> MissingVals;
2912 
2913  // An external user of the last iteration's value should see the value that
2914  // the remainder loop uses to initialize its own IV.
2915  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
2916  for (User *U : PostInc->users()) {
2917  Instruction *UI = cast<Instruction>(U);
2918  if (!OrigLoop->contains(UI)) {
2919  assert(isa<PHINode>(UI) && "Expected LCSSA form");
2920  MissingVals[UI] = EndValue;
2921  }
2922  }
2923 
2924  // An external user of the penultimate value needs to see EndValue - Step.
2925  // The simplest way to get this is to recompute it from the constituent SCEVs,
2926  // that is Start + (Step * (CRD - 1)).
2927  for (User *U : OrigPhi->users()) {
2928  auto *UI = cast<Instruction>(U);
2929  if (!OrigLoop->contains(UI)) {
2930  const DataLayout &DL =
2931  OrigLoop->getHeader()->getModule()->getDataLayout();
2932  assert(isa<PHINode>(UI) && "Expected LCSSA form");
2933 
2934  IRBuilder<> B(MiddleBlock->getTerminator());
2935  Value *CountMinusOne = B.CreateSub(
2936  CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
2937  Value *CMO =
2938  !II.getStep()->getType()->isIntegerTy()
2939  ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
2940  II.getStep()->getType())
2941  : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
2942  CMO->setName("cast.cmo");
2943  Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
2944  Escape->setName("ind.escape");
2945  MissingVals[UI] = Escape;
2946  }
2947  }
2948 
2949  for (auto &I : MissingVals) {
2950  PHINode *PHI = cast<PHINode>(I.first);
2951  // One corner case we have to handle is two IVs "chasing" each-other,
2952  // that is %IV2 = phi [...], [ %IV1, %latch ]
2953  // In this case, if IV1 has an external use, we need to avoid adding both
2954  // "last value of IV1" and "penultimate value of IV2". So, verify that we
2955  // don't already have an incoming value for the middle block.
2956  if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
2957  PHI->addIncoming(I.second, MiddleBlock);
2958  }
2959 }
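// Illustrative sketch (not part of the pass): for an induction with start S
// and step St, an LCSSA phi that uses the post-increment value receives
// EndValue (the value the remainder loop would start from), while an LCSSA
// phi that uses the header phi itself receives the recomputed value
//   ind.escape = S + St * (CountRoundDown - 1)
// via the extra incoming edge from the middle block added above.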
2960 
2961 namespace {
2962 
2963 struct CSEDenseMapInfo {
2964  static bool canHandle(const Instruction *I) {
2965  return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2966  isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2967  }
2968 
2969  static inline Instruction *getEmptyKey() {
2970  return DenseMapInfo<Instruction *>::getEmptyKey();
2971  }
2972 
2973  static inline Instruction *getTombstoneKey() {
2974  return DenseMapInfo<Instruction *>::getTombstoneKey();
2975  }
2976 
2977  static unsigned getHashValue(const Instruction *I) {
2978  assert(canHandle(I) && "Unknown instruction!");
2979  return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2980  I->value_op_end()));
2981  }
2982 
2983  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2984  if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2985  LHS == getTombstoneKey() || RHS == getTombstoneKey())
2986  return LHS == RHS;
2987  return LHS->isIdenticalTo(RHS);
2988  }
2989 };
2990 
2991 } // end anonymous namespace
2992 
2993 /// Perform CSE of induction variable instructions.
2994 static void cse(BasicBlock *BB) {
2995  // Perform simple cse.
2996  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2997  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
2998  Instruction *In = &*I++;
2999 
3000  if (!CSEDenseMapInfo::canHandle(In))
3001  continue;
3002 
3003  // Check if we can replace this instruction with any of the
3004  // visited instructions.
3005  if (Instruction *V = CSEMap.lookup(In)) {
3006  In->replaceAllUsesWith(V);
3007  In->eraseFromParent();
3008  continue;
3009  }
3010 
3011  CSEMap[In] = In;
3012  }
3013 }
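// Illustrative sketch (not part of the pass; names are invented): after
// widening, the vector body frequently contains identical address
// computations for different unroll parts, e.g.
//
//   %g0 = getelementptr inbounds i32, i32* %p, i64 %idx
//   %g1 = getelementptr inbounds i32, i32* %p, i64 %idx   ; identical to %g0
//
// cse() maps each handled instruction to the first identical one seen and
// rewrites later duplicates, so uses of %g1 become uses of %g0.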
3014 
3015 /// Estimate the overhead of scalarizing an instruction. This is a
3016 /// convenience wrapper for the type-based getScalarizationOverhead API.
3017 static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
3018  const TargetTransformInfo &TTI) {
3019  if (VF == 1)
3020  return 0;
3021 
3022  unsigned Cost = 0;
3023  Type *RetTy = ToVectorTy(I->getType(), VF);
3024  if (!RetTy->isVoidTy() &&
3025  (!isa<LoadInst>(I) ||
3026  !TTI.supportsEfficientVectorElementLoadStore()))
3027  Cost += TTI.getScalarizationOverhead(RetTy, true, false);
3028 
3029  // Some targets keep addresses scalar.
3030  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
3031  return Cost;
3032 
3033  if (CallInst *CI = dyn_cast<CallInst>(I)) {
3034  SmallVector<const Value *, 4> Operands(CI->arg_operands());
3035  Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
3036  }
3037  else if (!isa<StoreInst>(I) ||
3038  !TTI.supportsEfficientVectorElementLoadStore()) {
3039  SmallVector<const Value *, 4> Operands(I->operand_values());
3040  Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
3041  }
3042 
3043  return Cost;
3044 }
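// Illustrative sketch (not part of the pass): for a scalarized instruction
// with a vector result and vector operands, the estimate above is roughly
//
//   Cost ~= insert-cost(RetTy)              // packing VF scalar results
//         + extract-cost(operand vectors)   // unpacking VF scalar inputs
//
// with the TTI hooks supplying the per-type numbers; loads on targets that
// prefer scalar addressing skip the operand part entirely.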
3045 
3046 // Estimate cost of a call instruction CI if it were vectorized with factor VF.
3047 // Return the cost of the instruction, including scalarization overhead if it's
3048 // needed. The flag NeedToScalarize shows if the call needs to be scalarized -
3049 // i.e. either vector version isn't available, or is too expensive.
3050 static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
3051  const TargetTransformInfo &TTI,
3052  const TargetLibraryInfo *TLI,
3053  bool &NeedToScalarize) {
3054  Function *F = CI->getCalledFunction();
3055  StringRef FnName = CI->getCalledFunction()->getName();
3056  Type *ScalarRetTy = CI->getType();
3057  SmallVector<Type *, 4> Tys, ScalarTys;
3058  for (auto &ArgOp : CI->arg_operands())
3059  ScalarTys.push_back(ArgOp->getType());
3060 
3061  // Estimate cost of scalarized vector call. The source operands are assumed
3062  // to be vectors, so we need to extract individual elements from there,
3063  // execute VF scalar calls, and then gather the result into the vector return
3064  // value.
3065  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3066  if (VF == 1)
3067  return ScalarCallCost;
3068 
3069  // Compute corresponding vector type for return value and arguments.
3070  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3071  for (Type *ScalarTy : ScalarTys)
3072  Tys.push_back(ToVectorTy(ScalarTy, VF));
3073 
3074  // Compute costs of unpacking argument values for the scalar calls and
3075  // packing the return values to a vector.
3076  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
3077 
3078  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3079 
3080  // If we can't emit a vector call for this function, then the currently found
3081  // cost is the cost we need to return.
3082  NeedToScalarize = true;
3083  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3084  return Cost;
3085 
3086  // If the corresponding vector cost is cheaper, return its cost.
3087  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3088  if (VectorCallCost < Cost) {
3089  NeedToScalarize = false;
3090  return VectorCallCost;
3091  }
3092  return Cost;
3093 }
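// Illustrative sketch (not part of the pass): with VF = 4 and a scalar call
// cost C_s, the scalarized estimate computed above is
//
//   Cost = C_s * 4 + (cost of extracting four sets of arguments
//                     + inserting four results)
//
// and it is only replaced by the vector-library cost C_v (with
// NeedToScalarize = false) when TLI provides a vectorized variant and
// C_v < Cost.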
3094 
3095 // Estimate cost of an intrinsic call instruction CI if it were vectorized with
3096 // factor VF. Return the cost of the instruction, including scalarization
3097 // overhead if it's needed.
3098 static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
3099  const TargetTransformInfo &TTI,
3100  const TargetLibraryInfo *TLI) {
3101  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3102  assert(ID && "Expected intrinsic call!");
3103 
3104  FastMathFlags FMF;
3105  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3106  FMF = FPMO->getFastMathFlags();
3107 
3108  SmallVector<Value *, 4> Operands(CI->arg_operands());
3109  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3110 }
3111 
3112 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3113  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3114  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3115  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3116 }
3117 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3118  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3119  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3120  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3121 }
3122 
3123 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3124  // For every instruction `I` in MinBWs, truncate the operands, create a
3125  // truncated version of `I` and reextend its result. InstCombine runs
3126  // later and will remove any ext/trunc pairs.
3127  SmallPtrSet<Value *, 4> Erased;
3128  for (const auto &KV : Cost->getMinimalBitwidths()) {
3129  // If the value wasn't vectorized, we must maintain the original scalar
3130  // type. The absence of the value from VectorLoopValueMap indicates that it
3131  // wasn't vectorized.
3132  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3133  continue;
3134  for (unsigned Part = 0; Part < UF; ++Part) {
3135  Value *I = getOrCreateVectorValue(KV.first, Part);
3136  if (Erased.find(I) != Erased.end() || I->use_empty() ||
3137  !isa<Instruction>(I))
3138  continue;
3139  Type *OriginalTy = I->getType();
3140  Type *ScalarTruncatedTy =
3141  IntegerType::get(OriginalTy->getContext(), KV.second);
3142  Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3143  OriginalTy->getVectorNumElements());
3144  if (TruncatedTy == OriginalTy)
3145  continue;
3146 
3147  IRBuilder<> B(cast<Instruction>(I));
3148  auto ShrinkOperand = [&](Value *V) -> Value * {
3149  if (auto *ZI = dyn_cast<ZExtInst>(V))
3150  if (ZI->getSrcTy() == TruncatedTy)
3151  return ZI->getOperand(0);
3152  return B.CreateZExtOrTrunc(V, TruncatedTy);
3153  };
3154 
3155  // The actual instruction modification depends on the instruction type,
3156  // unfortunately.
3157  Value *NewI = nullptr;
3158  if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3159  NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3160  ShrinkOperand(BO->getOperand(1)));
3161 
3162  // Any wrapping introduced by shrinking this operation shouldn't be
3163  // considered undefined behavior. So, we can't unconditionally copy
3164  // arithmetic wrapping flags to NewI.
3165  cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3166  } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3167  NewI =
3168  B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3169  ShrinkOperand(CI->getOperand(1)));
3170  } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3171  NewI = B.CreateSelect(SI->getCondition(),
3172  ShrinkOperand(SI->getTrueValue()),
3173  ShrinkOperand(SI->getFalseValue()));
3174  } else if (auto *CI = dyn_cast<CastInst>(I)) {
3175  switch (CI->getOpcode()) {
3176  default:
3177  llvm_unreachable("Unhandled cast!");
3178  case Instruction::Trunc:
3179  NewI = ShrinkOperand(CI->getOperand(0));
3180  break;
3181  case Instruction::SExt:
3182  NewI = B.CreateSExtOrTrunc(
3183  CI->getOperand(0),
3184  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3185  break;
3186  case Instruction::ZExt:
3187  NewI = B.CreateZExtOrTrunc(
3188  CI->getOperand(0),
3189  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3190  break;
3191  }
3192  } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3193  auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3194  auto *O0 = B.CreateZExtOrTrunc(
3195  SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3196  auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3197  auto *O1 = B.CreateZExtOrTrunc(
3198  SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3199 
3200  NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3201  } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3202  // Don't do anything with the operands, just extend the result.
3203  continue;
3204  } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3205  auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3206  auto *O0 = B.CreateZExtOrTrunc(
3207  IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3208  auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3209  NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3210  } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3211  auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3212  auto *O0 = B.CreateZExtOrTrunc(
3213  EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3214  NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3215  } else {
3216  // If we don't know what to do, be conservative and don't do anything.
3217  continue;
3218  }
3219 
3220  // Lastly, extend the result.
3221  NewI->takeName(cast<Instruction>(I));
3222  Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3223  I->replaceAllUsesWith(Res);
3224  cast<Instruction>(I)->eraseFromParent();
3225  Erased.insert(I);
3226  VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3227  }
3228  }
3229 
3230  // We'll have created a bunch of ZExts that are now parentless. Clean up.
3231  for (const auto &KV : Cost->getMinimalBitwidths()) {
3232  // If the value wasn't vectorized, we must maintain the original scalar
3233  // type. The absence of the value from VectorLoopValueMap indicates that it
3234  // wasn't vectorized.
3235  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3236  continue;
3237  for (unsigned Part = 0; Part < UF; ++Part) {
3238  Value *I = getOrCreateVectorValue(KV.first, Part);
3239  ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3240  if (Inst && Inst->use_empty()) {
3241  Value *NewI = Inst->getOperand(0);
3242  Inst->eraseFromParent();
3243  VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3244  }
3245  }
3246  }
3247 }
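// Illustrative sketch (not part of the pass; names are invented): if the cost
// model proved an i32 add only needs 8 bits, a part such as
//
//   %a = add <4 x i32> %x, %y
//
// is rewritten above to
//
//   %x.tr = trunc <4 x i32> %x to <4 x i8>
//   %y.tr = trunc <4 x i32> %y to <4 x i8>
//   %a.tr = add <4 x i8> %x.tr, %y.tr
//   %a    = zext <4 x i8> %a.tr to <4 x i32>
//
// leaving InstCombine to fold away redundant trunc/ext pairs later.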
3248 
3249 void InnerLoopVectorizer::fixVectorizedLoop() {
3250  // Insert truncates and extends for any truncated instructions as hints to
3251  // InstCombine.
3252  if (VF > 1)
3253  truncateToMinimalBitwidths();
3254 
3255  // Fix widened non-induction PHIs by setting up the PHI operands.
3256  if (OrigPHIsToFix.size()) {
3257  assert(EnableVPlanNativePath &&
3258  "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3259  fixNonInductionPHIs();
3260  }
3261 
3262  // At this point every instruction in the original loop is widened to a
3263  // vector form. Now we need to fix the recurrences in the loop. These PHI
3264  // nodes are currently empty because we did not want to introduce cycles.
3265  // This is the second stage of vectorizing recurrences.
3266  fixCrossIterationPHIs();
3267 
3268  // Update the dominator tree.
3269  //
3270  // FIXME: After creating the structure of the new loop, the dominator tree is
3271  // no longer up-to-date, and it remains that way until we update it
3272  // here. An out-of-date dominator tree is problematic for SCEV,
3273  // because SCEVExpander uses it to guide code generation. The
3274  // vectorizer uses SCEVExpanders in several places. Instead, we should
3275  // keep the dominator tree up-to-date as we go.
3276  updateAnalysis();
3277 
3278  // Fix-up external users of the induction variables.
3279  for (auto &Entry : *Legal->getInductionVars())
3280  fixupIVUsers(Entry.first, Entry.second,
3282  IVEndValues[Entry.first], LoopMiddleBlock);
3283 
3284  fixLCSSAPHIs();
3285  for (Instruction *PI : PredicatedInstructions)
3286  sinkScalarOperands(&*PI);
3287 
3288  // Remove redundant induction instructions.
3289  cse(LoopVectorBody);
3290 }
3291 
3292 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3293  // In order to support recurrences we need to be able to vectorize Phi nodes.
3294  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3295  // stage #2: We now need to fix the recurrences by adding incoming edges to
3296  // the currently empty PHI nodes. At this point every instruction in the
3297  // original loop is widened to a vector form so we can use them to construct
3298  // the incoming edges.
3299  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3300  // Handle first-order recurrences and reductions that need to be fixed.
3301  if (Legal->isFirstOrderRecurrence(&Phi))
3302  fixFirstOrderRecurrence(&Phi);
3303  else if (Legal->isReductionVariable(&Phi))
3304  fixReduction(&Phi);
3305  }
3306 }
3307 
3308 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3309  // This is the second phase of vectorizing first-order recurrences. An
3310  // overview of the transformation is described below. Suppose we have the
3311  // following loop.
3312  //
3313  // for (int i = 0; i < n; ++i)
3314  // b[i] = a[i] - a[i - 1];
3315  //
3316  // There is a first-order recurrence on "a". For this loop, the shorthand
3317  // scalar IR looks like:
3318  //
3319  // scalar.ph:
3320  // s_init = a[-1]
3321  // br scalar.body
3322  //
3323  // scalar.body:
3324  // i = phi [0, scalar.ph], [i+1, scalar.body]
3325  // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3326  // s2 = a[i]
3327  // b[i] = s2 - s1
3328  // br cond, scalar.body, ...
3329  //
3330  // In this example, s1 is a recurrence because its value depends on the
3331  // previous iteration. In the first phase of vectorization, we created a
3332  // temporary value for s1. We now complete the vectorization and produce the
3333  // shorthand vector IR shown below (for VF = 4, UF = 1).
3334  //
3335  // vector.ph:
3336  // v_init = vector(..., ..., ..., a[-1])
3337  // br vector.body
3338  //
3339  // vector.body
3340  // i = phi [0, vector.ph], [i+4, vector.body]
3341  // v1 = phi [v_init, vector.ph], [v2, vector.body]
3342  // v2 = a[i, i+1, i+2, i+3];
3343  // v3 = vector(v1(3), v2(0, 1, 2))
3344  // b[i, i+1, i+2, i+3] = v2 - v3
3345  // br cond, vector.body, middle.block
3346  //
3347  // middle.block:
3348  // x = v2(3)
3349  // br scalar.ph
3350  //
3351  // scalar.ph:
3352  // s_init = phi [x, middle.block], [a[-1], otherwise]
3353  // br scalar.body
3354  //
3355  // After execution completes the vector loop, we extract the next value of
3356  // the recurrence (x) to use as the initial value in the scalar loop.
3357 
3358  // Get the original loop preheader and single loop latch.
3359  auto *Preheader = OrigLoop->getLoopPreheader();
3360  auto *Latch = OrigLoop->getLoopLatch();
3361 
3362  // Get the initial and previous values of the scalar recurrence.
3363  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3364  auto *Previous = Phi->getIncomingValueForBlock(Latch);
3365 
3366  // Create a vector from the initial value.
3367  auto *VectorInit = ScalarInit;
3368  if (VF > 1) {
3370  VectorInit = Builder.CreateInsertElement(
3371  UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3372  Builder.getInt32(VF - 1), "vector.recur.init");
3373  }
3374 
3375  // We constructed a temporary phi node in the first phase of vectorization.
3376  // This phi node will eventually be deleted.
3377  Builder.SetInsertPoint(
3378  cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3379 
3380  // Create a phi node for the new recurrence. The current value will either be
3381  // the initial value inserted into a vector or loop-varying vector value.
3382  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3383  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3384 
3385  // Get the vectorized previous value of the last part UF - 1. It appears last
3386  // among all unrolled iterations, due to the order of their construction.
3387  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3388 
3389  // Set the insertion point after the previous value if it is an instruction.
3390  // Note that the previous value may have been constant-folded so it is not
3391  // guaranteed to be an instruction in the vector loop. Also, if the previous
3392  // value is a phi node, we should insert after all the phi nodes to avoid
3393  // breaking basic block verification.
3394  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3395  isa<PHINode>(PreviousLastPart))
3396  Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3397  else
3398  Builder.SetInsertPoint(
3399  &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3400 
3401  // We will construct a vector for the recurrence by combining the values for
3402  // the current and previous iterations. This is the required shuffle mask.
3403  SmallVector<Constant *, 8> ShuffleMask(VF);
3404  ShuffleMask[0] = Builder.getInt32(VF - 1);
3405  for (unsigned I = 1; I < VF; ++I)
3406  ShuffleMask[I] = Builder.getInt32(I + VF - 1);
3407 
3408  // The vector from which to take the initial value for the current iteration
3409  // (actual or unrolled). Initially, this is the vector phi node.
3410  Value *Incoming = VecPhi;
3411 
3412  // Shuffle the current and previous vector and update the vector parts.
3413  for (unsigned Part = 0; Part < UF; ++Part) {
3414  Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3415  Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3416  auto *Shuffle =
3417  VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3418  ConstantVector::get(ShuffleMask))
3419  : Incoming;
3420  PhiPart->replaceAllUsesWith(Shuffle);
3421  cast<Instruction>(PhiPart)->eraseFromParent();
3422  VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3423  Incoming = PreviousPart;
3424  }
3425 
3426  // Fix the latch value of the new recurrence in the vector loop.
3427  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3428 
3429  // Extract the last vector element in the middle block. This will be the
3430  // initial value for the recurrence when jumping to the scalar loop.
3431  auto *ExtractForScalar = Incoming;
3432  if (VF > 1) {
3434  ExtractForScalar = Builder.CreateExtractElement(
3435  ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3436  }
3437  // Extract the second last element in the middle block if the
3438  // Phi is used outside the loop. We need to extract the phi itself
3439  // and not the last element (the phi update in the current iteration). This
3440  // will be the value when jumping to the exit block from the LoopMiddleBlock,
3441  // when the scalar loop is not run at all.
3442  Value *ExtractForPhiUsedOutsideLoop = nullptr;
3443  if (VF > 1)
3444  ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3445  Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3446  // When the loop is unrolled without vectorizing, initialize
3447  // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to that of
3448  // `Incoming`. This is analogous to the vectorized case above: extracting the
3449  // second last element when VF > 1.
3450  else if (UF > 1)
3451  ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3452 
3453  // Fix the initial value of the original recurrence in the scalar loop.
3455  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3456  for (auto *BB : predecessors(LoopScalarPreHeader)) {
3457  auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3458  Start->addIncoming(Incoming, BB);
3459  }
3460 
3461  Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start);
3462  Phi->setName("scalar.recur");
3463 
3464  // Finally, fix users of the recurrence outside the loop. The users will need
3465  // either the last value of the scalar recurrence or the last value of the
3466  // vector recurrence we extracted in the middle block. Since the loop is in
3467  // LCSSA form, we just need to find all the phi nodes for the original scalar
3468  // recurrence in the exit block, and then add an edge for the middle block.
3469  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3470  if (LCSSAPhi.getIncomingValue(0) == Phi) {
3471  LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3472  }
3473  }
3474 }
3475 
3476 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3477  Constant *Zero = Builder.getInt32(0);
3478 
3479  // Get its reduction variable descriptor.
3480  assert(Legal->isReductionVariable(Phi) &&
3481  "Unable to find the reduction variable");
3482  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3483 
3484  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3485  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3486  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3487  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3488  RdxDesc.getMinMaxRecurrenceKind();
3489  setDebugLocFromInst(Builder, ReductionStartValue);
3490 
3491  // We need to generate a reduction vector from the incoming scalar.
3492  // To do so, we need to generate the 'identity' vector and override
3493  // one of the elements with the incoming scalar reduction. We need
3494  // to do it in the vector-loop preheader.
3496 
3497  // This is the vector-clone of the value that leaves the loop.
3498  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3499 
3500  // Find the reduction identity value: zero for addition, or, and xor;
3501  // one for multiplication; -1 for and.
3502  Value *Identity;
3503  Value *VectorStart;
3504  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3505  RK == RecurrenceDescriptor::RK_FloatMinMax) {
3506  // MinMax reductions have the start value as their identity.
3507  if (VF == 1) {
3508  VectorStart = Identity = ReductionStartValue;
3509  } else {
3510  VectorStart = Identity =
3511  Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3512  }
3513  } else {
3514  // Handle other reduction kinds:
3515  Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3516  RK, VecTy->getScalarType());
3517  if (VF == 1) {
3518  Identity = Iden;
3519  // This vector is the Identity vector where the first element is the
3520  // incoming scalar reduction.
3521  VectorStart = ReductionStartValue;
3522  } else {
3523  Identity = ConstantVector::getSplat(VF, Iden);
3524 
3525  // This vector is the Identity vector where the first element is the
3526  // incoming scalar reduction.
3527  VectorStart =
3528  Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3529  }
3530  }
3531 
3532  // Fix the vector-loop phi.
3533 
3534  // Reductions do not have to start at zero. They can start with
3535  // any loop invariant values.
3536  BasicBlock *Latch = OrigLoop->getLoopLatch();
3537  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3538  for (unsigned Part = 0; Part < UF; ++Part) {
3539  Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3540  Value *Val = getOrCreateVectorValue(LoopVal, Part);
3541  // Make sure to add the reduction start value only to the
3542  // first unroll part.
3543  Value *StartVal = (Part == 0) ? VectorStart : Identity;
3544  cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3545  cast<PHINode>(VecRdxPhi)
3546  ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3547  }
3548 
3549  // Before each round, move the insertion point right between
3550  // the PHIs and the values we are going to write.
3551  // This allows us to write both PHINodes and the extractelement
3552  // instructions.
3554 
3555  setDebugLocFromInst(Builder, LoopExitInst);
3556 
3557  // If the vector reduction can be performed in a smaller type, we truncate
3558  // then extend the loop exit value to enable InstCombine to evaluate the
3559  // entire expression in the smaller type.
3560  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3561  Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3564  VectorParts RdxParts(UF);
3565  for (unsigned Part = 0; Part < UF; ++Part) {
3566  RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3567  Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3568  Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3569  : Builder.CreateZExt(Trunc, VecTy);
3570  for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3571  UI != RdxParts[Part]->user_end();)
3572  if (*UI != Trunc) {
3573  (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3574  RdxParts[Part] = Extnd;
3575  } else {
3576  ++UI;
3577  }
3578  }
3580  for (unsigned Part = 0; Part < UF; ++Part) {
3581  RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3582  VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3583  }
3584  }
3585 
3586  // Reduce all of the unrolled parts into a single vector.
3587  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3588  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3589  setDebugLocFromInst(Builder, ReducedPartRdx);
3590  for (unsigned Part = 1; Part < UF; ++Part) {
3591  Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3592  if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3593  // Floating point operations had to be 'fast' to enable the reduction.
3594  ReducedPartRdx = addFastMathFlag(
3596  ReducedPartRdx, "bin.rdx"));
3597  else
3598  ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3599  RdxPart);
3600  }
3601 
3602  if (VF > 1) {
3603  bool NoNaN = Legal->hasFunNoNaNAttr();
3604  ReducedPartRdx =
3605  createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3606  // If the reduction can be performed in a smaller type, we need to extend
3607  // the reduction to the wider type before we branch to the original loop.
3608  if (Phi->getType() != RdxDesc.getRecurrenceType())
3609  ReducedPartRdx =
3610  RdxDesc.isSigned()
3611  ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3612  : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3613  }
3614 
3615  // Create a phi node that merges control-flow from the backedge-taken check
3616  // block and the middle block.
3617  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3618  LoopScalarPreHeader->getTerminator());
3619  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3620  BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3621  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3622 
3623  // Now, we need to fix the users of the reduction variable
3624  // inside and outside of the scalar remainder loop.
3625  // We know that the loop is in LCSSA form. We need to update the
3626  // PHI nodes in the exit blocks.
3627  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3628  // All PHINodes need to have a single entry edge, or two if
3629  // we already fixed them.
3630  assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3631 
3632  // We found a reduction value exit-PHI. Update it with the
3633  // incoming bypass edge.
3634  if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3635  LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3636  } // end of the LCSSA phi scan.
3637 
3638  // Fix the scalar loop reduction variable with the incoming reduction sum
3639  // from the vector body and from the backedge value.
3640  int IncomingEdgeBlockIdx =
3641  Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3642  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3643  // Pick the other block.
3644  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3645  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3646  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3647 }
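// Illustrative sketch (not part of the pass; block and value names are
// rough): for "s += a[i]" with VF = 4 and UF = 1, the fix-up above produces
// approximately
//
//   vector.ph:     %ident = <s_init, 0, 0, 0>        ; identity with start
//   vector.body:   %rdx = phi [ %ident, vector.ph ], [ %rdx.next, ... ]
//   middle.block:  %r = horizontal add of %rdx.next  ; target reduction
//   scalar.ph:     %bc.merge.rdx = phi [ %r, middle.block ], [ s_init, ... ]
//
// and the scalar loop's reduction phi is rewired to start from
// %bc.merge.rdx.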
3648 
3649 void InnerLoopVectorizer::fixLCSSAPHIs() {
3650  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3651  if (LCSSAPhi.getNumIncomingValues() == 1) {
3652  auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3653  // Non-instruction incoming values will have only one value.
3654  unsigned LastLane = 0;
3655  if (isa<Instruction>(IncomingValue))
3656  LastLane = Cost->isUniformAfterVectorization(
3657  cast<Instruction>(IncomingValue), VF)
3658  ? 0
3659  : VF - 1;
3660  // Can be a loop invariant incoming value or the last scalar value to be
3661  // extracted from the vectorized loop.
3663  Value *lastIncomingValue =
3664  getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3665  LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3666  }
3667  }
3668 }
3669 
3670 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3671  // The basic block and loop containing the predicated instruction.
3672  auto *PredBB = PredInst->getParent();
3673  auto *VectorLoop = LI->getLoopFor(PredBB);
3674 
3675  // Initialize a worklist with the operands of the predicated instruction.
3676  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3677 
3678  // Holds instructions that we need to analyze again. An instruction may be
3679  // reanalyzed if we don't yet know if we can sink it or not.
3680  SmallVector<Instruction *, 8> InstsToReanalyze;
3681 
3682  // Returns true if a given use occurs in the predicated block. Phi nodes use
3683  // their operands in their corresponding predecessor blocks.
3684  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3685  auto *I = cast<Instruction>(U.getUser());
3686  BasicBlock *BB = I->getParent();
3687  if (auto *Phi = dyn_cast<PHINode>(I))
3688  BB = Phi->getIncomingBlock(
3689  PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3690  return BB == PredBB;
3691  };
3692 
3693  // Iteratively sink the scalarized operands of the predicated instruction
3694  // into the block we created for it. When an instruction is sunk, its
3695  // operands are then added to the worklist. The algorithm ends after a full
3696  // pass through the worklist sinks no instructions.
3697  bool Changed;
3698  do {
3699  // Add the instructions that need to be reanalyzed to the worklist, and
3700  // reset the changed indicator.
3701  Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3702  InstsToReanalyze.clear();
3703  Changed = false;
3704 
3705  while (!Worklist.empty()) {
3706  auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3707 
3708  // We can't sink an instruction if it is a phi node, is already in the
3709  // predicated block, is not in the loop, or may have side effects.
3710  if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3711  !VectorLoop->contains(I) || I->mayHaveSideEffects())
3712  continue;
3713 
3714  // It's legal to sink the instruction if all its uses occur in the
3715  // predicated block. Otherwise, there's nothing to do yet, and we may
3716  // need to reanalyze the instruction.
3717  if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3718  InstsToReanalyze.push_back(I);
3719  continue;
3720  }
3721 
3722  // Move the instruction to the beginning of the predicated block, and add
3723  // its operands to the worklist.
3724  I->moveBefore(&*PredBB->getFirstInsertionPt());
3725  Worklist.insert(I->op_begin(), I->op_end());
3726 
3727  // The sinking may have enabled other instructions to be sunk, so we will
3728  // need to iterate.
3729  Changed = true;
3730  }
3731  } while (Changed);
3732 }
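// Illustrative sketch (not part of the pass; names are invented): if a
// scalarized, predicated store was emitted as
//
//   %gep = getelementptr i32, i32* %p, i64 %idx   ; computed unconditionally
//   br i1 %mask.bit, label %pred.store.if, label %pred.store.continue
// pred.store.if:
//   store i32 %v, i32* %gep
//
// and all uses of %gep occur in %pred.store.if, the loop above moves the
// getelementptr into that block so the address is only computed when the
// lane is active.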
3733 
3734 void InnerLoopVectorizer::fixNonInductionPHIs() {
3735  for (PHINode *OrigPhi : OrigPHIsToFix) {
3736  PHINode *NewPhi =
3737  cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3738  unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3739 
3740  SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3741  predecessors(OrigPhi->getParent()));
3742  SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3743  predecessors(NewPhi->getParent()));
3744  assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3745  "Scalar and Vector BB should have the same number of predecessors");
3746 
3747  // The insertion point in Builder may be invalidated by the time we get
3748  // here. Force the Builder insertion point to something valid so that we do
3749  // not run into issues during insertion point restore in
3750  // getOrCreateVectorValue calls below.
3751  Builder.SetInsertPoint(NewPhi);
3752 
3753  // The predecessor order is preserved and we can rely on mapping between
3754  // scalar and vector block predecessors.
3755  for (unsigned i = 0; i < NumIncomingValues; ++i) {
3756  BasicBlock *NewPredBB = VectorBBPredecessors[i];
3757 
3758  // When looking up the new scalar/vector values to fix up, use incoming
3759  // values from original phi.
3760  Value *ScIncV =
3761  OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3762 
3763  // Scalar incoming value may need a broadcast
3764  Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3765  NewPhi->addIncoming(NewIncV, NewPredBB);
3766  }
3767  }
3768 }
3769 
3770 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
3771  unsigned VF) {
3772  PHINode *P = cast<PHINode>(PN);
3773  if (EnableVPlanNativePath) {
3774  // Currently we enter here in the VPlan-native path for non-induction
3775  // PHIs where all control flow is uniform. We simply widen these PHIs.
3776  // Create a vector phi with no operands - the vector phi operands will be
3777  // set at the end of vector code generation.
3778  Type *VecTy =
3779  (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3780  Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3781  VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3782  OrigPHIsToFix.push_back(P);
3783 
3784  return;
3785  }
3786 
3787  assert(PN->getParent() == OrigLoop->getHeader() &&
3788  "Non-header phis should have been handled elsewhere");
3789 
3790  // In order to support recurrences we need to be able to vectorize Phi nodes.
3791  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3792  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3793  // this value when we vectorize all of the instructions that use the PHI.
3794  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
3795  for (unsigned Part = 0; Part < UF; ++Part) {
3796  // This is phase one of vectorizing PHIs.
3797  Type *VecTy =
3798  (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3799  Value *EntryPart = PHINode::Create(
3800  VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3801  VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3802  }
3803  return;
3804  }
3805 
3807 
3808  // This PHINode must be an induction variable.
3809  // Make sure that we know about it.
3810  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
3811 
3812  InductionDescriptor II = Legal->getInductionVars()->lookup(P);
3813  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3814 
3815  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
3816  // which can be found from the original scalar operations.
3817  switch (II.getKind()) {
3819  llvm_unreachable("Unknown induction");
3822  llvm_unreachable("Integer/fp induction is handled elsewhere.");
3824  // Handle the pointer induction variable case.
3825  assert(P->getType()->isPointerTy() && "Unexpected type.");
3826  // This is the normalized GEP that starts counting at zero.
3827  Value *PtrInd = Induction;
3828  PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
3829  // Determine the number of scalars we need to generate for each unroll
3830  // iteration. If the instruction is uniform, we only need to generate the
3831  // first lane. Otherwise, we generate all VF values.
3832  unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
3833  // These are the scalar results. Notice that we don't generate vector GEPs
3834  // because scalar GEPs result in better code.
3835  for (unsigned Part = 0; Part < UF; ++Part) {
3836  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
3837  Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
3838  Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
3839  Value *SclrGep =
3840  emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
3841  SclrGep->setName("next.gep");
3842  VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
3843  }
3844  }
3845  return;
3846  }
3847  }
3848 }
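// Illustrative sketch (not part of the pass; names are rough): for a pointer
// induction "p = &A[i]" with VF = 4 and UF = 1, the IK_PtrInduction case
// above emits one scalar GEP per lane rather than a vector of pointers,
// approximately
//
//   %next.gep   = getelementptr i32, i32* %A, i64 (index + 0)
//   %next.gep.1 = getelementptr i32, i32* %A, i64 (index + 1)
//   ...
//
// (only lane 0 is generated when the pointer IV is uniform after
// vectorization).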
3849 
3850 /// A helper function for checking whether an integer division-related
3851 /// instruction may divide by zero (in which case it must be predicated if
3852 /// executed conditionally in the scalar code).
3853 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
3854 /// Non-zero divisors that are non compile-time constants will not be
3855 /// converted into multiplication, so we will still end up scalarizing
3856 /// the division, but can do so w/o predication.
3857 static bool mayDivideByZero(Instruction &I) {
3858  assert((I.getOpcode() == Instruction::UDiv ||
3859  I.getOpcode() == Instruction::SDiv ||
3860  I.getOpcode() == Instruction::URem ||
3861  I.getOpcode() == Instruction::SRem) &&
3862  "Unexpected instruction");
3863  Value *Divisor = I.getOperand(1);
3864  auto *CInt = dyn_cast<ConstantInt>(Divisor);
3865  return !CInt || CInt->isZero();
3866 }
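// Illustrative sketch (not part of the pass): in
//
//   for (i = 0; i < n; i++)
//     if (b[i] != 0)
//       c[i] = a[i] / b[i];
//
// the divisor is loop-varying, so mayDivideByZero() returns true and the
// division must stay guarded (predicated) when it is scalarized; for a
// constant non-zero divisor such as "a[i] / 7" it returns false and no
// predication is needed on that account.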
3867 
3868 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
3869  switch (I.getOpcode()) {
3870  case Instruction::Br:
3871  case Instruction::PHI:
3872  llvm_unreachable("This instruction is handled by a different recipe.");
3873  case Instruction::GetElementPtr: {
3874  // Construct a vector GEP by widening the operands of the scalar GEP as
3875  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
3876  // results in a vector of pointers when at least one operand of the GEP
3877  // is vector-typed. Thus, to keep the representation compact, we only use
3878  // vector-typed operands for loop-varying values.
3879  auto *GEP = cast<GetElementPtrInst>(&I);
3880 
3881  if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
3882  // If we are vectorizing, but the GEP has only loop-invariant operands,
3883  // the GEP we build (by only using vector-typed operands for
3884  // loop-varying values) would be a scalar pointer. Thus, to ensure we
3885  // produce a vector of pointers, we need to either arbitrarily pick an
3886  // operand to broadcast, or broadcast a clone of the original GEP.
3887  // Here, we broadcast a clone of the original.
3888  //
3889  // TODO: If at some point we decide to scalarize instructions having
3890  // loop-invariant operands, this special case will no longer be
3891  // required. We would add the scalarization decision to
3892  // collectLoopScalars() and teach getVectorValue() to broadcast
3893  // the lane-zero scalar value.
3894  auto *Clone = Builder.Insert(GEP->clone());
3895  for (unsigned Part = 0; Part < UF; ++Part) {
3896  Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
3897  VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
3898  addMetadata(EntryPart, GEP);
3899  }
3900  } else {
3901  // If the GEP has at least one loop-varying operand, we are sure to
3902  // produce a vector of pointers. But if we are only unrolling, we want
3903  // to produce a scalar GEP for each unroll part. Thus, the GEP we
3904  // produce with the code below will be scalar (if VF == 1) or vector
3905  // (otherwise). Note that for the unroll-only case, we still maintain
3906  // values in the vector mapping with initVector, as we do for other
3907  // instructions.
3908  for (unsigned Part = 0; Part < UF; ++Part) {
3909  // The pointer operand of the new GEP. If it's loop-invariant, we
3910  // won't broadcast it.
3911  auto *Ptr =
3912  OrigLoop->isLoopInvariant(GEP->getPointerOperand())
3913  ? GEP->getPointerOperand()
3914  : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
3915 
3916  // Collect all the indices for the new GEP. If any index is
3917  // loop-invariant, we won't broadcast it.
3918  SmallVector<Value *, 4> Indices;
3919  for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
3920  if (OrigLoop->isLoopInvariant(U.get()))
3921  Indices.push_back(U.get());
3922  else
3923  Indices.push_back(getOrCreateVectorValue(U.get(), Part));
3924  }
3925 
3926  // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
3927  // but it should be a vector, otherwise.
3928  auto *NewGEP = GEP->isInBounds()
3929  ? Builder.CreateInBoundsGEP(Ptr, Indices)
3930  : Builder.CreateGEP(Ptr, Indices);
3931  assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
3932  "NewGEP is not a pointer vector");
3933  VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
3934  addMetadata(NewGEP, GEP);
3935  }
3936  }
3937 
3938  break;
3939  }
3940  case Instruction::UDiv:
3941  case Instruction::SDiv:
3942  case Instruction::SRem:
3943  case Instruction::URem:
3944  case Instruction::Add:
3945  case Instruction::FAdd:
3946  case Instruction::Sub:
3947  case Instruction::FSub:
3948  case Instruction::Mul:
3949  case Instruction::FMul:
3950  case Instruction::FDiv:
3951  case Instruction::FRem:
3952  case Instruction::Shl:
3953  case Instruction::LShr:
3954  case Instruction::AShr:
3955  case Instruction::And:
3956  case Instruction::Or:
3957  case Instruction::Xor: {
3958  // Just widen binops.
3959  auto *BinOp = cast<BinaryOperator>(&I);
3960  setDebugLocFromInst(Builder, BinOp);
3961 
3962  for (unsigned Part = 0; Part < UF; ++Part) {
3963  Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part);
3964  Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part);
3965  Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
3966 
3967  if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
3968  VecOp->copyIRFlags(BinOp);
3969 
3970  // Use this vector value for all users of the original instruction.
3971  VectorLoopValueMap.setVectorValue(&I, Part, V);
3972  addMetadata(V, BinOp);
3973  }
3974 
3975  break;
3976  }
3977  case Instruction::Select: {
3978  // Widen selects.
3979  // If the selector is loop invariant we can create a select
3980  // instruction with a scalar condition. Otherwise, use vector-select.
3981  auto *SE = PSE.getSE();
3982  bool InvariantCond =
3983  SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
3984  setDebugLocFromInst(Builder, &I);
3985 
3986  // The condition can be loop invariant but still defined inside the
3987  // loop. This means that we can't just use the original 'cond' value.
3988  // We have to take the 'vectorized' value and pick the first lane.
3989  // Instcombine will make this a no-op.
3990 
3991  auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
3992 
3993  for (unsigned Part = 0; Part < UF; ++Part) {
3994  Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
3995  Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
3996  Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
3997  Value *Sel =
3998  Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
3999  VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4000  addMetadata(Sel, &I);
4001  }
4002 
4003  break;
4004  }
4005 
4006  case Instruction::ICmp:
4007  case Instruction::FCmp: {
4008  // Widen compares. Generate vector compares.
4009  bool FCmp = (I.getOpcode() == Instruction::FCmp);
4010  auto *Cmp = dyn_cast<CmpInst>(&I);
4012  for (unsigned Part = 0; Part < UF; ++Part) {
4013  Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4014  Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4015  Value *C = nullptr;
4016  if (FCmp) {
4017  // Propagate fast math flags.
4019  Builder.setFastMathFlags(Cmp->getFastMathFlags());
4020  C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4021  } else {
4022  C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4023  }
4024  VectorLoopValueMap.setVectorValue(&I, Part, C);
4025  addMetadata(C, &I);
4026  }
4027 
4028  break;
4029  }
4030 
4031  case Instruction::ZExt:
4032  case Instruction::SExt:
4033  case Instruction::FPToUI:
4034  case Instruction::FPToSI:
4035  case Instruction::FPExt:
4036  case Instruction::PtrToInt:
4037  case Instruction::IntToPtr:
4038  case Instruction::SIToFP:
4039  case Instruction::UIToFP:
4040  case Instruction::Trunc:
4041  case Instruction::FPTrunc:
4042  case Instruction::BitCast: {
4043  auto *CI = dyn_cast<CastInst>(&I);
4045 
4046  /// Vectorize casts.
4047  Type *DestTy =
4048  (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4049 
4050  for (unsigned Part = 0; Part < UF; ++Part) {
4051  Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4052  Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4053  VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4054  addMetadata(Cast, &I);
4055  }
4056  break;
4057  }
4058 
4059  case Instruction::Call: {
4060  // Ignore dbg intrinsics.
4061  if (isa<DbgInfoIntrinsic>(I))
4062  break;
4064 
4065  Module *M = I.getParent()->getParent()->getParent();
4066  auto *CI = cast<CallInst>(&I);
4067 
4068  StringRef FnName = CI->getCalledFunction()->getName();
4069  Function *F = CI->getCalledFunction();
4070  Type *RetTy = ToVectorTy(CI->getType(), VF);
4071  SmallVector<Type *, 4> Tys;
4072  for (Value *ArgOperand : CI->arg_operands())
4073  Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4074 
4075  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4076 
4077  // The flag shows whether we use an intrinsic or a regular call for the
4078  // vectorized version of the instruction.
4079  // Is it beneficial to perform intrinsic call compared to lib call?
4080  bool NeedToScalarize;
4081  unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
4082  bool UseVectorIntrinsic =
4083  ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
4084  assert((UseVectorIntrinsic || !NeedToScalarize) &&
4085  "Instruction should be scalarized elsewhere.");
4086 
4087  for (unsigned Part = 0; Part < UF; ++Part) {
4088  SmallVector<Value *, 4> Args;
4089  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4090  Value *Arg = CI->getArgOperand(i);
4091  // Some intrinsics have a scalar argument - don't replace it with a
4092  // vector.
4093  if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4094  Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4095  Args.push_back(Arg);
4096  }
4097 
4098  Function *VectorF;
4099  if (UseVectorIntrinsic) {
4100  // Use vector version of the intrinsic.
4101  Type *TysForDecl[] = {CI->getType()};
4102  if (VF > 1)
4103  TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4104  VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4105  } else {
4106  // Use vector version of the library call.
4107  StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4108  assert(!VFnName.empty() && "Vector function name is empty.");
4109  VectorF = M->getFunction(VFnName);
4110  if (!VectorF) {
4111  // Generate a declaration
4112  FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4113  VectorF =
4114  Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4115  VectorF->copyAttributesFrom(F);
4116  }
4117  }
4118  assert(VectorF && "Can't create vector function.");
4119 
4120  SmallVector<OperandBundleDef, 1> OpBundles;
4121  CI->getOperandBundlesAsDefs(OpBundles);
4122  CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4123 
4124  if (isa<FPMathOperator>(V))
4125  V->copyFastMathFlags(CI);
4126 
4127  VectorLoopValueMap.setVectorValue(&I, Part, V);
4128  addMetadata(V, &I);
4129  }
4130 
4131  break;
4132  }
4133 
4134  default:
4135  // This instruction is not vectorized by simple widening.
4136  LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4137  llvm_unreachable("Unhandled instruction!");
4138  } // end of switch.
4139 }
4140 
4141 void InnerLoopVectorizer::updateAnalysis() {
4142  // Forget the original basic block.
4143  PSE.getSE()->forgetLoop(OrigLoop);
4144 
4145  // DT is not kept up-to-date for outer loop vectorization
4146  if (EnableVPlanNativePath)
4147  return;
4148 
4149  // Update the dominator tree information.
4150  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4151  "Entry does not dominate exit.");
4152 
4159 }
4160 
4161 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4162  // We should not collect Scalars more than once per VF. Right now, this
4163  // function is called from collectUniformsAndScalars(), which already does
4164  // this check. Collecting Scalars for VF=1 does not make any sense.
4165  assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4166  "This function should not be visited twice for the same VF");
4167 
4168  SmallSetVector<Instruction *, 8> Worklist;
4169 
4170  // These sets are used to seed the analysis with pointers used by memory
4171  // accesses that will remain scalar.
4172  SmallSetVector<Instruction *, 8> ScalarPtrs;
4173  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4174 
4175  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4176  // The pointer operands of loads and stores will be scalar as long as the
4177  // memory access is not a gather or scatter operation. The value operand of a
4178  // store will remain scalar if the store is scalarized.
4179  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4180  InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4181  assert(WideningDecision != CM_Unknown &&
4182  "Widening decision should be ready at this moment");
4183  if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4184  if (Ptr == Store->getValueOperand())
4185  return WideningDecision == CM_Scalarize;
4186  assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4187  "Ptr is neither a value or pointer operand");
4188  return WideningDecision != CM_GatherScatter;
4189  };
4190 
4191  // A helper that returns true if the given value is a bitcast or
4192  // getelementptr instruction contained in the loop.
4193  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4194  return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4195  isa<GetElementPtrInst>(V)) &&
4196  !TheLoop->isLoopInvariant(V);
4197  };
4198 
4199  // A helper that evaluates a memory access's use of a pointer. If the use
4200  // will be a scalar use, and the pointer is only used by memory accesses, we
4201  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4202  // PossibleNonScalarPtrs.
4203  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4204  // We only care about bitcast and getelementptr instructions contained in
4205  // the loop.
4206  if (!isLoopVaryingBitCastOrGEP(Ptr))
4207  return;
4208 
4209  // If the pointer has already been identified as scalar (e.g., if it was
4210  // also identified as uniform), there's nothing to do.
4211  auto *I = cast<Instruction>(Ptr);
4212  if (Worklist.count(I))
4213  return;
4214 
4215  // If the use of the pointer will be a scalar use, and all users of the
4216  // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4217  // place the pointer in PossibleNonScalarPtrs.
4218  if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4219  return isa<LoadInst>(U) || isa<StoreInst>(U);
4220  }))
4221  ScalarPtrs.insert(I);
4222  else
4223  PossibleNonScalarPtrs.insert(I);
4224  };
4225 
4226  // We seed the scalars analysis with three classes of instructions: (1)
4227  // instructions marked uniform-after-vectorization, (2) bitcast and
4228  // getelementptr instructions used by memory accesses requiring a scalar use,
4229  // and (3) pointer induction variables and their update instructions (we
4230  // currently only scalarize these).
4231  //
4232  // (1) Add to the worklist all instructions that have been identified as
4233  // uniform-after-vectorization.
4234  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4235 
4236  // (2) Add to the worklist all bitcast and getelementptr instructions used by
4237  // memory accesses requiring a scalar use. The pointer operands of loads and
4238  // stores will be scalar as long as the memory access is not a gather or
4239  // scatter operation. The value operand of a store will remain scalar if the
4240  // store is scalarized.
4241  for (auto *BB : TheLoop->blocks())
4242  for (auto &I : *BB) {
4243  if (auto *Load = dyn_cast<LoadInst>(&I)) {
4244  evaluatePtrUse(Load, Load->getPointerOperand());
4245  } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4246  evaluatePtrUse(Store, Store->getPointerOperand());
4247  evaluatePtrUse(Store, Store->getValueOperand());
4248  }
4249  }
4250  for (auto *I : ScalarPtrs)
4251  if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4252  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4253  Worklist.insert(I);
4254  }
4255 
4256  // (3) Add to the worklist all pointer induction variables and their update
4257  // instructions.
4258  //
4259  // TODO: Once we are able to vectorize pointer induction variables we should
4260  // no longer insert them into the worklist here.
4261  auto *Latch = TheLoop->getLoopLatch();
4262  for (auto &Induction : *Legal->getInductionVars()) {
4263  auto *Ind = Induction.first;
4264  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4265  if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4266  continue;
4267  Worklist.insert(Ind);
4268  Worklist.insert(IndUpdate);
4269  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4270  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4271  << "\n");
4272  }
4273 
4274  // Insert the forced scalars.
4275  // FIXME: Currently widenPHIInstruction() often creates a dead vector
4276  // induction variable when the PHI user is scalarized.
4277  auto ForcedScalar = ForcedScalars.find(VF);
4278  if (ForcedScalar != ForcedScalars.end())
4279  for (auto *I : ForcedScalar->second)
4280  Worklist.insert(I);
4281 
4282  // Expand the worklist by looking through any bitcasts and getelementptr
4283  // instructions we've already identified as scalar. This is similar to the
4284  // expansion step in collectLoopUniforms(); however, here we're only
4285  // expanding to include additional bitcasts and getelementptr instructions.
4286  unsigned Idx = 0;
4287  while (Idx != Worklist.size()) {
4288  Instruction *Dst = Worklist[Idx++];
4289  if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4290  continue;
4291  auto *Src = cast<Instruction>(Dst->getOperand(0));
4292  if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4293  auto *J = cast<Instruction>(U);
4294  return !TheLoop->contains(J) || Worklist.count(J) ||
4295  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4296  isScalarUse(J, Src));
4297  })) {
4298  Worklist.insert(Src);
4299  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4300  }
4301  }
4302 
4303  // An induction variable will remain scalar if all users of the induction
4304  // variable and induction variable update remain scalar.
4305  for (auto &Induction : *Legal->getInductionVars()) {
4306  auto *Ind = Induction.first;
4307  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4308 
4309  // We already considered pointer induction variables, so there's no reason
4310  // to look at their users again.
4311  //
4312  // TODO: Once we are able to vectorize pointer induction variables we
4313  // should no longer skip over them here.
4314  if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4315  continue;
4316 
4317  // Determine if all users of the induction variable are scalar after
4318  // vectorization.
4319  auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4320  auto *I = cast<Instruction>(U);
4321  return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4322  });
4323  if (!ScalarInd)
4324  continue;
4325 
4326  // Determine if all users of the induction variable update instruction are
4327  // scalar after vectorization.
4328  auto ScalarIndUpdate =
4329  llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4330  auto *I = cast<Instruction>(U);
4331  return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4332  });
4333  if (!ScalarIndUpdate)
4334  continue;
4335 
4336  // The induction variable and its update instruction will remain scalar.
4337  Worklist.insert(Ind);
4338  Worklist.insert(IndUpdate);
4339  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4340  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4341  << "\n");
4342  }
4343 
4344  Scalars[VF].insert(Worklist.begin(), Worklist.end());
4345 }
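
// [Editor's illustration; not part of LoopVectorize.cpp. The function names and
// C-level loops below are hypothetical and only sketch when the rule above
// keeps an induction variable scalar after vectorization.]
#include <cstddef>

// 'i' feeds only the consecutive accesses a[i]/b[i] and its own increment, so
// a single scalar 'i' per vector iteration suffices after vectorization.
void scalarInductionExample(int *a, const int *b, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i)
    a[i] = b[i] + 1;
}

// The *value* of 'i' is stored, so the widened store needs a vector of lane
// values and a vector induction (e.g. <i, i+1, i+2, i+3>) must be created; the
// induction does not remain scalar in this case.
void vectorInductionExample(int *a, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i)
    a[i] = static_cast<int>(i);
}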
4346 
4347 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4348  if (!blockNeedsPredication(I->getParent()))
4349  return false;
4350  switch(I->getOpcode()) {
4351  default:
4352  break;
4353  case Instruction::Load:
4354  case Instruction::Store: {
4355  if (!Legal->isMaskRequired(I))
4356  return false;
4357  auto *Ptr = getLoadStorePointerOperand(I);
4358  auto *Ty = getMemInstValueType(I);
4359  // We have already decided how to vectorize this instruction, get that
4360  // result.
4361  if (VF > 1) {
4362  InstWidening WideningDecision = getWideningDecision(I, VF);
4363  assert(WideningDecision != CM_Unknown &&
4364  "Widening decision should be ready at this moment");
4365  return WideningDecision == CM_Scalarize;
4366  }
4367  return isa<LoadInst>(I) ?
4368  !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
4369  : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4370  }
4371  case Instruction::UDiv:
4372  case Instruction::SDiv:
4373  case Instruction::SRem:
4374  case Instruction::URem:
4375  return mayDivideByZero(*I);
4376  }
4377  return false;
4378 }
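
// [Editor's illustration; not part of LoopVectorize.cpp. A hypothetical C-level
// loop showing the kind of instruction the predicate above reports, assuming a
// target without a legal masked store for 'unsigned'.]
#include <cstddef>

// The store and the division only execute when b[i] != 0. The division may
// trap on the masked-off lanes (b[i] == 0), and the store cannot be emitted as
// a masked store on the assumed target, so both are scalarized and predicated.
void conditionalDivide(unsigned *a, const unsigned *b, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i)
    if (b[i] != 0)
      a[i] = a[i] / b[i];
}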
4379 
4380 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4381  unsigned VF) {
4382  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4383  assert(getWideningDecision(I, VF) == CM_Unknown &&
4384  "Decision should not be set yet.");
4385  auto *Group = getInterleavedAccessGroup(I);
4386  assert(Group && "Must have a group.");
4387 
4388  // Check if masking is required.
4389  // A Group may need masking for one of two reasons: it resides in a block that
4390  // needs predication, or it was decided to use masking to deal with gaps.
4391  bool PredicatedAccessRequiresMasking =
4392  Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4393  bool AccessWithGapsRequiresMasking =
4394  Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
4395  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4396  return true;
4397 
4398  // If masked interleaving is required, we expect that the user/target has
4399  // enabled it, because otherwise the group either wouldn't have been created
4400  // or it should have been invalidated by the cost model.
4401  assert(useMaskedInterleavedAccesses(TTI) &&
4402  "Masked interleave-groups for predicated accesses are not enabled.");
4403 
4404  auto *Ty = getMemInstValueType(I);
4405  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
4406  : TTI.isLegalMaskedStore(Ty);
4407 }
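
// [Editor's illustration; not part of LoopVectorize.cpp. A hypothetical loop
// whose loads form an interleave group with a gap, the case the
// AccessWithGapsRequiresMasking flag above is about.]
#include <cstddef>

// A[3*i] and A[3*i+1] form a group of factor 3 whose last member is never
// read; the final full-vector access of such a group can touch memory past the
// loop's data, so the group needs either a scalar epilogue or, if the target
// supports it and masked interleaving is enabled, a masked interleaved load.
int sumPairsOfThree(const int *A, std::size_t n) {
  int Sum = 0;
  for (std::size_t i = 0; i < n; ++i)
    Sum += A[3 * i] + A[3 * i + 1]; // A[3*i + 2] is never read: a gap
  return Sum;
}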
4408 
4409 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4410  unsigned VF) {
4411  // Get and ensure we have a valid memory instruction.
4412  LoadInst *LI = dyn_cast<LoadInst>(I);
4413  StoreInst *SI = dyn_cast<StoreInst>(I);
4414  assert((LI || SI) && "Invalid memory instruction");
4415 
4416  auto *Ptr = getLoadStorePointerOperand(I);
4417 
4418  // First of all, in order to be widened, the pointer must be consecutive.
4419  if (!Legal->isConsecutivePtr(Ptr))
4420  return false;
4421 
4422  // If the instruction is a store located in a predicated block, it will be
4423  // scalarized.
4424  if (isScalarWithPredication(I))
4425  return false;
4426 
4427  // If the instruction's allocated size doesn't equal its type size, it
4428  // requires padding and will be scalarized.
4429  auto &DL = I->getModule()->getDataLayout();
4430  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4431  if (hasIrregularType(ScalarTy, DL, VF))
4432  return false;
4433 
4434  return true;
4435 }
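
// [Editor's illustration; not part of LoopVectorize.cpp. A hypothetical loop
// contrasting pointers that do and do not satisfy the consecutive-pointer
// check above.]
#include <cstddef>

int consecutiveVsNot(const int *a, const int *idx, std::size_t n) {
  int x = 0, y = 0, z = 0;
  for (std::size_t i = 0; i < n; ++i) {
    x += a[i];      // consecutive pointer: the load can be widened
    y += a[2 * i];  // strided pointer: not consecutive
    z += a[idx[i]]; // indexed access (gather): not consecutive
  }
  return x + y + z;
}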
4436 
4437 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4438  // We should not collect Uniforms more than once per VF. Right now,
4439  // this function is called from collectUniformsAndScalars(), which
4440  // already does this check. Collecting Uniforms for VF=1 does not make any
4441  // sense.
4442 
4443  assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4444  "This function should not be visited twice for the same VF");
4445 
4446  // Visit the list of Uniforms. Even if we find no uniform values, the entry
4447  // is created so we do not analyze again: Uniforms.count(VF) will return 1.
4448  Uniforms[VF].clear();
4449 
4450  // We now know that the loop is vectorizable!
4451  // Collect instructions inside the loop that will remain uniform after
4452  // vectorization.
4453 
4454  // Global values, parameters, and instructions outside of the current loop
4455  // are out of scope.
4456  auto isOutOfScope = [&](Value *V) -> bool {
4457  Instruction *I = dyn_cast<Instruction>(V);
4458  return (!I || !TheLoop->contains(I));
4459  };
4460 
4461  SetVector<Instruction *> Worklist;
4462  BasicBlock *Latch = TheLoop->getLoopLatch();
4463 
4464  // Start with the conditional branch. If the branch condition is an
4465  // instruction contained in the loop that is only used by the branch, it is
4466  // uniform.
4467  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4468  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4469  Worklist.insert(Cmp);
4470  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4471  }
4472 
4473  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4474  // are pointers that are treated like consecutive pointers during
4475  // vectorization. The pointer operands of interleaved accesses are an
4476  // example.
4477  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4478 
4479  // Holds pointer operands of instructions that are possibly non-uniform.
4480  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4481 
4482  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4483  InstWidening WideningDecision = getWideningDecision(I, VF);
4484  assert(WideningDecision != CM_Unknown &&
4485  "Widening decision should be ready at this moment");
4486 
4487  return (WideningDecision == CM_Widen ||
4488  WideningDecision == CM_Widen_Reverse ||
4489  WideningDecision == CM_Interleave);
4490  };
4491  // Iterate over the instructions in the loop, and collect all
4492  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4493  // that a consecutive-like pointer operand will be scalarized, we collect it
4494  // in PossibleNonUniformPtrs instead. We use two sets here because a single
4495  // getelementptr instruction can be used by both vectorized and scalarized
4496  // memory instructions. For example, if a loop loads and stores from the same
4497  // location, but the store is conditional, the store will be scalarized, and
4498  // the getelementptr won't remain uniform.
4499  for (auto *BB : TheLoop->blocks())
4500  for (auto &I : *BB) {
4501  // If there's no pointer operand, there's nothing to do.
4502  auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4503  if (!Ptr)
4504  continue;
4505 
4506  // True if all users of Ptr are memory accesses that have Ptr as their
4507  // pointer operand.
4508  auto UsersAreMemAccesses =
4509  llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4510  return getLoadStorePointerOperand(U) == Ptr;
4511  });
4512 
4513  // Ensure the memory instruction will not be scalarized or used by
4514  // gather/scatter, making its pointer operand non-uniform. If the pointer
4515  // operand is used by any instruction other than a memory access, we
4516  // conservatively assume the pointer operand may be non-uniform.
4517  if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4518  PossibleNonUniformPtrs.insert(Ptr);
4519 
4520  // If the memory instruction will be vectorized and its pointer operand
4521  // is consecutive-like or part of an interleaved group, the pointer
4522  // operand should remain uniform.
4523  else
4524  ConsecutiveLikePtrs.insert(Ptr);
4525  }
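
// [Editor's illustration; not part of LoopVectorize.cpp. A hypothetical loop
// matching the example in the comment before this loop: the unconditional load
// of a[i] is widened, but the conditional store to the same address is
// scalarized and predicated, so the shared address computation ends up in
// PossibleNonUniformPtrs rather than remaining uniform.]
#include <cstddef>

void loadThenConditionalStore(int *a, const int *c, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    int v = a[i];   // widened load through &a[i]
    if (c[i])
      a[i] = v + 1; // scalarized, predicated store through the same &a[i]
  }
}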
4526 
4527  // Add to the Worklist all consecutive and consecutive-like pointers that
4528  // aren't also identified as possibly non-uniform.
4529  for (auto *V : ConsecutiveLikePtrs)
4530  if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4531  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4532  Worklist.insert(V);
4533  }
4534 
4535  // Expand Worklist in topological order: whenever a new instruction
4536  // is added, its users should already be inside Worklist. This ensures
4537  // that a uniform instruction will only be used by uniform instructions.
4538  unsigned idx = 0;
4539  while (idx != Worklist.size()) {
4540  Instruction *I = Worklist[idx++];
4541 
4542  for (auto OV : I->operand_values()) {
4543  // isOutOfScope operands cannot be uniform instructions.
4544  if (isOutOfScope(OV))
4545  continue;
4546  // First-order recurrence phis should typically be considered
4547  // non-uniform.
4548  auto *OP = dyn_cast<PHINode>(OV);
4549  if (OP && Legal->isFirstOrderRecurrence(OP))
4550  continue;
4551  // If all the users of the operand are uniform, then add the
4552  // operand into the uniform worklist.
4553  auto *OI = cast<Instruction>(OV);
4554  if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4555  auto *J = cast<Instruction>(U);
4556  return Worklist.count(J) ||
4557  (OI == getLoadStorePointerOperand(J) &&
4558  isUniformDecision(J, VF));
4559  })) {
4560  Worklist.insert(OI);
4561  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4562  }
4563  }
4564  }
4565 
4566  // Returns true if Ptr is the pointer operand of a memory access instruction
4567  // I, and I is known to not require scalarization.
4568  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4569  return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4570  };
4571 
4572  // For an instruction to be added into Worklist above, all its users inside
4573  // the loop should also be in Worklist. However, this condition cannot be
4574  // true for phi nodes that form a cyclic dependence. We must process phi
4575  // nodes separately. An induction variable will remain uniform if all users
4576  // of the induction variable and induction variable update remain uniform.
4577  // The code below handles both pointer and non-pointer induction variables.
4578  for (auto &Induction : *Legal->getInductionVars()) {
4579  auto *Ind = Induction.first;
4580  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4581 
4582  // Determine if all users of the induction variable are uniform after
4583  // vectorization.
4584  auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4585  auto *I = cast<Instruction>(U);
4586  return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4587  isVectorizedMemAccessUse(I, Ind);
4588  });
4589  if (!UniformInd)
4590  continue;
4591 
4592  // Determine if all users of the induction variable update instruction are
4593  // uniform after vectorization.
4594  auto UniformIndUpdate =
4595  llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4596  auto *I = cast<Instruction>(U);
4597  return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4598  isVectorizedMemAccessUse(I, IndUpdate);
4599  });
4600  if (!UniformIndUpdate)
4601  continue;
4602 
4603  // The induction variable and its update instruction will remain uniform.
4604  Worklist.insert(Ind);
4605  Worklist.insert(IndUpdate);
4606  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4607  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4608  << "\n");
4609  }
4610 
4611  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4612 }
4613 
4614 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
4615  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4616  // TODO: It may be useful to do this, since the check is still likely to be
4617  // dynamically uniform if the target can skip it.
4618  LLVM_DEBUG(
4619  dbgs() << "LV: Not inserting runtime ptr check for divergent target");
4620 
4621  ORE->emit(
4622  createMissedAnalysis("CantVersionLoopWithDivergentTarget")
4623  << "runtime pointer checks needed. Not enabled for divergent target");
4624 
4625  return None;
4626  }
4627 
4628  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4629  if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
4630  return computeFeasibleMaxVF(OptForSize, TC);
4631 
4632  if (Legal->getRuntimePointerChecking()->Need) {
4633  ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4634  << "runtime pointer checks needed. Enable vectorization of this "
4635  "loop with '#pragma clang loop vectorize(enable)' when "
4636  "compiling with -Os/-Oz");
4637  LLVM_DEBUG(
4638  dbgs()
4639  << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
4640  return None;
4641  }
4642 
4643  if (!PSE.getUnionPredicate().getPredicates().empty()) {
4644  ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4645  << "runtime SCEV checks needed. Enable vectorization of this "
4646  "loop with '#pragma clang loop vectorize(enable)' when "
4647  "compiling with -Os/-Oz");
4648  LLVM_DEBUG(
4649  dbgs()
4650  << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n");
4651  return None;
4652  }
4653 
4654  // FIXME: Avoid specializing for stride==1 instead of bailing out.
4655  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4656  ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4657  << "runtime stride == 1 checks needed. Enable vectorization of "
4658  "this loop with '#pragma clang loop vectorize(enable)' when "
4659  "compiling with -Os/-Oz");
4660  LLVM_DEBUG(
4661  dbgs()
4662  << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n");
4663  return None;
4664  }
4665 
4666  // If we optimize the program for size, avoid creating the tail loop.
4667  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4668 
4669  if (TC == 1) {
4670  ORE->emit(createMissedAnalysis("SingleIterationLoop")
4671  << "loop trip count is one, irrelevant for vectorization");
4672  LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n");
4673  return None;
4674  }
4675 
4676  // Record that scalar epilogue is not allowed.
4677  LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4678 
4679  IsScalarEpilogueAllowed = !OptForSize;
4680 
4681  // We don't create an epilogue when optimizing for size.
4682  // Invalidate interleave groups that require an epilogue if we can't mask
4683  // the interleave-group.
4684  if (!useMaskedInterleavedAccesses(TTI))
4685  InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4686 
4687  unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
4688 
4689  if (TC > 0 && TC % MaxVF == 0) {
4690  LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4691  return MaxVF;
4692  }
4693 
4694  // If we don't know the precise trip count, or if the trip count that we
4695  // found modulo the vectorization factor is not zero, try to fold the tail
4696  // by masking.
4697  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4698  if (Legal->canFoldTailByMasking()) {
4699  FoldTailByMasking = true;
4700  return MaxVF;
4701  }
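
// [Editor's worked example; not part of LoopVectorize.cpp and the numbers are
// assumed. With a known trip count of 16 and MaxVF = 8, 16 % 8 == 0, so no
// tail remains and MaxVF is returned directly. With a trip count of 17 a tail
// of one iteration remains, and under -Os/-Oz the only remaining option taken
// above is to fold that tail by masking, when legality allows it.]
bool tailRemains(unsigned TC, unsigned MaxVF) {
  return !(TC > 0 && TC % MaxVF == 0); // tailRemains(16, 8) == false,
                                       // tailRemains(17, 8) == true
}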
4702 
4703  if (TC == 0) {
4704  ORE->emit(
4705  createMissedAnalysis("UnknownLoopCountComplexCFG")
4706  << "unable to calculate the loop count due to complex control flow");
4707  return None;
4708  }
4709 
4710  ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
4711  << "cannot optimize for size and vectorize at the same time. "
4712  "Enable vectorization of this loop with '#pragma clang loop "
4713  "vectorize(enable)' when compiling with -Os/-Oz");
4714  return None;
4715 }
4716 
4717 unsigned
4718 LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
4719  unsigned ConstTripCount) {
4720  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4721  unsigned SmallestType, WidestType;
4722  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4723  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4724 
4725  // Get the maximum safe dependence distance in bits computed by LAA.
4726  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4727  // the memory access that is most restrictive (involved in the smallest
4728  // dependence distance).
4729  unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4730 
4731  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4732 
4733  unsigned MaxVectorSize = WidestRegister / WidestType;
4734 
4735  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4736  << " / " << WidestType << " bits.\n");
4737  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4738  << WidestRegister << " bits.\n");
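
// [Editor's worked example; not part of LoopVectorize.cpp and the numbers are
// assumed. Suppose LAA finds a 16-byte safe dependence distance on 32-bit
// accesses: at most 4 lanes are safe, so MaxSafeRegisterWidth = 4 * 4 * 8 =
// 128 bits, a 256-bit target register is clamped to 128 bits, and
// MaxVectorSize = 128 / 32 = 4.]
#include <algorithm>

unsigned exampleMaxVectorSize() {
  unsigned MaxSafeLanes = 16 / 4;                                 // 4 lanes
  unsigned MaxSafeRegisterWidth = MaxSafeLanes * 4 * 8;           // 128 bits
  unsigned WidestRegister = std::min(256u, MaxSafeRegisterWidth); // 128 bits
  return WidestRegister / 32;                                     // 4 lanes
}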
4739 
4740  assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4741  " into one vector!");
4742  if (MaxVectorSize == 0) {
4743  LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4744  MaxVectorSize = 1;
4745  return MaxVectorSize;
4746  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4747  isPowerOf2_32(ConstTripCount)) {
4748  // We need to clamp the VF to be the ConstTripCount. There is no point in
4749  // choosing a higher viable VF as done in the loop below.
4750  LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4751  << ConstTripCount << "\n");
4752  MaxVectorSize = ConstTripCount;
4753  return MaxVectorSize;
4754  }
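
// [Editor's worked example; not part of LoopVectorize.cpp and the numbers are
// assumed. With a constant trip count of 4 and a computed MaxVectorSize of 8,
// a width of 8 could never be filled, so the clamp above lowers the cap to the
// power-of-two trip count, i.e. 4.]
unsigned exampleClampedMaxVF(unsigned ConstTripCount, unsigned MaxVectorSize) {
  bool IsPow2 = ConstTripCount && (ConstTripCount & (ConstTripCount - 1)) == 0;
  if (ConstTripCount && ConstTripCount < MaxVectorSize && IsPow2)
    return ConstTripCount; // exampleClampedMaxVF(4, 8) == 4
  return MaxVectorSize;
}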
4755 
4756  unsigned MaxVF = MaxVectorSize;
4757  if (TTI.shouldMaximizeVectorBandwidth(OptForSize) ||
4758  (MaximizeBandwidth && !OptForSize)) {
4759  // Collect all viable vectorization factors larger than the default MaxVF
4760  // (i.e. MaxVectorSize).
4761  SmallVector<unsigned, 8> VFs;
4762  unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4763  for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4764  VFs.push_back(VS);
4765 
4766  // For each VF calculate its register usage.
4767  auto RUs = calculateRegisterUsage(VFs);
4768 
4769  // Select the largest VF which doesn't require more registers than existing
4770  // ones.
4771  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
4772  for (int i = RUs.size() - 1; i >= 0; --i) {
4773  if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
4774  MaxVF = VFs[i];
4775  break;
4776  }
4777  }
4778  if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
4779  if (MaxVF < MinVF) {
4780  LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4781  << ") with target's minimum: " << MinVF << '\n');
4782  MaxVF = MinVF;
4783  }
4784  }
4785  }
4786  return MaxVF;
4787 }
4788 
4789 VectorizationFactor
4790 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
4791  float Cost = expectedCost(1).first;
4792  const float ScalarCost = Cost;
4793  unsigned Width = 1;
4794  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
4795 
4796  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
4797  if (ForceVectorization && MaxVF > 1) {
4798  // Ignore scalar width, because the user explicitly wants vectorization.
4799  // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4800  // evaluation.
4801  Cost = std::numeric_limits<float>::max();
4802  }
4803 
4804  for (unsigned i = 2; i <= MaxVF; i *= 2) {
4805  // Notice that the vector loop needs to be executed fewer times, so
4806  // we need to divide the cost of the vector loop by the width of
4807  // the vector elements.
4808  VectorizationCostTy C = expectedCost(i);
4809  float VectorCost = C.first / (float)i;
4810  LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
4811  << " costs: " << (int)VectorCost << ".\n");
4812  if (!C.second && !ForceVectorization) {
4813  LLVM_DEBUG(
4814  dbgs() << "LV: Not considering vector loop of width " << i
4815  << " because it will not generate any vector instructions.\n");
4816  continue;
4817  }
4818  if (VectorCost < Cost) {
4819  Cost = VectorCost;
4820  Width = i;
4821  }
4822  }
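
// [Editor's worked example; not part of LoopVectorize.cpp and the costs are
// assumed. If the scalar body costs 8 and the VF = 4 body costs 20, the
// per-lane cost is 20 / 4 = 5 < 8, so width 4 wins; a VF = 8 body costing 48
// (6 per lane) would not beat it. The helper below mirrors that per-lane
// comparison.]
unsigned pickWidthByPerLaneCost(const unsigned *Widths, const float *Costs,
                                unsigned NumWidths, float ScalarCost) {
  unsigned Best = 1;
  float BestCost = ScalarCost;
  for (unsigned k = 0; k < NumWidths; ++k) {
    float PerLane = Costs[k] / Widths[k];
    if (PerLane < BestCost) {
      BestCost = PerLane;
      Best = Widths[k];
    }
  }
  return Best;
}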
4823 
4824  if (!EnableCondStoresVectorization && NumPredStores) {
4825  ORE->emit(createMissedAnalysis("ConditionalStore")
4826  << "store that is conditionally executed prevents vectorization");
4827  LLVM_DEBUG(
4828  dbgs() << "LV: No vectorization. There are conditional stores.\n");
4829  Width = 1;
4830  Cost = ScalarCost;
4831  }
4832 
4833  LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
4834  << "LV: Vectorization seems to be not beneficial, "
4835  << "but was forced by a user.\n");
4836  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
4837  VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
4838  return Factor;
4839 }
4840 
4841 std::pair<unsigned, unsigned>
4842 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4843  unsigned MinWidth = -1U;
4844  unsigned MaxWidth = 8;
4845  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
4846 
4847  // For each block.
4848  for (BasicBlock *BB : TheLoop->blocks()) {
4849  // For each instruction in the loop.
4850  for (Instruction &I : BB->instructionsWithoutDebug()) {
4851  Type *T = I.getType();
4852 
4853  // Skip ignored values.
4854  if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
4855  continue;
4856 
4857  // Only examine Loads, Stores and PHINodes.
4858  if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4859  continue;
4860 
4861  // Examine PHI nodes that are reduction variables. Update the type to
4862  // account for the recurrence type.
4863  if (auto *PN = dyn_cast<PHINode>(&I)) {
4864  if (!Legal->isReductionVariable(PN))
4865  continue;
4866  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
4867  T = RdxDesc.getRecurrenceType();
4868  }
4869 
4870  // Examine the stored values.
4871  if (auto *ST = dyn_cast<StoreInst>(&I))
4872  T = ST->getValueOperand()->getType();
4873 
4874  // Ignore loaded pointer types and stored pointer types that are not
4875  // vectorizable.
4876  //
4877  // FIXME: The check here attempts to predict whether a load or store will
4878  // be vectorized. We only know this for certain after a VF has
4879  // been selected. Here, we assume that if an access can be
4880  // vectorized, it will be. We should also look at extending this
4881  // optimization to non-pointer types.
4882  //
4883  if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
4884  !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
4885  continue;
4886 
4887  MinWidth = std::min(MinWidth,
4888  (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
4889  MaxWidth = std::max(MaxWidth,
4890  (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
4891  }
4892  }
4893 
4894  return {MinWidth, MaxWidth};
4895 }
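
// [Editor's illustration; not part of LoopVectorize.cpp. For the hypothetical
// loop below, the scanned memory operations load i8 and store i32, so the
// function above would report {8, 32}; computeFeasibleMaxVF() sizes vectors
// from the widest type and, when maximizing bandwidth, also tries VFs based on
// the smallest type.]
#include <cstddef>
#include <cstdint>

void widen8to32(std::int32_t *dst, const std::uint8_t *src, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i)
    dst[i] = src[i]; // smallest accessed type: i8; widest: i32
}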
4896 
4897 unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
4898  unsigned VF,
4899  unsigned LoopCost) {
4900  // -- The interleave heuristics --
4901  // We interleave the loop in order to expose ILP and reduce the loop overhead.
4902  // There are many micro-architectural considerations that we can't predict
4903  // at this level. For example, frontend pressure (on decode or fetch) due to
4904  // code size, or the number and capabilities of the execution ports.
4905  //
4906  // We use the following heuristics to select the interleave count:
4907  // 1. If the code has reductions, then we interleave to break the cross
4908  // iteration dependency.
4909  // 2. If the loop is really small, then we interleave to reduce the loop
4910  // overhead.
4911  // 3. We don't interleave if we think that we will spill registers to memory
4912  // due to the increased register pressure.
4913 
4914  // When we optimize for size, we don't interleave.
4915  if (OptForSize)
4916  return 1;
4917 
4918  // The maximum safe dependence distance already limits the VF; do not interleave.
4919  if (Legal->getMaxSafeDepDistBytes() != -1U)
4920  return 1;
4921 
4922  // Do not interleave loops with a relatively small trip count.
4923  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4924  if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
4925  return 1;
4926 
4927  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
4928  LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4929  << " registers\n");
4930 
4931  if (VF == 1) {
4932  if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4933  TargetNumRegisters = ForceTargetNumScalarRegs;
4934  } else {
4935  if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4936  TargetNumRegisters = ForceTargetNumVectorRegs;
4937  }
4938 
4939  RegisterUsage R = calculateRegisterUsage({VF})[0];
4940  // We divide by these constants so assume that we have at least one
4941  // instruction that uses at least one register.
4942  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
4943 
4944  // We calculate the interleave count using the following formula.
4945  // Subtract the number of loop invariants from the number of available
4946  // registers. These registers are used by all of the interleaved instances.
4947  // Next, divide the remaining registers by the number of registers that is
4948  // required by the loop, in order to estimate how many parallel instances
4949  // fit without causing spills. All of this is rounded down if necessary to be
4950  // a power of two. We want power of two interleave count to simplify any
4951  // addressing operations or alignment considerations.
4952  // We also want power of two interleave counts to ensu