1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
11 // and generates target-independent LLVM-IR.
12 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
13 // of instructions in order to estimate the profitability of vectorization.
14 //
15 // The loop vectorizer combines consecutive loop iterations into a single
16 // 'wide' iteration. After this transformation the index is incremented
17 // by the SIMD vector width, and not by one.
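//
// For example (illustrative sketch, not tied to any particular target), with
// a vectorization factor of 4 a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 42;
//
// is rewritten so that each vector iteration processes elements i..i+3 with a
// wide load, a wide add and a wide store, while a scalar epilogue loop
// handles any remaining iterations.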
18 //
19 // This pass has four parts:
20 // 1. The main loop pass that drives the different parts.
21 // 2. LoopVectorizationLegality - A unit that checks for the legality
22 // of the vectorization.
23 // 3. InnerLoopVectorizer - A unit that performs the actual
24 // widening of instructions.
25 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
26 // of vectorization. It decides on the optimal vector width, which
27 // can be one, if vectorization is not profitable.
28 //
29 // There is a development effort going on to migrate loop vectorizer to the
30 // VPlan infrastructure and to introduce outer loop vectorization support (see
31 // docs/Proposal/VectorizationPlan.rst and
32 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
33 // purpose, we temporarily introduced the VPlan-native vectorization path: an
34 // alternative vectorization path that is natively implemented on top of the
35 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
36 //
37 //===----------------------------------------------------------------------===//
38 //
39 // The reduction-variable vectorization is based on the paper:
40 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
41 //
42 // Variable uniformity checks are inspired by:
43 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
44 //
45 // The interleaved access vectorization is based on the paper:
46 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
47 // Data for SIMD
48 //
49 // Other ideas/concepts are from:
50 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
51 //
52 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
53 // Vectorizing Compilers.
54 //
55 //===----------------------------------------------------------------------===//
56 
59 #include "VPRecipeBuilder.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanHCFGTransforms.h"
62 #include "llvm/ADT/APInt.h"
63 #include "llvm/ADT/ArrayRef.h"
64 #include "llvm/ADT/DenseMap.h"
65 #include "llvm/ADT/DenseMapInfo.h"
66 #include "llvm/ADT/Hashing.h"
67 #include "llvm/ADT/MapVector.h"
68 #include "llvm/ADT/None.h"
69 #include "llvm/ADT/Optional.h"
70 #include "llvm/ADT/STLExtras.h"
71 #include "llvm/ADT/SetVector.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallVector.h"
74 #include "llvm/ADT/Statistic.h"
75 #include "llvm/ADT/StringRef.h"
76 #include "llvm/ADT/Twine.h"
81 #include "llvm/Analysis/CFG.h"
87 #include "llvm/Analysis/LoopInfo.h"
96 #include "llvm/IR/Attributes.h"
97 #include "llvm/IR/BasicBlock.h"
98 #include "llvm/IR/CFG.h"
99 #include "llvm/IR/Constant.h"
100 #include "llvm/IR/Constants.h"
101 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugLoc.h"
104 #include "llvm/IR/DerivedTypes.h"
105 #include "llvm/IR/DiagnosticInfo.h"
106 #include "llvm/IR/Dominators.h"
107 #include "llvm/IR/Function.h"
108 #include "llvm/IR/IRBuilder.h"
109 #include "llvm/IR/InstrTypes.h"
110 #include "llvm/IR/Instruction.h"
111 #include "llvm/IR/Instructions.h"
112 #include "llvm/IR/IntrinsicInst.h"
113 #include "llvm/IR/Intrinsics.h"
114 #include "llvm/IR/LLVMContext.h"
115 #include "llvm/IR/Metadata.h"
116 #include "llvm/IR/Module.h"
117 #include "llvm/IR/Operator.h"
118 #include "llvm/IR/Type.h"
119 #include "llvm/IR/Use.h"
120 #include "llvm/IR/User.h"
121 #include "llvm/IR/Value.h"
122 #include "llvm/IR/ValueHandle.h"
123 #include "llvm/IR/Verifier.h"
124 #include "llvm/Pass.h"
125 #include "llvm/Support/Casting.h"
127 #include "llvm/Support/Compiler.h"
128 #include "llvm/Support/Debug.h"
130 #include "llvm/Support/MathExtras.h"
137 #include <algorithm>
138 #include <cassert>
139 #include <cstdint>
140 #include <cstdlib>
141 #include <functional>
142 #include <iterator>
143 #include <limits>
144 #include <memory>
145 #include <string>
146 #include <tuple>
147 #include <utility>
148 #include <vector>
149 
150 using namespace llvm;
151 
152 #define LV_NAME "loop-vectorize"
153 #define DEBUG_TYPE LV_NAME
154 
155 /// @{
156 /// Metadata attribute names
157 static const char *const LLVMLoopVectorizeFollowupAll =
158  "llvm.loop.vectorize.followup_all";
159 static const char *const LLVMLoopVectorizeFollowupVectorized =
160  "llvm.loop.vectorize.followup_vectorized";
161 static const char *const LLVMLoopVectorizeFollowupEpilogue =
162  "llvm.loop.vectorize.followup_epilogue";
163 /// @}
164 
165 STATISTIC(LoopsVectorized, "Number of loops vectorized");
166 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
167 
168 /// Loops with a known constant trip count below this number are vectorized only
169 /// if no scalar iteration overheads are incurred.
170 static cl::opt<unsigned> TinyTripCountVectorThreshold(
171  "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
172  cl::desc("Loops with a constant trip count that is smaller than this "
173  "value are vectorized only if no scalar iteration overheads "
174  "are incurred."));
175 
176 static cl::opt<bool> MaximizeBandwidth(
177  "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
178  cl::desc("Maximize bandwidth when selecting vectorization factor which "
179  "will be determined by the smallest type in loop."));
180 
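// Interleaved accesses are groups of accesses such as A[2*i] and A[2*i+1]
// that can be combined into one wide memory operation plus shuffles. The
// snippet below is only an illustration of the access pattern this flag
// controls, not code from this pass:
//
//   for (int i = 0; i < n; ++i) {
//     sum0 += A[2 * i];     // member 0 of a stride-2 interleave group
//     sum1 += A[2 * i + 1]; // member 1 of the same group
//   }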
181 static cl::opt<bool> EnableInterleavedMemAccesses(
182  "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
183  cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
184 
185 /// An interleave-group may need masking if it resides in a block that needs
186 /// predication, or in order to mask away gaps.
187 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
188  "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
189  cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
190 
191 /// We don't interleave loops with a known constant trip count below this
192 /// number.
193 static const unsigned TinyTripCountInterleaveThreshold = 128;
194 
195 static cl::opt<unsigned> ForceTargetNumScalarRegs(
196  "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
197  cl::desc("A flag that overrides the target's number of scalar registers."));
198 
199 static cl::opt<unsigned> ForceTargetNumVectorRegs(
200  "force-target-num-vector-regs", cl::init(0), cl::Hidden,
201  cl::desc("A flag that overrides the target's number of vector registers."));
202 
203 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
204  "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
205  cl::desc("A flag that overrides the target's max interleave factor for "
206  "scalar loops."));
207 
208 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
209  "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
210  cl::desc("A flag that overrides the target's max interleave factor for "
211  "vectorized loops."));
212 
213 static cl::opt<unsigned> ForceTargetInstructionCost(
214  "force-target-instruction-cost", cl::init(0), cl::Hidden,
215  cl::desc("A flag that overrides the target's expected cost for "
216  "an instruction to a single constant value. Mostly "
217  "useful for getting consistent testing."));
218 
219 static cl::opt<unsigned> SmallLoopCost(
220  "small-loop-cost", cl::init(20), cl::Hidden,
221  cl::desc(
222  "The cost of a loop that is considered 'small' by the interleaver."));
223 
224 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
225  "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
226  cl::desc("Enable the use of the block frequency analysis to access PGO "
227  "heuristics minimizing code growth in cold regions and being more "
228  "aggressive in hot regions."));
229 
230 // Runtime interleave loops for load/store throughput.
231 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
232  "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
233  cl::desc(
234  "Enable runtime interleaving until load/store ports are saturated"));
235 
236 /// The number of stores in a loop that are allowed to need predication.
237 static cl::opt<unsigned> NumberOfStoresToPredicate(
238  "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
239  cl::desc("Max number of stores to be predicated behind an if."));
240 
241 static cl::opt<bool> EnableIndVarRegisterHeur(
242  "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
243  cl::desc("Count the induction variable only once when interleaving"));
244 
245 static cl::opt<bool> EnableCondStoresVectorization(
246  "enable-cond-stores-vec", cl::init(true), cl::Hidden,
247  cl::desc("Enable if predication of stores during vectorization."));
248 
249 static cl::opt<unsigned> MaxNestedScalarReductionIC(
250  "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
251  cl::desc("The maximum interleave count to use when interleaving a scalar "
252  "reduction in a nested loop."));
253 
254 cl::opt<bool> EnableVPlanNativePath(
255  "enable-vplan-native-path", cl::init(false), cl::Hidden,
256  cl::desc("Enable VPlan-native vectorization path with "
257  "support for outer loop vectorization."));
258 
259 // This flag enables the stress testing of the VPlan H-CFG construction in the
260 // VPlan-native vectorization path. It must be used in conjunction with
261 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
262 // verification of the H-CFGs built.
263 static cl::opt<bool> VPlanBuildStressTest(
264  "vplan-build-stress-test", cl::init(false), cl::Hidden,
265  cl::desc(
266  "Build VPlan for every supported loop nest in the function and bail "
267  "out right after the build (stress test the VPlan H-CFG construction "
268  "in the VPlan-native vectorization path)."));
269 
270 /// A helper function for converting Scalar types to vector types.
271 /// If the incoming type is void, we return void. If the VF is 1, we return
272 /// the scalar type.
273 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
274  if (Scalar->isVoidTy() || VF == 1)
275  return Scalar;
276  return VectorType::get(Scalar, VF);
277 }
278 
279 /// A helper function that returns the type of loaded or stored value.
280 static Type *getMemInstValueType(Value *I) {
281  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
282  "Expected Load or Store instruction");
283  if (auto *LI = dyn_cast<LoadInst>(I))
284  return LI->getType();
285  return cast<StoreInst>(I)->getValueOperand()->getType();
286 }
287 
288 /// A helper function that returns true if the given type is irregular. The
289 /// type is irregular if its allocated size doesn't equal the store size of an
290 /// element of the corresponding vector type at the given vectorization factor.
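/// For example (illustrative, target-dependent): an x86_fp80 value has a
/// 10-byte store size but is typically allocated with padding to 12 or 16
/// bytes, so an array of VF such values is not bitcast-compatible with
/// <VF x x86_fp80> and the type is considered irregular here.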
291 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
292  // Determine if an array of VF elements of type Ty is "bitcast compatible"
293  // with a <VF x Ty> vector.
294  if (VF > 1) {
295  auto *VectorTy = VectorType::get(Ty, VF);
296  return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
297  }
298 
299  // If the vectorization factor is one, we just check if an array of type Ty
300  // requires padding between elements.
301  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
302 }
303 
304 /// A helper function that returns the reciprocal of the block probability of
305 /// predicated blocks. If we return X, we are assuming the predicated block
306 /// will execute once for every X iterations of the loop header.
307 ///
308 /// TODO: We should use actual block probability here, if available. Currently,
309 /// we always assume predicated blocks have a 50% chance of executing.
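/// For example, the current return value of 2 means callers treat a
/// predicated block as executing once every 2 iterations of the loop header,
/// i.e. its contribution to the cost is halved.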
310 static unsigned getReciprocalPredBlockProb() { return 2; }
311 
312 /// A helper function that adds a 'fast' flag to floating-point operations.
313 static Value *addFastMathFlag(Value *V) {
314  if (isa<FPMathOperator>(V)) {
315  FastMathFlags Flags;
316  Flags.setFast();
317  cast<Instruction>(V)->setFastMathFlags(Flags);
318  }
319  return V;
320 }
321 
322 /// A helper function that returns an integer or floating-point constant with
323 /// value C.
324 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
325  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
326  : ConstantFP::get(Ty, C);
327 }
328 
329 namespace llvm {
330 
331 /// InnerLoopVectorizer vectorizes loops which contain only one basic
332 /// block to a specified vectorization factor (VF).
333 /// This class performs the widening of scalars into vectors, or multiple
334 /// scalars. This class also implements the following features:
335 /// * It inserts an epilogue loop for handling loops that don't have iteration
336 /// counts that are known to be a multiple of the vectorization factor.
337 /// * It handles the code generation for reduction variables.
338 /// * Scalarization (implementation using scalars) of un-vectorizable
339 /// instructions.
340 /// InnerLoopVectorizer does not perform any vectorization-legality
341 /// checks, and relies on the caller to check for the different legality
342 /// aspects. The InnerLoopVectorizer relies on the
343 /// LoopVectorizationLegality class to provide information about the induction
344 /// and reduction variables that were found to a given vectorization factor.
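///
/// For example (illustrative only, ignoring interleaving): for a loop with an
/// unknown trip count n and VF = 4, the generated vector loop executes
/// floor(n / 4) wide iterations and the scalar epilogue loop executes the
/// remaining n % 4 iterations.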
345 class InnerLoopVectorizer {
346 public:
347  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
348  LoopInfo *LI, DominatorTree *DT,
349  const TargetLibraryInfo *TLI,
350  const TargetTransformInfo *TTI, AssumptionCache *AC,
351  OptimizationRemarkEmitter *ORE, unsigned VecWidth,
352  unsigned UnrollFactor, LoopVectorizationLegality *LVL,
353  LoopVectorizationCostModel *CM)
354  : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
355  AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
356  Builder(PSE.getSE()->getContext()),
357  VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
358  virtual ~InnerLoopVectorizer() = default;
359 
360  /// Create a new empty loop. Unlink the old loop and connect the new one.
361  /// Return the pre-header block of the new loop.
363 
364  /// Widen a single instruction within the innermost loop.
366 
367  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
368  void fixVectorizedLoop();
369 
370  // Return true if any runtime check is added.
372 
373  /// A type for vectorized values in the new loop. Each value from the
374  /// original loop, when vectorized, is represented by UF vector values in the
375  /// new unrolled loop, where UF is the unroll factor.
377 
378  /// Vectorize a single PHINode in a block. This method handles the induction
379  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
380  /// arbitrary length vectors.
381  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
382 
383  /// A helper function to scalarize a single Instruction in the innermost loop.
384  /// Generates a sequence of scalar instances for each lane between \p MinLane
385  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
386  /// inclusive.
387  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
388  bool IfPredicateInstr);
389 
390  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
391  /// is provided, the integer induction variable will first be truncated to
392  /// the corresponding type.
393  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
394 
395  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
396  /// vector or scalar value on-demand if one is not yet available. When
397  /// vectorizing a loop, we visit the definition of an instruction before its
398  /// uses. When visiting the definition, we either vectorize or scalarize the
399  /// instruction, creating an entry for it in the corresponding map. (In some
400  /// cases, such as induction variables, we will create both vector and scalar
401  /// entries.) Then, as we encounter uses of the definition, we derive values
402  /// for each scalar or vector use unless such a value is already available.
403  /// For example, if we scalarize a definition and one of its uses is vector,
404  /// we build the required vector on-demand with an insertelement sequence
405  /// when visiting the use. Otherwise, if the use is scalar, we can use the
406  /// existing scalar definition.
407  ///
408  /// Return a value in the new loop corresponding to \p V from the original
409  /// loop at unroll index \p Part. If the value has already been vectorized,
410  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
411  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
412  /// a new vector value on-demand by inserting the scalar values into a vector
413  /// with an insertelement sequence. If the value has been neither vectorized
414  /// nor scalarized, it must be loop invariant, so we simply broadcast the
415  /// value into a vector.
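///
/// For example (illustrative): if a definition %d was scalarized and a vector
/// user of %d is encountered for unroll part 0, the VF scalar copies of %d
/// for that part are packed into a vector with an insertelement sequence at
/// the point of use.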
416  Value *getOrCreateVectorValue(Value *V, unsigned Part);
417 
418  /// Return a value in the new loop corresponding to \p V from the original
419  /// loop at unroll and vector indices \p Instance. If the value has been
420  /// vectorized but not scalarized, the necessary extractelement instruction
421  /// will be generated.
422  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
423 
424  /// Construct the vector value of a scalarized value \p V one lane at a time.
425  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
426 
427  /// Try to vectorize the interleaved access group that \p Instr belongs to,
428  /// optionally masking the vector operations if \p BlockInMask is non-null.
430  VectorParts *BlockInMask = nullptr);
431 
432  /// Vectorize Load and Store instructions, optionally masking the vector
433  /// operations if \p BlockInMask is non-null.
435  VectorParts *BlockInMask = nullptr);
436 
437  /// Set the debug location in the builder using the debug location in
438  /// the instruction.
439  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
440 
441  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
442  void fixNonInductionPHIs(void);
443 
444 protected:
446 
447  /// A small list of PHINodes.
449 
450  /// A type for scalarized values in the new loop. Each value from the
451  /// original loop, when scalarized, is represented by UF x VF scalar values
452  /// in the new unrolled loop, where UF is the unroll factor and VF is the
453  /// vectorization factor.
455 
456  /// Set up the values of the IVs correctly when exiting the vector loop.
457  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
458  Value *CountRoundDown, Value *EndValue,
459  BasicBlock *MiddleBlock);
460 
461  /// Create a new induction variable inside L.
462  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
463  Value *Step, Instruction *DL);
464 
465  /// Handle all cross-iteration phis in the header.
466  void fixCrossIterationPHIs();
467 
468  /// Fix a first-order recurrence. This is the second phase of vectorizing
469  /// this phi node.
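///
/// (Illustrative note: a first-order recurrence is a header phi whose value
/// in iteration i depends on a value computed in iteration i - 1, as in
///   for (int i = 1; i < n; ++i)
///     b[i] = a[i] + a[i - 1];
/// where the value of a[i - 1] can be reused from the previous iteration.)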
470  void fixFirstOrderRecurrence(PHINode *Phi);
471 
472  /// Fix a reduction cross-iteration phi. This is the second phase of
473  /// vectorizing this phi node.
474  void fixReduction(PHINode *Phi);
475 
476  /// The Loop exit block may have single value PHI nodes with some
477  /// incoming value. While vectorizing, we only handled real values
478  /// that were defined inside the loop and we should have one value for
479  /// each predecessor of its parent basic block. See PR14725.
480  void fixLCSSAPHIs();
481 
482  /// Iteratively sink the scalarized operands of a predicated instruction into
483  /// the block that was created for it.
484  void sinkScalarOperands(Instruction *PredInst);
485 
486  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
487  /// represented as.
489 
490  /// Insert the new loop to the loop hierarchy and pass manager
491  /// and update the analysis passes.
492  void updateAnalysis();
493 
494  /// Create a broadcast instruction. This method generates a broadcast
495  /// instruction (shuffle) for loop invariant values and for the induction
496  /// value. If this is the induction variable then we extend it to N, N+1, ...
497  /// this is needed because each iteration in the loop corresponds to a SIMD
498  /// element.
499  virtual Value *getBroadcastInstrs(Value *V);
500 
501  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
502  /// to each vector element of Val. The sequence starts at StartIdx.
503  /// \p Opcode is relevant for FP induction variable.
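///
/// For example (illustrative): with Val = <x, x, x, x>, StartIdx = 0 and
/// Step = s, the result is <x, x + s, x + 2*s, x + 3*s>.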
504  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
505  Instruction::BinaryOps Opcode =
506  Instruction::BinaryOpsEnd);
507 
508  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
509  /// variable on which to base the steps, \p Step is the size of the step, and
510  /// \p EntryVal is the value from the original loop that maps to the steps.
511  /// Note that \p EntryVal doesn't have to be an induction variable - it
512  /// can also be a truncate instruction.
513  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
514  const InductionDescriptor &ID);
515 
516  /// Create a vector induction phi node based on an existing scalar one. \p
517  /// EntryVal is the value from the original loop that maps to the vector phi
518  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
519  /// truncate instruction, instead of widening the original IV, we widen a
520  /// version of the IV truncated to \p EntryVal's type.
522  Value *Step, Instruction *EntryVal);
523 
524  /// Returns true if an instruction \p I should be scalarized instead of
525  /// vectorized for the chosen vectorization factor.
527 
528  /// Returns true if we should generate a scalar version of \p IV.
529  bool needsScalarInduction(Instruction *IV) const;
530 
531  /// If there is a cast involved in the induction variable \p ID, which should
532  /// be ignored in the vectorized loop body, this function records the
533  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
534  /// cast. We had already proved that the casted Phi is equal to the uncasted
535  /// Phi in the vectorized loop (under a runtime guard), and therefore
536  /// there is no need to vectorize the cast - the same value can be used in the
537  /// vector loop for both the Phi and the cast.
538  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
539  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
540  ///
541  /// \p EntryVal is the value from the original loop that maps to the vector
542  /// phi node and is used to distinguish what is the IV currently being
543  /// processed - original one (if \p EntryVal is a phi corresponding to the
544  /// original IV) or the "newly-created" one based on the proof mentioned above
545  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
546  /// latter case \p EntryVal is a TruncInst and we must not record anything for
547  /// that IV, but it's error-prone to expect callers of this routine to care
548  /// about that, hence this explicit parameter.
550  const Instruction *EntryVal,
551  Value *VectorLoopValue,
552  unsigned Part,
553  unsigned Lane = UINT_MAX);
554 
555  /// Generate a shuffle sequence that will reverse the vector Vec.
556  virtual Value *reverseVector(Value *Vec);
557 
558  /// Returns (and creates if needed) the original loop trip count.
559  Value *getOrCreateTripCount(Loop *NewLoop);
560 
561  /// Returns (and creates if needed) the trip count of the widened loop.
563 
564  /// Returns a bitcasted value to the requested vector type.
565  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
567  const DataLayout &DL);
568 
569  /// Emit a bypass check to see if the vector trip count is zero, including if
570  /// it overflows.
572 
573  /// Emit a bypass check to see if all of the SCEV assumptions we've
574  /// had to make are correct.
575  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
576 
577  /// Emit bypass checks to check any memory assumptions we may have made.
578  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
579 
580  /// Compute the transformed value of Index at offset StartValue using step
581  /// StepValue.
582  /// For integer induction, returns StartValue + Index * StepValue.
583  /// For pointer induction, returns StartValue[Index * StepValue].
584  /// FIXME: The newly created binary instructions should contain nsw/nuw
585  /// flags, which can be found from the original scalar operations.
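///
/// For example (illustrative): for an integer induction with StartValue = 7
/// and StepValue = 3, Index = 4 is transformed to 7 + 4 * 3 = 19; for a
/// pointer induction over an array p used as StartValue, the same Index maps
/// to &p[4 * StepValue].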
587  const DataLayout &DL,
588  const InductionDescriptor &ID) const;
589 
590  /// Add additional metadata to \p To that was not present on \p Orig.
591  ///
592  /// Currently this is used to add the noalias annotations based on the
593  /// inserted memchecks. Use this for instructions that are *cloned* into the
594  /// vector loop.
595  void addNewMetadata(Instruction *To, const Instruction *Orig);
596 
597  /// Add metadata from one instruction to another.
598  ///
599  /// This includes both the original MDs from \p From and additional ones (\see
600  /// addNewMetadata). Use this for *newly created* instructions in the vector
601  /// loop.
603 
604  /// Similar to the previous function but it adds the metadata to a
605  /// vector of instructions.
607 
608  /// The original loop.
610 
611  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
612  /// dynamic knowledge to simplify SCEV expressions and converts them to a
613  /// more usable form.
615 
616  /// Loop Info.
618 
619  /// Dominator Tree.
621 
622  /// Alias Analysis.
624 
625  /// Target Library Info.
627 
628  /// Target Transform Info.
630 
631  /// Assumption Cache.
633 
634  /// Interface to emit optimization remarks.
636 
637  /// LoopVersioning. It's only set up (non-null) if memchecks were
638  /// used.
639  ///
640  /// This is currently only used to add no-alias metadata based on the
641  /// memchecks. The actual versioning is performed manually.
642  std::unique_ptr<LoopVersioning> LVer;
643 
644  /// The vectorization SIMD factor to use. Each vector will have this many
645  /// vector elements.
646  unsigned VF;
647 
648  /// The vectorization unroll factor to use. Each scalar is vectorized to this
649  /// many different vector instructions.
650  unsigned UF;
651 
652  /// The builder that we use
654 
655  // --- Vectorization state ---
656 
657  /// The vector-loop preheader.
659 
660  /// The scalar-loop preheader.
662 
663  /// Middle Block between the vector and the scalar.
665 
666  /// The ExitBlock of the scalar loop.
668 
669  /// The vector loop body.
671 
672  /// The scalar loop body.
674 
675  /// A list of all bypass blocks. The first block is the entry of the loop.
677 
678  /// The new Induction variable which was added to the new block.
679  PHINode *Induction = nullptr;
680 
681  /// The induction variable of the old basic block.
682  PHINode *OldInduction = nullptr;
683 
684  /// Maps values from the original loop to their corresponding values in the
685  /// vectorized loop. A key value can map to either vector values, scalar
686  /// values or both kinds of values, depending on whether the key was
687  /// vectorized and scalarized.
689 
690  /// Store instructions that were predicated.
692 
693  /// Trip count of the original loop.
694  Value *TripCount = nullptr;
695 
696  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
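///
/// For example, TripCount = 100, VF = 4 and UF = 2 give
/// VectorTripCount = 100 - 100 % 8 = 96.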
697  Value *VectorTripCount = nullptr;
698 
699  /// The legality analysis.
701 
702  /// The profitability analysis.
704 
705  // Record whether runtime checks are added.
706  bool AddedSafetyChecks = false;
707 
708  // Holds the end values for each induction variable. We save the end values
709  // so we can later fix-up the external users of the induction variables.
711 
712  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
713  // fixed up at the end of vector code generation.
715 };
716 
717 class InnerLoopUnroller : public InnerLoopVectorizer {
718 public:
719  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
720  LoopInfo *LI, DominatorTree *DT,
721  const TargetLibraryInfo *TLI,
722  const TargetTransformInfo *TTI, AssumptionCache *AC,
723  OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
724  LoopVectorizationLegality *LVL,
725  LoopVectorizationCostModel *CM)
726  : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
727  UnrollFactor, LVL, CM) {}
728 
729 private:
730  Value *getBroadcastInstrs(Value *V) override;
731  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
732  Instruction::BinaryOps Opcode =
733  Instruction::BinaryOpsEnd) override;
734  Value *reverseVector(Value *Vec) override;
735 };
736 
737 } // end namespace llvm
738 
739 /// Look for a meaningful debug location on the instruction or its
740 /// operands.
741 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
742  if (!I)
743  return I;
744 
745  DebugLoc Empty;
746  if (I->getDebugLoc() != Empty)
747  return I;
748 
749  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
750  if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
751  if (OpInst->getDebugLoc() != Empty)
752  return OpInst;
753  }
754 
755  return I;
756 }
757 
758 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
759  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
760  const DILocation *DIL = Inst->getDebugLoc();
761  if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
762  !isa<DbgInfoIntrinsic>(Inst)) {
763  auto NewDIL = DIL->cloneWithDuplicationFactor(UF * VF);
764  if (NewDIL)
765  B.SetCurrentDebugLocation(NewDIL.getValue());
766  else
767  LLVM_DEBUG(dbgs()
768  << "Failed to create new discriminator: "
769  << DIL->getFilename() << " Line: " << DIL->getLine());
770  }
771  else
772  B.SetCurrentDebugLocation(DIL);
773  } else
774  B.SetCurrentDebugLocation(DebugLoc());
775 }
776 
777 #ifndef NDEBUG
778 /// \return string containing a file name and a line # for the given loop.
779 static std::string getDebugLocString(const Loop *L) {
780  std::string Result;
781  if (L) {
782  raw_string_ostream OS(Result);
783  if (const DebugLoc LoopDbgLoc = L->getStartLoc())
784  LoopDbgLoc.print(OS);
785  else
786  // Just print the module name.
787  OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
788  OS.flush();
789  }
790  return Result;
791 }
792 #endif
793 
794 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
795  const Instruction *Orig) {
796  // If the loop was versioned with memchecks, add the corresponding no-alias
797  // metadata.
798  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
799  LVer->annotateInstWithNoAlias(To, Orig);
800 }
801 
802 void InnerLoopVectorizer::addMetadata(Instruction *To,
803  Instruction *From) {
804  propagateMetadata(To, From);
805  addNewMetadata(To, From);
806 }
807 
808 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
809  Instruction *From) {
810  for (Value *V : To) {
811  if (Instruction *I = dyn_cast<Instruction>(V))
812  addMetadata(I, From);
813  }
814 }
815 
816 namespace llvm {
817 
818 /// LoopVectorizationCostModel - estimates the expected speedups due to
819 /// vectorization.
820 /// In many cases vectorization is not profitable. This can happen because of
821 /// a number of reasons. In this class we mainly attempt to predict the
822 /// expected speedup/slowdowns due to the supported instruction set. We use the
823 /// TargetTransformInfo to query the different backends for the cost of
824 /// different operations.
825 class LoopVectorizationCostModel {
826 public:
827  LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
828  LoopInfo *LI, LoopVectorizationLegality *Legal,
829  const TargetTransformInfo &TTI,
830  const TargetLibraryInfo *TLI, DemandedBits *DB,
831  AssumptionCache *AC,
832  OptimizationRemarkEmitter *ORE, const Function *F,
833  const LoopVectorizeHints *Hints,
834  InterleavedAccessInfo &IAI)
835  : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
836  AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
837 
838  /// \return An upper bound for the vectorization factor, or None if
839  /// vectorization should be avoided up front.
840  Optional<unsigned> computeMaxVF(bool OptForSize);
841 
842  /// \return The most profitable vectorization factor and the cost of that VF.
843  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
844  /// then this vectorization factor will be selected if vectorization is
845  /// possible.
846  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
847 
848  /// Setup cost-based decisions for user vectorization factor.
849  void selectUserVectorizationFactor(unsigned UserVF) {
850  collectUniformsAndScalars(UserVF);
851  collectInstsToScalarize(UserVF);
852  }
853 
854  /// \return The size (in bits) of the smallest and widest types in the code
855  /// that needs to be vectorized. We ignore values that remain scalar such as
856  /// 64 bit loop indices.
857  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
858 
859  /// \return The desired interleave count.
860  /// If interleave count has been specified by metadata it will be returned.
861  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
862  /// are the selected vectorization factor and the cost of the selected VF.
863  unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
864  unsigned LoopCost);
865 
866  /// A memory access instruction may be vectorized in more than one way.
867  /// The form of the instruction after vectorization depends on its cost.
868  /// This function takes cost-based decisions for Load/Store instructions
869  /// and collects them in a map. This decision map is used for building
870  /// the lists of loop-uniform and loop-scalar instructions.
871  /// The calculated cost is saved with widening decision in order to
872  /// avoid redundant calculations.
873  void setCostBasedWideningDecision(unsigned VF);
874 
875  /// A struct that represents some properties of the register usage
876  /// of a loop.
877  struct RegisterUsage {
878  /// Holds the number of loop invariant values that are used in the loop.
880 
881  /// Holds the maximum number of concurrent live intervals in the loop.
882  unsigned MaxLocalUsers;
883  };
884 
885  /// \return Returns information about the register usages of the loop for the
886  /// given vectorization factors.
887  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
888 
889  /// Collect values we want to ignore in the cost model.
890  void collectValuesToIgnore();
891 
892  /// \returns The smallest bitwidth each instruction can be represented with.
893  /// The vector equivalents of these instructions should be truncated to this
894  /// type.
896  return MinBWs;
897  }
898 
899  /// \returns True if it is more profitable to scalarize instruction \p I for
900  /// vectorization factor \p VF.
901  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
902  assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
903 
904  // Cost model is not run in the VPlan-native path - return conservative
905  // result until this changes.
906  if (EnableVPlanNativePath)
907  return false;
908 
909  auto Scalars = InstsToScalarize.find(VF);
910  assert(Scalars != InstsToScalarize.end() &&
911  "VF not yet analyzed for scalarization profitability");
912  return Scalars->second.find(I) != Scalars->second.end();
913  }
914 
915  /// Returns true if \p I is known to be uniform after vectorization.
916  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
917  if (VF == 1)
918  return true;
919 
920  // Cost model is not run in the VPlan-native path - return conservative
921  // result until this changes.
922  if (EnableVPlanNativePath)
923  return false;
924 
925  auto UniformsPerVF = Uniforms.find(VF);
926  assert(UniformsPerVF != Uniforms.end() &&
927  "VF not yet analyzed for uniformity");
928  return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
929  }
930 
931  /// Returns true if \p I is known to be scalar after vectorization.
932  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
933  if (VF == 1)
934  return true;
935 
936  // Cost model is not run in the VPlan-native path - return conservative
937  // result until this changes.
938  if (EnableVPlanNativePath)
939  return false;
940 
941  auto ScalarsPerVF = Scalars.find(VF);
942  assert(ScalarsPerVF != Scalars.end() &&
943  "Scalar values are not calculated for VF");
944  return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
945  }
946 
947  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
948  /// for vectorization factor \p VF.
949  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
950  return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
951  !isProfitableToScalarize(I, VF) &&
952  !isScalarAfterVectorization(I, VF);
953  }
954 
955  /// Decision that was taken during cost calculation for memory instruction.
958  CM_Widen, // For consecutive accesses with stride +1.
959  CM_Widen_Reverse, // For consecutive accesses with stride -1.
962  CM_Scalarize
963  };
964 
965  /// Save vectorization decision \p W and \p Cost taken by the cost model for
966  /// instruction \p I and vector width \p VF.
968  unsigned Cost) {
969  assert(VF >= 2 && "Expected VF >=2");
970  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
971  }
972 
973  /// Save vectorization decision \p W and \p Cost taken by the cost model for
974  /// interleaving group \p Grp and vector width \p VF.
975  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
976  InstWidening W, unsigned Cost) {
977  assert(VF >= 2 && "Expected VF >=2");
978  /// Broadcast this decicion to all instructions inside the group.
979  /// But the cost will be assigned to one instruction only.
980  for (unsigned i = 0; i < Grp->getFactor(); ++i) {
981  if (auto *I = Grp->getMember(i)) {
982  if (Grp->getInsertPos() == I)
983  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
984  else
985  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
986  }
987  }
988  }
989 
990  /// Return the cost model decision for the given instruction \p I and vector
991  /// width \p VF. Return CM_Unknown if this instruction did not pass
992  /// through the cost modeling.
994  assert(VF >= 2 && "Expected VF >=2");
995 
996  // Cost model is not run in the VPlan-native path - return conservative
997  // result until this changes.
998  if (EnableVPlanNativePath)
999  return CM_GatherScatter;
1000 
1001  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1002  auto Itr = WideningDecisions.find(InstOnVF);
1003  if (Itr == WideningDecisions.end())
1004  return CM_Unknown;
1005  return Itr->second.first;
1006  }
1007 
1008  /// Return the vectorization cost for the given instruction \p I and vector
1009  /// width \p VF.
1010  unsigned getWideningCost(Instruction *I, unsigned VF) {
1011  assert(VF >= 2 && "Expected VF >=2");
1012  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1013  assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1014  "The cost is not calculated");
1015  return WideningDecisions[InstOnVF].second;
1016  }
1017 
1018  /// Return True if instruction \p I is an optimizable truncate whose operand
1019  /// is an induction variable. Such a truncate will be removed by adding a new
1020  /// induction variable with the destination type.
1021  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1022  // If the instruction is not a truncate, return false.
1023  auto *Trunc = dyn_cast<TruncInst>(I);
1024  if (!Trunc)
1025  return false;
1026 
1027  // Get the source and destination types of the truncate.
1028  Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1029  Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1030 
1031  // If the truncate is free for the given types, return false. Replacing a
1032  // free truncate with an induction variable would add an induction variable
1033  // update instruction to each iteration of the loop. We exclude from this
1034  // check the primary induction variable since it will need an update
1035  // instruction regardless.
1036  Value *Op = Trunc->getOperand(0);
1037  if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1038  return false;
1039 
1040  // If the truncated value is not an induction variable, return false.
1041  return Legal->isInductionPhi(Op);
1042  }
1043 
1044  /// Collects the instructions to scalarize for each predicated instruction in
1045  /// the loop.
1046  void collectInstsToScalarize(unsigned VF);
1047 
1048  /// Collect Uniform and Scalar values for the given \p VF.
1049  /// The sets depend on CM decision for Load/Store instructions
1050  /// that may be vectorized as interleave, gather-scatter or scalarized.
1051  void collectUniformsAndScalars(unsigned VF) {
1052  // Do the analysis once.
1053  if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1054  return;
1055  setCostBasedWideningDecision(VF);
1056  collectLoopUniforms(VF);
1057  collectLoopScalars(VF);
1058  }
1059 
1060  /// Returns true if the target machine supports masked store operation
1061  /// for the given \p DataType and kind of access to \p Ptr.
1063  return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
1064  }
1065 
1066  /// Returns true if the target machine supports masked load operation
1067  /// for the given \p DataType and kind of access to \p Ptr.
1069  return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
1070  }
1071 
1072  /// Returns true if the target machine supports masked scatter operation
1073  /// for the given \p DataType.
1075  return TTI.isLegalMaskedScatter(DataType);
1076  }
1077 
1078  /// Returns true if the target machine supports masked gather operation
1079  /// for the given \p DataType.
1081  return TTI.isLegalMaskedGather(DataType);
1082  }
1083 
1084  /// Returns true if the target machine can represent \p V as a masked gather
1085  /// or scatter operation.
1087  bool LI = isa<LoadInst>(V);
1088  bool SI = isa<StoreInst>(V);
1089  if (!LI && !SI)
1090  return false;
1091  auto *Ty = getMemInstValueType(V);
1092  return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1093  }
1094 
1095  /// Returns true if \p I is an instruction that will be scalarized with
1096  /// predication. Such instructions include conditional stores and
1097  /// instructions that may divide by zero.
1098  /// If a non-zero VF has been calculated, we check if I will be scalarized
1099  /// If a non-zero VF has been calculated, we check if I will be scalarized
1100  /// with predication for that VF.
1101 
1102  // Returns true if \p I is an instruction that will be predicated either
1103  // through scalar predication or masked load/store or masked gather/scatter.
1104  // Superset of instructions that return true for isScalarWithPredication.
1106  if (!blockNeedsPredication(I->getParent()))
1107  return false;
1108  // Loads and stores that need some form of masked operation are predicated
1109  // instructions.
1110  if (isa<LoadInst>(I) || isa<StoreInst>(I))
1111  return Legal->isMaskRequired(I);
1112  return isScalarWithPredication(I);
1113  }
1114 
1115  /// Returns true if \p I is a memory instruction with consecutive memory
1116  /// access that can be widened.
1117  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1118 
1119  /// Returns true if \p I is a memory instruction in an interleaved-group
1120  /// of memory accesses that can be vectorized with wide vector loads/stores
1121  /// and shuffles.
1122  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1123 
1124  /// Check if \p Instr belongs to any interleaved access group.
1126  return InterleaveInfo.isInterleaved(Instr);
1127  }
1128 
1129  /// Get the interleaved access group that \p Instr belongs to.
1132  return InterleaveInfo.getInterleaveGroup(Instr);
1133  }
1134 
1135  /// Returns true if an interleaved group requires a scalar iteration
1136  /// to handle accesses with gaps, and there is nothing preventing us from
1137  /// creating a scalar epilogue.
1138  bool requiresScalarEpilogue() const {
1139  return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue();
1140  }
1141 
1142  /// Returns true if a scalar epilogue is not allowed due to optsize.
1143  bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; }
1144 
1145  /// Returns true if all loop blocks should be masked to fold tail loop.
1146  bool foldTailByMasking() const { return FoldTailByMasking; }
1147 
1149  return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1150  }
1151 
1152 private:
1153  unsigned NumPredStores = 0;
1154 
1155  /// \return An upper bound for the vectorization factor, larger than zero.
1156  /// One is returned if vectorization should best be avoided due to cost.
1157  unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
1158 
1159  /// The vectorization cost is a combination of the cost itself and a boolean
1160  /// indicating whether any of the contributing operations will actually
1161  /// operate on vector values after type legalization in the backend.
1162  /// If this latter value is false, then all operations will be scalarized
1163  /// (i.e. no vectorization has actually taken place).
1166  using VectorizationCostTy = std::pair<unsigned, bool>;
1167 
1168  /// Returns the expected execution cost. The unit of the cost does
1169  /// not matter because we use the 'cost' units to compare different
1170  /// vector widths. The cost that is returned is *not* normalized by
1171  /// the factor width.
1172  VectorizationCostTy expectedCost(unsigned VF);
1173 
1174  /// Returns the execution time cost of an instruction for a given vector
1175  /// width. Vector width of one means scalar.
1176  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1177 
1178  /// The cost-computation logic from getInstructionCost which provides
1179  /// the vector type as an output parameter.
1180  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1181 
1182  /// Calculate vectorization cost of memory instruction \p I.
1183  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1184 
1185  /// The cost computation for scalarized memory instruction.
1186  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1187 
1188  /// The cost computation for interleaving group of memory instructions.
1189  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1190 
1191  /// The cost computation for Gather/Scatter instruction.
1192  unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1193 
1194  /// The cost computation for widening instruction \p I with consecutive
1195  /// memory access.
1196  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1197 
1198  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1199  /// Load: scalar load + broadcast.
1200  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1201  /// element)
1202  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1203 
1204  /// Returns whether the instruction is a load or store and will be emitted
1205  /// as a vector operation.
1206  bool isConsecutiveLoadOrStore(Instruction *I);
1207 
1208  /// Returns true if an artificially high cost for emulated masked memrefs
1209  /// should be used.
1210  bool useEmulatedMaskMemRefHack(Instruction *I);
1211 
1212  /// Create an analysis remark that explains why vectorization failed
1213  ///
1214  /// \p RemarkName is the identifier for the remark. \return the remark object
1215  /// that can be streamed to.
1216  OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
1217  return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(),
1218  RemarkName, TheLoop);
1219  }
1220 
1221  /// Map of scalar integer values to the smallest bitwidth they can be legally
1222  /// represented as. The vector equivalents of these values should be truncated
1223  /// to this type.
1225 
1226  /// A type representing the costs for instructions if they were to be
1227  /// scalarized rather than vectorized. The entries are Instruction-Cost
1228  /// pairs.
1230 
1231  /// A set containing all BasicBlocks that are known to be present after
1232  /// vectorization as predicated blocks.
1233  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1234 
1235  /// Records whether it is allowed to have the original scalar loop execute at
1236  /// least once. This may be needed as a fallback loop in case runtime
1237  /// aliasing/dependence checks fail, or to handle the tail/remainder
1238  /// iterations when the trip count is unknown or doesn't divide by the VF,
1239  /// or as a peel-loop to handle gaps in interleave-groups.
1240  /// Under optsize and when the trip count is very small we don't allow any
1241  /// iterations to execute in the scalar loop.
1242  bool IsScalarEpilogueAllowed = true;
1243 
1244  /// All blocks of loop are to be masked to fold tail of scalar iterations.
1245  bool FoldTailByMasking = false;
1246 
1247  /// A map holding scalar costs for different vectorization factors. The
1248  /// presence of a cost for an instruction in the mapping indicates that the
1249  /// instruction will be scalarized when vectorizing with the associated
1250  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1251  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1252 
1253  /// Holds the instructions known to be uniform after vectorization.
1254  /// The data is collected per VF.
1256 
1257  /// Holds the instructions known to be scalar after vectorization.
1258  /// The data is collected per VF.
1260 
1261  /// Holds the instructions (address computations) that are forced to be
1262  /// scalarized.
1264 
1265  /// Returns the expected difference in cost from scalarizing the expression
1266  /// feeding a predicated instruction \p PredInst. The instructions to
1267  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1268  /// non-negative return value implies the expression will be scalarized.
1269  /// Currently, only single-use chains are considered for scalarization.
1270  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1271  unsigned VF);
1272 
1273  /// Collect the instructions that are uniform after vectorization. An
1274  /// instruction is uniform if we represent it with a single scalar value in
1275  /// the vectorized loop corresponding to each vector iteration. Examples of
1276  /// uniform instructions include pointer operands of consecutive or
1277  /// interleaved memory accesses. Note that although uniformity implies an
1278  /// instruction will be scalar, the reverse is not true. In general, a
1279  /// scalarized instruction will be represented by VF scalar values in the
1280  /// vectorized loop, each corresponding to an iteration of the original
1281  /// scalar loop.
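///
/// For example (illustrative): the pointer operand of a consecutive load is
/// needed only once per vector iteration (a single scalar address feeding one
/// wide load), so it is uniform; a scalarized user of the induction variable,
/// in contrast, needs VF copies, one per lane.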
1282  void collectLoopUniforms(unsigned VF);
1283 
1284  /// Collect the instructions that are scalar after vectorization. An
1285  /// instruction is scalar if it is known to be uniform or will be scalarized
1286  /// during vectorization. Non-uniform scalarized instructions will be
1287  /// represented by VF values in the vectorized loop, each corresponding to an
1288  /// iteration of the original scalar loop.
1289  void collectLoopScalars(unsigned VF);
1290 
1291  /// Keeps cost model vectorization decision and cost for instructions.
1292  /// Right now it is used for memory instructions only.
1294  std::pair<InstWidening, unsigned>>;
1295 
1296  DecisionList WideningDecisions;
1297 
1298 public:
1299  /// The loop that we evaluate.
1301 
1302  /// Predicated scalar evolution analysis.
1304 
1305  /// Loop Info analysis.
1307 
1308  /// Vectorization legality.
1310 
1311  /// Vector target information.
1313 
1314  /// Target Library Info.
1316 
1317  /// Demanded bits analysis.
1319 
1320  /// Assumption cache.
1322 
1323  /// Interface to emit optimization remarks.
1325 
1327 
1328  /// Loop Vectorize Hint.
1330 
1331  /// The interleave access information contains groups of interleaved accesses
1332  /// with the same stride and close to each other.
1334 
1335  /// Values to ignore in the cost model.
1337 
1338  /// Values to ignore in the cost model when VF > 1.
1340 };
1341 
1342 } // end namespace llvm
1343 
1344 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1345 // vectorization. The loop needs to be annotated with #pragma omp simd
1346 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1347 // vector length information is not provided, vectorization is not considered
1348 // explicit. Interleave hints are not allowed either. These limitations will be
1349 // relaxed in the future.
1350 // Please note that we are currently forced to abuse the pragma 'clang
1351 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1352 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1353 // provides *explicit vectorization hints* (LV can bypass legal checks and
1354 // assume that vectorization is legal). However, both hints are implemented
1355 // using the same metadata (llvm.loop.vectorize, processed by
1356 // LoopVectorizeHints). This will be fixed in the future when the native IR
1357 // representation for pragma 'omp simd' is introduced.
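//
// An explicitly annotated outer loop accepted by this check looks roughly
// like the following (illustrative only; the exact pragma spelling is
// "#pragma clang loop ..." on the outer loop):
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < n; ++i)     // outer loop, annotated
//     for (int j = 0; j < m; ++j)   // inner loop
//       a[i][j] = b[i][j] + 1;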
1358 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1359  OptimizationRemarkEmitter *ORE) {
1360  assert(!OuterLp->empty() && "This is not an outer loop");
1361  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1362 
1363  // Only outer loops with an explicit vectorization hint are supported.
1364  // Unannotated outer loops are ignored.
1365  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1366  return false;
1367 
1368  Function *Fn = OuterLp->getHeader()->getParent();
1369  if (!Hints.allowVectorization(Fn, OuterLp,
1370  true /*VectorizeOnlyWhenForced*/)) {
1371  LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1372  return false;
1373  }
1374 
1375  if (!Hints.getWidth()) {
1376  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n");
1377  Hints.emitRemarkWithHints();
1378  return false;
1379  }
1380 
1381  if (Hints.getInterleave() > 1) {
1382  // TODO: Interleave support is future work.
1383  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1384  "outer loops.\n");
1385  Hints.emitRemarkWithHints();
1386  return false;
1387  }
1388 
1389  return true;
1390 }
1391 
1392 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1393  OptimizationRemarkEmitter *ORE,
1394  SmallVectorImpl<Loop *> &V) {
1395  // Collect inner loops and outer loops without irreducible control flow. For
1396  // now, only collect outer loops that have explicit vectorization hints. If we
1397  // are stress testing the VPlan H-CFG construction, we collect the outermost
1398  // loop of every loop nest.
1399  if (L.empty() || VPlanBuildStressTest ||
1400  (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1401  LoopBlocksRPO RPOT(&L);
1402  RPOT.perform(LI);
1403  if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1404  V.push_back(&L);
1405  // TODO: Collect inner loops inside marked outer loops in case
1406  // vectorization fails for the outer loop. Do not invoke
1407  // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1408  // already known to be reducible. We can use an inherited attribute for
1409  // that.
1410  return;
1411  }
1412  }
1413  for (Loop *InnerL : L)
1414  collectSupportedLoops(*InnerL, LI, ORE, V);
1415 }
1416 
1417 namespace {
1418 
1419 /// The LoopVectorize Pass.
1420 struct LoopVectorize : public FunctionPass {
1421  /// Pass identification, replacement for typeid
1422  static char ID;
1423 
1424  LoopVectorizePass Impl;
1425 
1426  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1427  bool VectorizeOnlyWhenForced = false)
1428  : FunctionPass(ID) {
1429  Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1430  Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1432  }
1433 
1434  bool runOnFunction(Function &F) override {
1435  if (skipFunction(F))
1436  return false;
1437 
1438  auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1439  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1440  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1441  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1442  auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1443  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1444  auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
1445  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1446  auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1447  auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1448  auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1449  auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1450 
1451  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1452  [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1453 
1454  return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1455  GetLAA, *ORE);
1456  }
1457 
1458  void getAnalysisUsage(AnalysisUsage &AU) const override {
1469 
1470  // We currently do not preserve loopinfo/dominator analyses with outer loop
1471  // vectorization. Until this is addressed, mark these analyses as preserved
1472  // only for non-VPlan-native path.
1473  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1474  if (!EnableVPlanNativePath) {
1477  }
1478 
1481  }
1482 };
1483 
1484 } // end anonymous namespace
1485 
1486 //===----------------------------------------------------------------------===//
1487 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1488 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1489 //===----------------------------------------------------------------------===//
1490 
1491 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1492  // We need to place the broadcast of invariant variables outside the loop,
1493  // but only if it's proven safe to do so. Else, broadcast will be inside
1494  // vector loop body.
1495  Instruction *Instr = dyn_cast<Instruction>(V);
1496  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1497  (!Instr ||
1498  DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1499  // Place the code for broadcasting invariant variables in the new preheader.
1500  IRBuilder<>::InsertPointGuard Guard(Builder);
1501  if (SafeToHoist)
1502  Builder.SetInsertPoint(&*LoopVectorPreHeader->getFirstInsertionPt());
1503 
1504  // Broadcast the scalar into all locations in the vector.
1505  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1506 
1507  return Shuf;
1508 }
1509 
1510 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1511  const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1512  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1513  "Expected either an induction phi-node or a truncate of it!");
1514  Value *Start = II.getStartValue();
1515 
1516  // Construct the initial value of the vector IV in the vector loop preheader
1517  auto CurrIP = Builder.saveIP();
1519  if (isa<TruncInst>(EntryVal)) {
1520  assert(Start->getType()->isIntegerTy() &&
1521  "Truncation requires an integer type");
1522  auto *TruncType = cast<IntegerType>(EntryVal->getType());
1523  Step = Builder.CreateTrunc(Step, TruncType);
1524  Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1525  }
1526  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1527  Value *SteppedStart =
1528  getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1529 
1530  // We create vector phi nodes for both integer and floating-point induction
1531  // variables. Here, we determine the kind of arithmetic we will perform.
1532  Instruction::BinaryOps AddOp;
1533  Instruction::BinaryOps MulOp;
1534  if (Step->getType()->isIntegerTy()) {
1535  AddOp = Instruction::Add;
1536  MulOp = Instruction::Mul;
1537  } else {
1538  AddOp = II.getInductionOpcode();
1539  MulOp = Instruction::FMul;
1540  }
1541 
1542  // Multiply the vectorization factor by the step using integer or
1543  // floating-point arithmetic as appropriate.
1544  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1545  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1546 
1547  // Create a vector splat to use in the induction update.
1548  //
1549  // FIXME: If the step is non-constant, we create the vector splat with
1550  // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1551  // handle a constant vector splat.
1552  Value *SplatVF = isa<Constant>(Mul)
1553  ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1554  : Builder.CreateVectorSplat(VF, Mul);
1555  Builder.restoreIP(CurrIP);
1556 
1557  // We may need to add the step a number of times, depending on the unroll
1558  // factor. The last of those goes into the PHI.
1559  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1560  &*LoopVectorBody->getFirstInsertionPt());
1561  VecInd->setDebugLoc(EntryVal->getDebugLoc());
1562  Instruction *LastInduction = VecInd;
1563  for (unsigned Part = 0; Part < UF; ++Part) {
1564  VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1565 
1566  if (isa<TruncInst>(EntryVal))
1567  addMetadata(LastInduction, EntryVal);
1568  recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1569 
1570  LastInduction = cast<Instruction>(addFastMathFlag(
1571  Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1572  LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1573  }
1574 
1575  // Move the last step to the end of the latch block. This ensures consistent
1576  // placement of all induction updates.
1577  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1578  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1579  auto *ICmp = cast<Instruction>(Br->getCondition());
1580  LastInduction->moveBefore(ICmp);
1581  LastInduction->setName("vec.ind.next");
1582 
1583  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1584  VecInd->addIncoming(LastInduction, LoopVectorLatch);
1585 }
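// A minimal standalone sketch of the arithmetic the widened IV above
// implements, in plain C++ (the helper name and parameters are hypothetical,
// not part of the vectorizer). Lane `Lane` of unroll part `Part` at
// vector-loop iteration `Iter` holds Start + (Iter*VF*UF + Part*VF + Lane) * Step,
// which is what SteppedStart plus the repeated "step.add" updates produce.
static void vectorIVPartLanes(long long Start, long long Step, unsigned VF,
                              unsigned UF, unsigned Iter, unsigned Part,
                              long long *Lanes) {
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    Lanes[Lane] =
        Start + (long long)(Iter * VF * UF + Part * VF + Lane) * Step;
}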
1586 
1587 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1588  return Cost->isScalarAfterVectorization(I, VF) ||
1589  Cost->isProfitableToScalarize(I, VF);
1590 }
1591 
1592 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1593  if (shouldScalarizeInstruction(IV))
1594  return true;
1595  auto isScalarInst = [&](User *U) -> bool {
1596  auto *I = cast<Instruction>(U);
1597  return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1598  };
1599  return llvm::any_of(IV->users(), isScalarInst);
1600 }
1601 
1602 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1603  const InductionDescriptor &ID, const Instruction *EntryVal,
1604  Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1605  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1606  "Expected either an induction phi-node or a truncate of it!");
1607 
1608  // This induction variable is not the phi from the original loop but the
1609  // newly-created IV based on the proof that the casted Phi is equal to the
1610  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
1611  // reuses the same InductionDescriptor as the original IV, but we don't have
1612  // to do any recording in this case - that is done when the original IV is
1613  // processed.
1614  if (isa<TruncInst>(EntryVal))
1615  return;
1616 
1617  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1618  if (Casts.empty())
1619  return;
1620  // Only the first Cast instruction in the Casts vector is of interest.
1621  // The rest of the Casts (if they exist) have no uses outside the
1622  // induction update chain itself.
1623  Instruction *CastInst = *Casts.begin();
1624  if (Lane < UINT_MAX)
1625  VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1626  else
1627  VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1628 }
1629 
1631  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1632  "Primary induction variable must have an integer type");
1633 
1634  auto II = Legal->getInductionVars()->find(IV);
1635  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1636 
1637  auto ID = II->second;
1638  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1639 
1640  // The scalar value to broadcast. This will be derived from the canonical
1641  // induction variable.
1642  Value *ScalarIV = nullptr;
1643 
1644  // The value from the original loop to which we are mapping the new induction
1645  // variable.
1646  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1647 
1648  // True if we have vectorized the induction variable.
1649  auto VectorizedIV = false;
1650 
1651  // Determine if we want a scalar version of the induction variable. This is
1652  // true if the induction variable itself is not widened, or if it has at
1653  // least one user in the loop that is not widened.
1654  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1655 
1656  // Generate code for the induction step. Note that induction steps are
1657  // required to be loop-invariant.
1658  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1659  "Induction step should be loop invariant");
1660  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1661  Value *Step = nullptr;
1662  if (PSE.getSE()->isSCEVable(IV->getType())) {
1663  SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1664  Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1665  &*Builder.GetInsertPoint());
1666  } else {
1667  Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1668  }
1669 
1670  // Try to create a new independent vector induction variable. If we can't
1671  // create the phi node, we will splat the scalar induction variable in each
1672  // loop iteration.
1673  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1674  createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1675  VectorizedIV = true;
1676  }
1677 
1678  // If we haven't yet vectorized the induction variable, or if we will create
1679  // a scalar one, we need to define the scalar induction variable and step
1680  // values. If we were given a truncation type, truncate the canonical
1681  // induction variable and step. Otherwise, derive these values from the
1682  // induction descriptor.
1683  if (!VectorizedIV || NeedsScalarIV) {
1684  ScalarIV = Induction;
1685  if (IV != OldInduction) {
1686  ScalarIV = IV->getType()->isIntegerTy()
1687  ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1688  : Builder.CreateCast(Instruction::SIToFP, Induction,
1689  IV->getType());
1690  ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1691  ScalarIV->setName("offset.idx");
1692  }
1693  if (Trunc) {
1694  auto *TruncType = cast<IntegerType>(Trunc->getType());
1695  assert(Step->getType()->isIntegerTy() &&
1696  "Truncation requires an integer step");
1697  ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1698  Step = Builder.CreateTrunc(Step, TruncType);
1699  }
1700  }
1701 
1702  // If we haven't yet vectorized the induction variable, splat the scalar
1703  // induction variable, and build the necessary step vectors.
1704  // TODO: Don't do it unless the vectorized IV is really required.
1705  if (!VectorizedIV) {
1706  Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1707  for (unsigned Part = 0; Part < UF; ++Part) {
1708  Value *EntryPart =
1709  getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1710  VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1711  if (Trunc)
1712  addMetadata(EntryPart, Trunc);
1713  recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1714  }
1715  }
1716 
1717  // If an induction variable is only used for counting loop iterations or
1718  // calculating addresses, it doesn't need to be widened. Create scalar steps
1719  // that can be used by instructions we will later scalarize. Note that the
1720  // addition of the scalar steps will not increase the number of instructions
1721  // in the loop in the common case prior to InstCombine. We will be trading
1722  // one vector extract for each scalar step.
1723  if (NeedsScalarIV)
1724  buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1725 }
1726 
1728  Instruction::BinaryOps BinOp) {
1729  // Create and check the types.
1730  assert(Val->getType()->isVectorTy() && "Must be a vector");
1731  int VLen = Val->getType()->getVectorNumElements();
1732 
1733  Type *STy = Val->getType()->getScalarType();
1734  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1735  "Induction Step must be an integer or FP");
1736  assert(Step->getType() == STy && "Step has wrong type");
1737 
1738  SmallVector<Constant *, 8> Indices;
1739 
1740  if (STy->isIntegerTy()) {
1741  // Create a vector of consecutive numbers from zero to VF.
1742  for (int i = 0; i < VLen; ++i)
1743  Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1744 
1745  // Add the consecutive indices to the vector value.
1746  Constant *Cv = ConstantVector::get(Indices);
1747  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1748  Step = Builder.CreateVectorSplat(VLen, Step);
1749  assert(Step->getType() == Val->getType() && "Invalid step vec");
1750  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1751  // which can be found from the original scalar operations.
1752  Step = Builder.CreateMul(Cv, Step);
1753  return Builder.CreateAdd(Val, Step, "induction");
1754  }
1755 
1756  // Floating point induction.
1757  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1758  "Binary Opcode should be specified for FP induction");
1759  // Create a vector of consecutive numbers from zero to VF.
1760  for (int i = 0; i < VLen; ++i)
1761  Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1762 
1763  // Add the consecutive indices to the vector value.
1764  Constant *Cv = ConstantVector::get(Indices);
1765 
1766  Step = Builder.CreateVectorSplat(VLen, Step);
1767 
1768  // Floating point operations had to be 'fast' to enable the induction.
1769  FastMathFlags Flags;
1770  Flags.setFast();
1771 
1772  Value *MulOp = Builder.CreateFMul(Cv, Step);
1773  if (isa<Instruction>(MulOp))
1774  // We have to check because MulOp may be a constant.
1775  cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1776 
1777  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1778  if (isa<Instruction>(BOp))
1779  cast<Instruction>(BOp)->setFastMathFlags(Flags);
1780  return BOp;
1781 }
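// A minimal scalar model of the element-wise computation above, assuming
// plain C++ (the helper name is hypothetical): lane i of the result is
// Val[i] + (StartIdx + i) * Step, for integer and FP inductions alike.
static void stepVectorLanes(const double *Val, int StartIdx, double Step,
                            int VLen, double *Out) {
  for (int i = 0; i < VLen; ++i)
    Out[i] = Val[i] + (double)(StartIdx + i) * Step;
}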
1782 
1784  Instruction *EntryVal,
1785  const InductionDescriptor &ID) {
1786  // We shouldn't have to build scalar steps if we aren't vectorizing.
1787  assert(VF > 1 && "VF should be greater than one");
1788 
1789  // Get the value type and ensure it and the step have the same integer type.
1790  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1791  assert(ScalarIVTy == Step->getType() &&
1792  "Val and Step should have the same type");
1793 
1794  // We build scalar steps for both integer and floating-point induction
1795  // variables. Here, we determine the kind of arithmetic we will perform.
1796  Instruction::BinaryOps AddOp;
1797  Instruction::BinaryOps MulOp;
1798  if (ScalarIVTy->isIntegerTy()) {
1799  AddOp = Instruction::Add;
1800  MulOp = Instruction::Mul;
1801  } else {
1802  AddOp = ID.getInductionOpcode();
1803  MulOp = Instruction::FMul;
1804  }
1805 
1806  // Determine the number of scalars we need to generate for each unroll
1807  // iteration. If EntryVal is uniform, we only need to generate the first
1808  // lane. Otherwise, we generate all VF values.
1809  unsigned Lanes =
1810  Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1811  : VF;
1812  // Compute the scalar steps and save the results in VectorLoopValueMap.
1813  for (unsigned Part = 0; Part < UF; ++Part) {
1814  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1815  auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1816  auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1817  auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1818  VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1819  recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1820  }
1821  }
1822 }
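// A minimal sketch of the scalar-step formula used above (hypothetical helper,
// plain C++): the value recorded for a given (Part, Lane) is
// ScalarIV + (VF * Part + Lane) * Step.
static double scalarStepValue(double ScalarIV, double Step, unsigned VF,
                              unsigned Part, unsigned Lane) {
  return ScalarIV + (double)(VF * Part + Lane) * Step;
}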
1823 
1825  assert(V != Induction && "The new induction variable should not be used.");
1826  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1827  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1828 
1829  // If we have a stride that is replaced by one, do it here. Defer this for
1830  // the VPlan-native path until we start running Legal checks in that path.
1832  V = ConstantInt::get(V->getType(), 1);
1833 
1834  // If we have a vector mapped to this value, return it.
1835  if (VectorLoopValueMap.hasVectorValue(V, Part))
1836  return VectorLoopValueMap.getVectorValue(V, Part);
1837 
1838  // If the value has not been vectorized, check if it has been scalarized
1839  // instead. If it has been scalarized, and we actually need the value in
1840  // vector form, we will construct the vector values on demand.
1842  Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
1843 
1844  // If we've scalarized a value, that value should be an instruction.
1845  auto *I = cast<Instruction>(V);
1846 
1847  // If we aren't vectorizing, we can just copy the scalar map values over to
1848  // the vector map.
1849  if (VF == 1) {
1850  VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
1851  return ScalarValue;
1852  }
1853 
1854  // Get the last scalar instruction we generated for V and Part. If the value
1855  // is known to be uniform after vectorization, this corresponds to lane zero
1856  // of the Part unroll iteration. Otherwise, the last instruction is the one
1857  // we created for the last vector lane of the Part unroll iteration.
1858  unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
1859  auto *LastInst = cast<Instruction>(
1860  VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
1861 
1862  // Set the insert point after the last scalarized instruction. This ensures
1863  // the insertelement sequence will directly follow the scalar definitions.
1864  auto OldIP = Builder.saveIP();
1865  auto NewIP = std::next(BasicBlock::iterator(LastInst));
1866  Builder.SetInsertPoint(&*NewIP);
1867 
1868  // However, if we are vectorizing, we need to construct the vector values.
1869  // If the value is known to be uniform after vectorization, we can just
1870  // broadcast the scalar value corresponding to lane zero for each unroll
1871  // iteration. Otherwise, we construct the vector values using insertelement
1872  // instructions. Since the resulting vectors are stored in
1873  // VectorLoopValueMap, we will only generate the insertelements once.
1874  Value *VectorValue = nullptr;
1876  VectorValue = getBroadcastInstrs(ScalarValue);
1877  VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
1878  } else {
1879  // Initialize packing with insertelements to start from undef.
1881  VectorLoopValueMap.setVectorValue(V, Part, Undef);
1882  for (unsigned Lane = 0; Lane < VF; ++Lane)
1883  packScalarIntoVectorValue(V, {Part, Lane});
1884  VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
1885  }
1886  Builder.restoreIP(OldIP);
1887  return VectorValue;
1888  }
1889 
1890  // If this scalar is unknown, assume that it is a constant or that it is
1891  // loop invariant. Broadcast V and save the value for future uses.
1892  Value *B = getBroadcastInstrs(V);
1893  VectorLoopValueMap.setVectorValue(V, Part, B);
1894  return B;
1895 }
1896 
1897 Value *
1899  const VPIteration &Instance) {
1900  // If the value is not an instruction contained in the loop, it should
1901  // already be scalar.
1902  if (OrigLoop->isLoopInvariant(V))
1903  return V;
1904 
1905  assert(Instance.Lane > 0
1906  ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
1907  : true && "Uniform values only have lane zero");
1908 
1909  // If the value from the original loop has not been vectorized, it is
1910  // represented by UF x VF scalar values in the new loop. Return the requested
1911  // scalar value.
1912  if (VectorLoopValueMap.hasScalarValue(V, Instance))
1913  return VectorLoopValueMap.getScalarValue(V, Instance);
1914 
1915  // If the value has not been scalarized, get its entry in VectorLoopValueMap
1916  // for the given unroll part. If this entry is not a vector type (i.e., the
1917  // vectorization factor is one), there is no need to generate an
1918  // extractelement instruction.
1919  auto *U = getOrCreateVectorValue(V, Instance.Part);
1920  if (!U->getType()->isVectorTy()) {
1921  assert(VF == 1 && "Value not scalarized has non-vector type");
1922  return U;
1923  }
1924 
1925  // Otherwise, the value from the original loop has been vectorized and is
1926  // represented by UF vector values. Extract and return the requested scalar
1927  // value from the appropriate vector lane.
1928  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
1929 }
1930 
1932  Value *V, const VPIteration &Instance) {
1933  assert(V != Induction && "The new induction variable should not be used.");
1934  assert(!V->getType()->isVectorTy() && "Can't pack a vector");
1935  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1936 
1937  Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
1938  Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
1939  VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
1940  Builder.getInt32(Instance.Lane));
1941  VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
1942 }
1943 
1945  assert(Vec->getType()->isVectorTy() && "Invalid type");
1946  SmallVector<Constant *, 8> ShuffleMask;
1947  for (unsigned i = 0; i < VF; ++i)
1948  ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
1949 
1950  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
1951  ConstantVector::get(ShuffleMask),
1952  "reverse");
1953 }
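// A minimal sketch of the reversal shuffle built above (hypothetical helper,
// plain C++): lane i of the result is lane VF - i - 1 of the input, i.e. the
// mask is <VF-1, ..., 1, 0> (for example <3, 2, 1, 0> when VF = 4).
static void reverseLanes(const int *In, unsigned VF, int *Out) {
  for (unsigned i = 0; i < VF; ++i)
    Out[i] = In[VF - i - 1];
}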
1954 
1955 // Return whether we allow using masked interleave-groups (for dealing with
1956 // strided loads/stores that reside in predicated blocks, or for dealing
1957 // with gaps).
1959  // If an override option has been passed in for interleaved accesses, use it.
1960  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
1962 
1964 }
1965 
1966 // Try to vectorize the interleave group that \p Instr belongs to.
1967 //
1968 // E.g. Translate the following interleaved load group (factor = 3):
1969 // for (i = 0; i < N; i+=3) {
1970 // R = Pic[i]; // Member of index 0
1971 // G = Pic[i+1]; // Member of index 1
1972 // B = Pic[i+2]; // Member of index 2
1973 // ... // do something to R, G, B
1974 // }
1975 // To:
1976 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
1977 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
1978 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
1979 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
1980 //
1981 // Or translate the following interleaved store group (factor = 3):
1982 // for (i = 0; i < N; i+=3) {
1983 // ... do something to R, G, B
1984 // Pic[i] = R; // Member of index 0
1985 // Pic[i+1] = G; // Member of index 1
1986 // Pic[i+2] = B; // Member of index 2
1987 // }
1988 // To:
1989 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
1990 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
1991 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
1992 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
1993 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
1995  VectorParts *BlockInMask) {
1996  const InterleaveGroup<Instruction> *Group =
1998  assert(Group && "Fail to get an interleaved access group.");
1999 
2000  // Skip if current instruction is not the insert position.
2001  if (Instr != Group->getInsertPos())
2002  return;
2003 
2004  const DataLayout &DL = Instr->getModule()->getDataLayout();
2005  Value *Ptr = getLoadStorePointerOperand(Instr);
2006 
2007  // Prepare for the vector type of the interleaved load/store.
2008  Type *ScalarTy = getMemInstValueType(Instr);
2009  unsigned InterleaveFactor = Group->getFactor();
2010  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2011  Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2012 
2013  // Prepare for the new pointers.
2015  SmallVector<Value *, 2> NewPtrs;
2016  unsigned Index = Group->getIndex(Instr);
2017 
2018  VectorParts Mask;
2019  bool IsMaskForCondRequired = BlockInMask;
2020  if (IsMaskForCondRequired) {
2021  Mask = *BlockInMask;
2022  // TODO: extend the masked interleaved-group support to reversed access.
2023  assert(!Group->isReverse() && "Reversed masked interleave-group "
2024  "not supported.");
2025  }
2026 
2027  // If the group is reverse, adjust the index to refer to the last vector lane
2028  // instead of the first. We adjust the index from the first vector lane,
2029  // rather than directly getting the pointer for lane VF - 1, because the
2030  // pointer operand of the interleaved access is supposed to be uniform. For
2031  // uniform instructions, we're only required to generate a value for the
2032  // first vector lane in each unroll iteration.
2033  if (Group->isReverse())
2034  Index += (VF - 1) * Group->getFactor();
2035 
2036  bool InBounds = false;
2037  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2038  InBounds = gep->isInBounds();
2039 
2040  for (unsigned Part = 0; Part < UF; Part++) {
2041  Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2042 
2043  // Note that the current instruction could be at any index in the group.
2044  // We need to adjust the address to the member of index 0.
2045  //
2046  // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2047  // b = A[i]; // Member of index 0
2048  // The current pointer points to A[i+1]; adjust it to A[i].
2049  //
2050  // E.g. A[i+1] = a; // Member of index 1
2051  // A[i] = b; // Member of index 0
2052  // A[i+2] = c; // Member of index 2 (Current instruction)
2053  // The current pointer points to A[i+2]; adjust it to A[i].
2054  NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
2055  if (InBounds)
2056  cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2057 
2058  // Cast to the vector pointer type.
2059  NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2060  }
2061 
2062  setDebugLocFromInst(Builder, Instr);
2063  Value *UndefVec = UndefValue::get(VecTy);
2064 
2065  Value *MaskForGaps = nullptr;
2066  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2067  MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2068  assert(MaskForGaps && "Mask for Gaps is required but it is null");
2069  }
2070 
2071  // Vectorize the interleaved load group.
2072  if (isa<LoadInst>(Instr)) {
2073  // For each unroll part, create a wide load for the group.
2074  SmallVector<Value *, 2> NewLoads;
2075  for (unsigned Part = 0; Part < UF; Part++) {
2076  Instruction *NewLoad;
2077  if (IsMaskForCondRequired || MaskForGaps) {
2079  "masked interleaved groups are not allowed.");
2080  Value *GroupMask = MaskForGaps;
2081  if (IsMaskForCondRequired) {
2082  auto *Undefs = UndefValue::get(Mask[Part]->getType());
2083  auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2084  Value *ShuffledMask = Builder.CreateShuffleVector(
2085  Mask[Part], Undefs, RepMask, "interleaved.mask");
2086  GroupMask = MaskForGaps
2087  ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2088  MaskForGaps)
2089  : ShuffledMask;
2090  }
2091  NewLoad =
2092  Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2093  GroupMask, UndefVec, "wide.masked.vec");
2094  }
2095  else
2096  NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part],
2097  Group->getAlignment(), "wide.vec");
2098  Group->addMetadata(NewLoad);
2099  NewLoads.push_back(NewLoad);
2100  }
2101 
2102  // For each member in the group, shuffle out the appropriate data from the
2103  // wide loads.
2104  for (unsigned I = 0; I < InterleaveFactor; ++I) {
2105  Instruction *Member = Group->getMember(I);
2106 
2107  // Skip the gaps in the group.
2108  if (!Member)
2109  continue;
2110 
2111  Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2112  for (unsigned Part = 0; Part < UF; Part++) {
2113  Value *StridedVec = Builder.CreateShuffleVector(
2114  NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2115 
2116  // If this member has a different type, cast the result to that type.
2117  if (Member->getType() != ScalarTy) {
2118  VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2119  StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2120  }
2121 
2122  if (Group->isReverse())
2123  StridedVec = reverseVector(StridedVec);
2124 
2125  VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2126  }
2127  }
2128  return;
2129  }
2130 
2131  // The sub vector type for the current instruction.
2132  VectorType *SubVT = VectorType::get(ScalarTy, VF);
2133 
2134  // Vectorize the interleaved store group.
2135  for (unsigned Part = 0; Part < UF; Part++) {
2136  // Collect the stored vector from each member.
2137  SmallVector<Value *, 4> StoredVecs;
2138  for (unsigned i = 0; i < InterleaveFactor; i++) {
2139  // An interleaved store group doesn't allow gaps, so each index has a member.
2140  Instruction *Member = Group->getMember(i);
2141  assert(Member && "Fail to get a member from an interleaved store group");
2142 
2143  Value *StoredVec = getOrCreateVectorValue(
2144  cast<StoreInst>(Member)->getValueOperand(), Part);
2145  if (Group->isReverse())
2146  StoredVec = reverseVector(StoredVec);
2147 
2148  // If this member has a different type, cast it to a unified type.
2149 
2150  if (StoredVec->getType() != SubVT)
2151  StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2152 
2153  StoredVecs.push_back(StoredVec);
2154  }
2155 
2156  // Concatenate all vectors into a wide vector.
2157  Value *WideVec = concatenateVectors(Builder, StoredVecs);
2158 
2159  // Interleave the elements in the wide vector.
2160  Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2161  Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2162  "interleaved.vec");
2163 
2164  Instruction *NewStoreInstr;
2165  if (IsMaskForCondRequired) {
2166  auto *Undefs = UndefValue::get(Mask[Part]->getType());
2167  auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2168  Value *ShuffledMask = Builder.CreateShuffleVector(
2169  Mask[Part], Undefs, RepMask, "interleaved.mask");
2170  NewStoreInstr = Builder.CreateMaskedStore(
2171  IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2172  }
2173  else
2174  NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2175  Group->getAlignment());
2176 
2177  Group->addMetadata(NewStoreInstr);
2178  }
2179 }
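// A minimal scalar sketch of the factor-3 load group described in the comment
// before this function, assuming plain C++ (the helper is hypothetical and
// fixes VF = 4): one consecutive "wide" read of 3 * VF elements followed by
// strided extractions that play the role of the <0,3,6,9>, <1,4,7,10> and
// <2,5,8,11> shuffles.
static void deinterleaveRGB(const int *Pic, int *R, int *G, int *B) {
  const unsigned VF = 4;
  int Wide[VF * 3];                 // %wide.vec: read 4 tuples of R,G,B
  for (unsigned i = 0; i < VF * 3; ++i)
    Wide[i] = Pic[i];
  for (unsigned Lane = 0; Lane < VF; ++Lane) {
    R[Lane] = Wide[Lane * 3 + 0];   // shuffle mask <0, 3, 6, 9>
    G[Lane] = Wide[Lane * 3 + 1];   // shuffle mask <1, 4, 7, 10>
    B[Lane] = Wide[Lane * 3 + 2];   // shuffle mask <2, 5, 8, 11>
  }
}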
2180 
2182  VectorParts *BlockInMask) {
2183  // Attempt to issue a wide load.
2184  LoadInst *LI = dyn_cast<LoadInst>(Instr);
2185  StoreInst *SI = dyn_cast<StoreInst>(Instr);
2186 
2187  assert((LI || SI) && "Invalid Load/Store instruction");
2188 
2190  Cost->getWideningDecision(Instr, VF);
2192  "CM decision should be taken at this point");
2194  return vectorizeInterleaveGroup(Instr);
2195 
2196  Type *ScalarDataTy = getMemInstValueType(Instr);
2197  Type *DataTy = VectorType::get(ScalarDataTy, VF);
2198  Value *Ptr = getLoadStorePointerOperand(Instr);
2199  unsigned Alignment = getLoadStoreAlignment(Instr);
2200  // An alignment of 0 means target ABI alignment. We need to use the scalar's
2201  // target ABI alignment in such a case.
2202  const DataLayout &DL = Instr->getModule()->getDataLayout();
2203  if (!Alignment)
2204  Alignment = DL.getABITypeAlignment(ScalarDataTy);
2205  unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2206 
2207  // Determine if the pointer operand of the access is either consecutive or
2208  // reverse consecutive.
2209  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2210  bool ConsecutiveStride =
2211  Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2212  bool CreateGatherScatter =
2214 
2215  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2216  // gather/scatter. Otherwise Decision should have been to Scalarize.
2217  assert((ConsecutiveStride || CreateGatherScatter) &&
2218  "The instruction should be scalarized");
2219 
2220  // Handle consecutive loads/stores.
2221  if (ConsecutiveStride)
2222  Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2223 
2224  VectorParts Mask;
2225  bool isMaskRequired = BlockInMask;
2226  if (isMaskRequired)
2227  Mask = *BlockInMask;
2228 
2229  bool InBounds = false;
2230  if (auto *gep = dyn_cast<GetElementPtrInst>(
2231  getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2232  InBounds = gep->isInBounds();
2233 
2234  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2235  // Calculate the pointer for the specific unroll-part.
2236  GetElementPtrInst *PartPtr = nullptr;
2237 
2238  if (Reverse) {
2239  // If the address is consecutive but reversed, then the
2240  // wide store needs to start at the last vector element.
2241  PartPtr = cast<GetElementPtrInst>(
2242  Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)));
2243  PartPtr->setIsInBounds(InBounds);
2244  PartPtr = cast<GetElementPtrInst>(
2245  Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)));
2246  PartPtr->setIsInBounds(InBounds);
2247  if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2248  Mask[Part] = reverseVector(Mask[Part]);
2249  } else {
2250  PartPtr = cast<GetElementPtrInst>(
2251  Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)));
2252  PartPtr->setIsInBounds(InBounds);
2253  }
2254 
2255  return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2256  };
2257 
2258  // Handle Stores:
2259  if (SI) {
2261 
2262  for (unsigned Part = 0; Part < UF; ++Part) {
2263  Instruction *NewSI = nullptr;
2264  Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2265  if (CreateGatherScatter) {
2266  Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2267  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2268  NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2269  MaskPart);
2270  } else {
2271  if (Reverse) {
2272  // If we store to reverse consecutive memory locations, then we need
2273  // to reverse the order of elements in the stored value.
2274  StoredVal = reverseVector(StoredVal);
2275  // We don't want to update the value in the map as it might be used in
2276  // another expression. So don't call resetVectorValue(StoredVal).
2277  }
2278  auto *VecPtr = CreateVecPtr(Part, Ptr);
2279  if (isMaskRequired)
2280  NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2281  Mask[Part]);
2282  else
2283  NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2284  }
2285  addMetadata(NewSI, SI);
2286  }
2287  return;
2288  }
2289 
2290  // Handle loads.
2291  assert(LI && "Must have a load instruction");
2293  for (unsigned Part = 0; Part < UF; ++Part) {
2294  Value *NewLI;
2295  if (CreateGatherScatter) {
2296  Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2297  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2298  NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2299  nullptr, "wide.masked.gather");
2300  addMetadata(NewLI, LI);
2301  } else {
2302  auto *VecPtr = CreateVecPtr(Part, Ptr);
2303  if (isMaskRequired)
2304  NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2305  UndefValue::get(DataTy),
2306  "wide.masked.load");
2307  else
2308  NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
2309 
2310  // Add metadata to the load, but setVectorValue to the reverse shuffle.
2311  addMetadata(NewLI, LI);
2312  if (Reverse)
2313  NewLI = reverseVector(NewLI);
2314  }
2315  VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2316  }
2317 }
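// A minimal sketch of the pointer arithmetic in the CreateVecPtr lambda above,
// expressed as an element offset from the lane-zero scalar pointer (the helper
// name is hypothetical): forward accesses start unroll part `Part` at
// +Part * VF, while reversed accesses start at -Part * VF + (1 - VF) (the two
// chained GEPs), with the loaded lanes and any mask then reversed.
static long long vecPtrElementOffset(bool Reverse, unsigned VF, unsigned Part) {
  if (Reverse)
    return -(long long)(Part * VF) + (1 - (long long)VF);
  return (long long)(Part * VF);
}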
2318 
2320  const VPIteration &Instance,
2321  bool IfPredicateInstr) {
2322  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2323 
2324  setDebugLocFromInst(Builder, Instr);
2325 
2326  // Does this instruction return a value?
2327  bool IsVoidRetTy = Instr->getType()->isVoidTy();
2328 
2329  Instruction *Cloned = Instr->clone();
2330  if (!IsVoidRetTy)
2331  Cloned->setName(Instr->getName() + ".cloned");
2332 
2333  // Replace the operands of the cloned instructions with their scalar
2334  // equivalents in the new loop.
2335  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2336  auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2337  Cloned->setOperand(op, NewOp);
2338  }
2339  addNewMetadata(Cloned, Instr);
2340 
2341  // Place the cloned scalar in the new loop.
2342  Builder.Insert(Cloned);
2343 
2344  // Add the cloned scalar to the scalar map entry.
2345  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2346 
2347  // If we just cloned a new assumption, add it the assumption cache.
2348  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2349  if (II->getIntrinsicID() == Intrinsic::assume)
2350  AC->registerAssumption(II);
2351 
2352  // End if-block.
2353  if (IfPredicateInstr)
2354  PredicatedInstructions.push_back(Cloned);
2355 }
2356 
2358  Value *End, Value *Step,
2359  Instruction *DL) {
2360  BasicBlock *Header = L->getHeader();
2361  BasicBlock *Latch = L->getLoopLatch();
2362  // As we're just creating this loop, it's possible no latch exists
2363  // yet. If so, use the header as this will be a single block loop.
2364  if (!Latch)
2365  Latch = Header;
2366 
2369  setDebugLocFromInst(Builder, OldInst);
2370  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2371 
2373  setDebugLocFromInst(Builder, OldInst);
2374 
2375  // Create i+1 and fill the PHINode.
2376  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2377  Induction->addIncoming(Start, L->getLoopPreheader());
2378  Induction->addIncoming(Next, Latch);
2379  // Create the compare.
2380  Value *ICmp = Builder.CreateICmpEQ(Next, End);
2381  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2382 
2383  // Now we have two terminators. Remove the old one from the block.
2384  Latch->getTerminator()->eraseFromParent();
2385 
2386  return Induction;
2387 }
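// A minimal scalar sketch of the control flow produced above (hypothetical
// helper, plain C++): the induction starts at Start, advances by Step
// ("index.next"), and the latch exits once the incremented value equals End.
// The callers guarantee End is Start plus a non-zero multiple of Step, so the
// compare eventually fires.
static void countedLoopSkeleton(unsigned long long Start,
                                unsigned long long End,
                                unsigned long long Step) {
  unsigned long long Index = Start;         // the "index" phi
  for (;;) {
    // ... the vectorized loop body runs here ...
    unsigned long long Next = Index + Step; // "index.next"
    if (Next == End)                        // icmp eq + conditional branch
      break;
    Index = Next;
  }
}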
2388 
2390  if (TripCount)
2391  return TripCount;
2392 
2393  assert(L && "Create Trip Count for null loop.");
2395  // Find the loop boundaries.
2396  ScalarEvolution *SE = PSE.getSE();
2397  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2398  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2399  "Invalid loop count");
2400 
2401  Type *IdxTy = Legal->getWidestInductionType();
2402  assert(IdxTy && "No type for induction");
2403 
2404  // The exit count might have type i64 while the phi is i32. This can
2405  // happen if we have an induction variable that is sign extended before the
2406  // compare. The only way we get a backedge-taken count in that situation is
2407  // if the induction variable was signed and hence will not overflow; in such
2408  // a case the truncation is legal.
2409  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2410  IdxTy->getPrimitiveSizeInBits())
2411  BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2412  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2413 
2414  // Get the total trip count from the count by adding 1.
2415  const SCEV *ExitCount = SE->getAddExpr(
2416  BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2417 
2418  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2419 
2420  // Expand the trip count and place the new instructions in the preheader.
2421  // Notice that the pre-header does not change, only the loop body.
2422  SCEVExpander Exp(*SE, DL, "induction");
2423 
2424  // Count holds the overall loop count (N).
2425  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2427 
2428  if (TripCount->getType()->isPointerTy())
2429  TripCount =
2430  CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2432 
2433  return TripCount;
2434 }
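// A minimal sketch of the count expansion above (hypothetical helper): the
// trip count is the backedge-taken count plus one, evaluated in the widest
// induction type. For example, a loop running i = 0, 1, ..., 99 has a
// backedge-taken count of 99 and a trip count of 100; the overflow case
// (BTC == ~0) is caught by the minimum-iteration check emitted elsewhere.
static unsigned long long tripCountFromBTC(unsigned long long BTC) {
  return BTC + 1;
}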
2435 
2437  if (VectorTripCount)
2438  return VectorTripCount;
2439 
2440  Value *TC = getOrCreateTripCount(L);
2442 
2443  Type *Ty = TC->getType();
2444  Constant *Step = ConstantInt::get(Ty, VF * UF);
2445 
2446  // If the tail is to be folded by masking, round the number of iterations N
2447  // up to a multiple of Step instead of rounding down. This is done by first
2448  // adding Step-1 and then rounding down. Note that it's ok if this addition
2449  // overflows: the vector induction variable will eventually wrap to zero given
2450  // that it starts at zero and its Step is a power of two; the loop will then
2451  // exit, with the last early-exit vector comparison also producing all-true.
2452  if (Cost->foldTailByMasking()) {
2453  assert(isPowerOf2_32(VF * UF) &&
2454  "VF*UF must be a power of 2 when folding tail by masking");
2455  TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2456  }
2457 
2458  // Now we need to generate the expression for the part of the loop that the
2459  // vectorized body will execute. This is equal to N - (N % Step) if scalar
2460  // iterations are not required for correctness, or N - Step, otherwise. Step
2461  // is equal to the vectorization factor (number of SIMD elements) times the
2462  // unroll factor (number of SIMD instructions).
2463  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2464 
2465  // If there is a non-reversed interleaved group that may speculatively access
2466  // memory out-of-bounds, we need to ensure that there will be at least one
2467  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2468  // the trip count, we set the remainder to be equal to the step. If the step
2469  // does not evenly divide the trip count, no adjustment is necessary since
2470  // there will already be scalar iterations. Note that the minimum iterations
2471  // check ensures that N >= Step.
2472  if (VF > 1 && Cost->requiresScalarEpilogue()) {
2473  auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2474  R = Builder.CreateSelect(IsZero, Step, R);
2475  }
2476 
2477  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2478 
2479  return VectorTripCount;
2480 }
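// A minimal sketch of the arithmetic above, assuming plain C++ (names are
// hypothetical). Step is VF * UF. Without tail folding the vector loop runs
// N - (N % Step) iterations; when a scalar epilogue is required and the
// remainder would be zero, one full Step is left for it. With tail folding,
// N is first rounded up to a multiple of Step (Step is a power of two, so the
// possible wrap-around is benign, as explained above).
static unsigned long long vectorTripCount(unsigned long long N,
                                          unsigned long long Step,
                                          bool FoldTail,
                                          bool RequiresScalarEpilogue) {
  if (FoldTail)
    N += Step - 1;                 // "n.rnd.up"
  unsigned long long R = N % Step; // "n.mod.vf"
  if (RequiresScalarEpilogue && R == 0)
    R = Step;                      // keep at least one scalar iteration
  return N - R;                    // "n.vec"
}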
2481 
2483  const DataLayout &DL) {
2484  // Verify that V is a vector type with same number of elements as DstVTy.
2485  unsigned VF = DstVTy->getNumElements();
2486  VectorType *SrcVecTy = cast<VectorType>(V->getType());
2487  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2488  Type *SrcElemTy = SrcVecTy->getElementType();
2489  Type *DstElemTy = DstVTy->getElementType();
2490  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2491  "Vector elements must have same size");
2492 
2493  // Do a direct cast if element types are castable.
2494  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2495  return Builder.CreateBitOrPointerCast(V, DstVTy);
2496  }
2497  // V cannot be directly casted to desired vector type.
2498  // May happen when V is a floating point vector but DstVTy is a vector of
2499  // pointers or vice-versa. Handle this using a two-step bitcast using an
2500  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2501  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2502  "Only one type should be a pointer type");
2503  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2504  "Only one type should be a floating point type");
2505  Type *IntTy =
2507  VectorType *VecIntTy = VectorType::get(IntTy, VF);
2508  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2509  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2510 }
2511 
2513  BasicBlock *Bypass) {
2514  Value *Count = getOrCreateTripCount(L);
2515  BasicBlock *BB = L->getLoopPreheader();
2517 
2518  // Generate code to check if the loop's trip count is less than VF * UF, or
2519  // equal to it in case a scalar epilogue is required; this implies that the
2520  // vector trip count is zero. This check also covers the case where adding one
2521  // to the backedge-taken count overflowed leading to an incorrect trip count
2522  // of zero. In this case we will also jump to the scalar loop.
2525 
2526  // If tail is to be folded, vector loop takes care of all iterations.
2527  Value *CheckMinIters = Builder.getFalse();
2528  if (!Cost->foldTailByMasking())
2529  CheckMinIters = Builder.CreateICmp(
2530  P, Count, ConstantInt::get(Count->getType(), VF * UF),
2531  "min.iters.check");
2532 
2533  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2534  // Update dominator tree immediately if the generated block is a
2535  // LoopBypassBlock because SCEV expansions to generate loop bypass
2536  // checks may query it before the current function is finished.
2537  DT->addNewBlock(NewBB, BB);
2538  if (L->getParentLoop())
2539  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2541  BranchInst::Create(Bypass, NewBB, CheckMinIters));
2542  LoopBypassBlocks.push_back(BB);
2543 }
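// A minimal sketch of the bypass condition generated above (hypothetical
// helper): with tail folding the vector loop handles every iteration;
// otherwise we branch to the scalar loop when the trip count is below VF * UF,
// or not above it when a scalar epilogue must run. This also covers a trip
// count that wrapped around to zero.
static bool takeScalarLoopBypass(unsigned long long Count,
                                 unsigned long long VFTimesUF, bool FoldTail,
                                 bool RequiresScalarEpilogue) {
  if (FoldTail)
    return false;
  return RequiresScalarEpilogue ? Count <= VFTimesUF : Count < VFTimesUF;
}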
2544 
2546  BasicBlock *BB = L->getLoopPreheader();
2547 
2548  // Generate the code to check the SCEV assumptions that we made.
2549  // We want the new basic block to start at the first instruction in a
2550  // sequence of instructions that form a check.
2551  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2552  "scev.check");
2553  Value *SCEVCheck =
2554  Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2555 
2556  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2557  if (C->isZero())
2558  return;
2559 
2561  "Cannot SCEV check stride or overflow when folding tail");
2562  // Create a new block containing the stride check.
2563  BB->setName("vector.scevcheck");
2564  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2565  // Update dominator tree immediately if the generated block is a
2566  // LoopBypassBlock because SCEV expansions to generate loop bypass
2567  // checks may query it before the current function is finished.
2568  DT->addNewBlock(NewBB, BB);
2569  if (L->getParentLoop())
2570  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2572  BranchInst::Create(Bypass, NewBB, SCEVCheck));
2573  LoopBypassBlocks.push_back(BB);
2574  AddedSafetyChecks = true;
2575 }
2576 
2578  // VPlan-native path does not do any analysis for runtime checks currently.
2580  return;
2581 
2582  BasicBlock *BB = L->getLoopPreheader();
2583 
2584  // Generate the code that checks at runtime whether the arrays overlap. We put
2585  // checks into a separate block to make the more common case of few elements
2586  // faster.
2587  Instruction *FirstCheckInst;
2588  Instruction *MemRuntimeCheck;
2589  std::tie(FirstCheckInst, MemRuntimeCheck) =
2591  if (!MemRuntimeCheck)
2592  return;
2593 
2594  assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail");
2595  // Create a new block containing the memory check.
2596  BB->setName("vector.memcheck");
2597  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2598  // Update dominator tree immediately if the generated block is a
2599  // LoopBypassBlock because SCEV expansions to generate loop bypass
2600  // checks may query it before the current function is finished.
2601  DT->addNewBlock(NewBB, BB);
2602  if (L->getParentLoop())
2603  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2605  BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2606  LoopBypassBlocks.push_back(BB);
2607  AddedSafetyChecks = true;
2608 
2609  // We currently don't use LoopVersioning for the actual loop cloning but we
2610  // still use it to add the noalias metadata.
2611  LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2612  PSE.getSE());
2613  LVer->prepareNoAliasMetadata();
2614 }
2615 
2617  IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2618  const InductionDescriptor &ID) const {
2619 
2620  SCEVExpander Exp(*SE, DL, "induction");
2621  auto Step = ID.getStep();
2622  auto StartValue = ID.getStartValue();
2623  assert(Index->getType() == Step->getType() &&
2624  "Index type does not match StepValue type");
2625 
2626  // Note: the IR at this point is broken. We cannot use SE to create any new
2627  // SCEV and then expand it, hoping that SCEV's simplification will give us
2628  // more optimal code. Unfortunately, attempting to do so on invalid IR may
2629  // lead to various SCEV crashes. So all we can do is use the builder and rely
2630  // on InstCombine for future simplifications. Here we handle only some
2631  // trivial cases.
2632  auto CreateAdd = [&B](Value *X, Value *Y) {
2633  assert(X->getType() == Y->getType() && "Types don't match!");
2634  if (auto *CX = dyn_cast<ConstantInt>(X))
2635  if (CX->isZero())
2636  return Y;
2637  if (auto *CY = dyn_cast<ConstantInt>(Y))
2638  if (CY->isZero())
2639  return X;
2640  return B.CreateAdd(X, Y);
2641  };
2642 
2643  auto CreateMul = [&B](Value *X, Value *Y) {
2644  assert(X->getType() == Y->getType() && "Types don't match!");
2645  if (auto *CX = dyn_cast<ConstantInt>(X))
2646  if (CX->isOne())
2647  return Y;
2648  if (auto *CY = dyn_cast<ConstantInt>(Y))
2649  if (CY->isOne())
2650  return X;
2651  return B.CreateMul(X, Y);
2652  };
2653 
2654  switch (ID.getKind()) {
2655  case InductionDescriptor::IK_IntInduction: {
2656  assert(Index->getType() == StartValue->getType() &&
2657  "Index type does not match StartValue type");
2658  if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2659  return B.CreateSub(StartValue, Index);
2660  auto *Offset = CreateMul(
2661  Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2662  return CreateAdd(StartValue, Offset);
2663  }
2664  case InductionDescriptor::IK_PtrInduction: {
2665  assert(isa<SCEVConstant>(Step) &&
2666  "Expected constant step for pointer induction");
2667  return B.CreateGEP(
2668  nullptr, StartValue,
2669  CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2670  &*B.GetInsertPoint())));
2671  }
2672  case InductionDescriptor::IK_FpInduction: {
2673  assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2674  auto InductionBinOp = ID.getInductionBinOp();
2675  assert(InductionBinOp &&
2676  (InductionBinOp->getOpcode() == Instruction::FAdd ||
2677  InductionBinOp->getOpcode() == Instruction::FSub) &&
2678  "Original bin op should be defined for FP induction");
2679 
2680  Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2681 
2682  // Floating point operations had to be 'fast' to enable the induction.
2683  FastMathFlags Flags;
2684  Flags.setFast();
2685 
2686  Value *MulExp = B.CreateFMul(StepValue, Index);
2687  if (isa<Instruction>(MulExp))
2688  // We have to check because MulExp may be a constant.
2689  cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2690 
2691  Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2692  "induction");
2693  if (isa<Instruction>(BOp))
2694  cast<Instruction>(BOp)->setFastMathFlags(Flags);
2695 
2696  return BOp;
2697  }
2698  case InductionDescriptor::IK_NoInduction:
2699  return nullptr;
2700  }
2701  llvm_unreachable("invalid enum");
2702 }
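// A minimal summary of the per-kind transform above: an integer induction
// yields StartValue + Index * Step, a pointer induction yields
// &StartValue[Index * Step] (with a constant step), and an FP induction
// applies the original FAdd/FSub to StartValue and Index * Step. The FP case
// in scalar form (hypothetical helper):
static double transformedFpIndex(double StartValue, double Index, double Step,
                                 bool IsFSub) {
  double Offset = Index * Step;
  return IsFSub ? StartValue - Offset : StartValue + Offset;
}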
2703 
2705  /*
2706  In this function we generate a new loop. The new loop will contain
2707  the vectorized instructions while the old loop will continue to run the
2708  scalar remainder.
2709 
2710  [ ] <-- loop iteration number check.
2711  / |
2712  / v
2713  | [ ] <-- vector loop bypass (may consist of multiple blocks).
2714  | / |
2715  | / v
2716  || [ ] <-- vector pre header.
2717  |/ |
2718  | v
2719  | [ ] \
2720  | [ ]_| <-- vector loop.
2721  | |
2722  | v
2723  | -[ ] <--- middle-block.
2724  | / |
2725  | / v
2726  -|- >[ ] <--- new preheader.
2727  | |
2728  | v
2729  | [ ] \
2730  | [ ]_| <-- old scalar loop to handle remainder.
2731  \ |
2732  \ v
2733  >[ ] <-- exit block.
2734  ...
2735  */
2736 
2737  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2738  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2739  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2740  MDNode *OrigLoopID = OrigLoop->getLoopID();
2741  assert(VectorPH && "Invalid loop structure");
2742  assert(ExitBlock && "Must have an exit block");
2743 
2744  // Some loops have a single integer induction variable, while other loops
2745  // don't. One example is C++ iterators, which often introduce multiple
2746  // pointer induction variables. In the code below we also support the case
2747  // where we don't have a single induction variable.
2748  //
2749  // We try as hard as possible to obtain an induction variable from the
2750  // original loop. However, if we don't find one that:
2751  // - is an integer
2752  // - counts from zero, stepping by one
2753  // - is the size of the widest induction variable type
2754  // then we create a new one.
2756  Type *IdxTy = Legal->getWidestInductionType();
2757 
2758  // Split the single block loop into the two loop structure described above.
2759  BasicBlock *VecBody =
2760  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2761  BasicBlock *MiddleBlock =
2762  VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2763  BasicBlock *ScalarPH =
2764  MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2765 
2766  // Create and register the new vector loop.
2767  Loop *Lp = LI->AllocateLoop();
2768  Loop *ParentLoop = OrigLoop->getParentLoop();
2769 
2770  // Insert the new loop into the loop nest and register the new basic blocks
2771  // before calling any utilities such as SCEV that require valid LoopInfo.
2772  if (ParentLoop) {
2773  ParentLoop->addChildLoop(Lp);
2774  ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2775  ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2776  } else {
2777  LI->addTopLevelLoop(Lp);
2778  }
2779  Lp->addBasicBlockToLoop(VecBody, *LI);
2780 
2781  // Find the loop boundaries.
2782  Value *Count = getOrCreateTripCount(Lp);
2783 
2784  Value *StartIdx = ConstantInt::get(IdxTy, 0);
2785 
2786  // Now, compare the new count to zero. If it is zero skip the vector loop and
2787  // jump to the scalar loop. This check also covers the case where the
2788  // backedge-taken count is uint##_max: adding one to it will overflow leading
2789  // to an incorrect trip count of zero. In this (rare) case we will also jump
2790  // to the scalar loop.
2791  emitMinimumIterationCountCheck(Lp, ScalarPH);
2792 
2793  // Generate the code to check any assumptions that we've made for SCEV
2794  // expressions.
2795  emitSCEVChecks(Lp, ScalarPH);
2796 
2797  // Generate the code that checks at runtime whether the arrays overlap. We put
2798  // checks into a separate block to make the more common case of few elements
2799  // faster.
2800  emitMemRuntimeChecks(Lp, ScalarPH);
2801 
2802  // Generate the induction variable.
2803  // The loop step is equal to the vectorization factor (num of SIMD elements)
2804  // times the unroll factor (num of SIMD instructions).
2805  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2806  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2807  Induction =
2808  createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2810 
2811  // We are going to resume the execution of the scalar loop.
2812  // Go over all of the induction variables that we found and fix the
2813  // PHIs that are left in the scalar version of the loop.
2814  // The starting values of PHI nodes depend on the counter of the last
2815  // iteration in the vectorized loop.
2816  // If we come from a bypass edge then we need to start from the original
2817  // start value.
2818 
2819  // This variable saves the new starting index for the scalar loop. It is used
2820  // to test if there are any tail iterations left once the vector loop has
2821  // completed.
2823  for (auto &InductionEntry : *List) {
2824  PHINode *OrigPhi = InductionEntry.first;
2825  InductionDescriptor II = InductionEntry.second;
2826 
2827  // Create phi nodes to merge from the backedge-taken check block.
2828  PHINode *BCResumeVal = PHINode::Create(
2829  OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
2830  // Copy original phi DL over to the new one.
2831  BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2832  Value *&EndValue = IVEndValues[OrigPhi];
2833  if (OrigPhi == OldInduction) {
2834  // We know what the end value is.
2835  EndValue = CountRoundDown;
2836  } else {
2838  Type *StepType = II.getStep()->getType();
2839  Instruction::CastOps CastOp =
2840  CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
2841  Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
2842  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2843  EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
2844  EndValue->setName("ind.end");
2845  }
2846 
2847  // The new PHI merges the original incoming value, in case of a bypass,
2848  // or the value at the end of the vectorized loop.
2849  BCResumeVal->addIncoming(EndValue, MiddleBlock);
2850 
2851  // Fix the scalar body counter (PHI node).
2852  unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
2853 
2854  // The old induction's phi node in the scalar body needs the truncated
2855  // value.
2856  for (BasicBlock *BB : LoopBypassBlocks)
2857  BCResumeVal->addIncoming(II.getStartValue(), BB);
2858  OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
2859  }
2860 
2861  // Add a check in the middle block to see if we have completed
2862  // all of the iterations in the first vector loop.
2863  // If (N - N%VF) == N, then we *don't* need to run the remainder.
2864  // If tail is to be folded, we know we don't need to run the remainder.
2865  Value *CmpN = Builder.getTrue();
2866  if (!Cost->foldTailByMasking())
2867  CmpN =
2868  CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
2869  CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
2870  ReplaceInstWithInst(MiddleBlock->getTerminator(),
2871  BranchInst::Create(ExitBlock, ScalarPH, CmpN));
2872 
2873  // Get ready to start creating new instructions into the vectorized body.
2875 
2876  // Save the state.
2878  LoopScalarPreHeader = ScalarPH;
2879  LoopMiddleBlock = MiddleBlock;
2880  LoopExitBlock = ExitBlock;
2881  LoopVectorBody = VecBody;
2882  LoopScalarBody = OldBasicBlock;
2883 
2884  Optional<MDNode *> VectorizedLoopID =
2886  LLVMLoopVectorizeFollowupVectorized});
2887  if (VectorizedLoopID.hasValue()) {
2888  Lp->setLoopID(VectorizedLoopID.getValue());
2889 
2890  // Do not setAlreadyVectorized if loop attributes have been defined
2891  // explicitly.
2892  return LoopVectorPreHeader;
2893  }
2894 
2895  // Keep all loop hints from the original loop on the vector loop (we'll
2896  // replace the vectorizer-specific hints below).
2897  if (MDNode *LID = OrigLoop->getLoopID())
2898  Lp->setLoopID(LID);
2899 
2900  LoopVectorizeHints Hints(Lp, true, *ORE);
2901  Hints.setAlreadyVectorized();
2902 
2903  return LoopVectorPreHeader;
2904 }
2905 
2906 // Fix up external users of the induction variable. At this point, we are
2907 // in LCSSA form, with all external PHIs that use the IV having one input value,
2908 // coming from the remainder loop. We need those PHIs to also have a correct
2909 // value for the IV when arriving directly from the middle block.
2911  const InductionDescriptor &II,
2912  Value *CountRoundDown, Value *EndValue,
2913  BasicBlock *MiddleBlock) {
2914  // There are two kinds of external IV usages - those that use the value
2915  // computed in the last iteration (the PHI) and those that use the penultimate
2916  // value (the value that feeds into the phi from the loop latch).
2917  // We allow both, but they, obviously, have different values.
2918 
2919  assert(OrigLoop->getExitBlock() && "Expected a single exit block");
2920 
2921  DenseMap<Value *, Value *> MissingVals;
2922 
2923  // An external user of the last iteration's value should see the value that
2924  // the remainder loop uses to initialize its own IV.
2926  for (User *U : PostInc->users()) {
2927  Instruction *UI = cast<Instruction>(U);
2928  if (!OrigLoop->contains(UI)) {
2929  assert(isa<PHINode>(UI) && "Expected LCSSA form");
2930  MissingVals[UI] = EndValue;
2931  }
2932  }
2933 
2934  // An external user of the penultimate value needs to see EndValue - Step.
2935  // The simplest way to get this is to recompute it from the constituent SCEVs,
2936  // that is Start + (Step * (CRD - 1)).
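 // Worked example (illustrative values only): for an IV with Start = 0 and
 // Step = 2, and CountRoundDown (CRD) = 8 vectorized iterations, the remainder
 // loop is seeded with EndValue = 0 + 2 * 8 = 16, while an external user of
 // the penultimate value sees Start + Step * (CRD - 1) = 0 + 2 * 7 = 14.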
2937  for (User *U : OrigPhi->users()) {
2938  auto *UI = cast<Instruction>(U);
2939  if (!OrigLoop->contains(UI)) {
2940  const DataLayout &DL =
2942  assert(isa<PHINode>(UI) && "Expected LCSSA form");
2943 
2944  IRBuilder<> B(MiddleBlock->getTerminator());
2945  Value *CountMinusOne = B.CreateSub(
2946  CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
2947  Value *CMO =
2948  !II.getStep()->getType()->isIntegerTy()
2949  ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
2950  II.getStep()->getType())
2951  : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
2952  CMO->setName("cast.cmo");
2953  Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
2954  Escape->setName("ind.escape");
2955  MissingVals[UI] = Escape;
2956  }
2957  }
2958 
2959  for (auto &I : MissingVals) {
2960  PHINode *PHI = cast<PHINode>(I.first);
2961  // One corner case we have to handle is two IVs "chasing" each other,
2962  // that is %IV2 = phi [...], [ %IV1, %latch ]
2963  // In this case, if IV1 has an external use, we need to avoid adding both
2964  // "last value of IV1" and "penultimate value of IV2". So, verify that we
2965  // don't already have an incoming value for the middle block.
2966  if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
2967  PHI->addIncoming(I.second, MiddleBlock);
2968  }
2969 }
2970 
2971 namespace {
2972 
2973 struct CSEDenseMapInfo {
2974  static bool canHandle(const Instruction *I) {
2975  return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2976  isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2977  }
2978 
2979  static inline Instruction *getEmptyKey() {
2980  return DenseMapInfo<Instruction *>::getEmptyKey();
2981  }
2982 
2983  static inline Instruction *getTombstoneKey() {
2984  return DenseMapInfo<Instruction *>::getTombstoneKey();
2985  }
2986 
2987  static unsigned getHashValue(const Instruction *I) {
2988  assert(canHandle(I) && "Unknown instruction!");
2989  return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2990  I->value_op_end()));
2991  }
2992 
2993  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2994  if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2995  LHS == getTombstoneKey() || RHS == getTombstoneKey())
2996  return LHS == RHS;
2997  return LHS->isIdenticalTo(RHS);
2998  }
2999 };
3000 
3001 } // end anonymous namespace
3002 
3003 /// Perform CSE of induction variable instructions.
3004 static void cse(BasicBlock *BB) {
3005  // Perform simple CSE.
3006  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3007  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3008  Instruction *In = &*I++;
3009 
3010  if (!CSEDenseMapInfo::canHandle(In))
3011  continue;
3012 
3013  // Check if we can replace this instruction with any of the
3014  // visited instructions.
3015  if (Instruction *V = CSEMap.lookup(In)) {
3016  In->replaceAllUsesWith(V);
3017  In->eraseFromParent();
3018  continue;
3019  }
3020 
3021  CSEMap[In] = In;
3022  }
3023 }
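// As a sketch of what the CSE above achieves (illustrative IR, not emitted by
// this code): two identical address computations such as
//   %g1 = getelementptr inbounds i32, i32* %p, i64 %idx
//   %g2 = getelementptr inbounds i32, i32* %p, i64 %idx
// hash to the same key via CSEDenseMapInfo, so %g2 is replaced by %g1 and
// erased, leaving a single GEP for the widened induction address.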
3024 
3025 /// Estimate the overhead of scalarizing an instruction. This is a
3026 /// convenience wrapper for the type-based getScalarizationOverhead API.
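// Roughly speaking (an illustrative accounting, not the exact TTI formula):
// for a non-void instruction scalarized at VF = 4, the overhead charged here
// covers inserting the 4 scalar results back into a vector, plus extracting
// the 4 lanes of each vector operand that feeds the scalar copies.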
3027 static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
3028  const TargetTransformInfo &TTI) {
3029  if (VF == 1)
3030  return 0;
3031 
3032  unsigned Cost = 0;
3033  Type *RetTy = ToVectorTy(I->getType(), VF);
3034  if (!RetTy->isVoidTy() &&
3035  (!isa<LoadInst>(I) ||
3036  !TTI.supportsEfficientVectorElementLoadStore()))
3037  Cost += TTI.getScalarizationOverhead(RetTy, true, false);
3038 
3039  // Some targets keep addresses scalar.
3040  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
3041  return Cost;
3042 
3043  if (CallInst *CI = dyn_cast<CallInst>(I)) {
3044  SmallVector<const Value *, 4> Operands(CI->arg_operands());
3045  Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
3046  }
3047  else if (!isa<StoreInst>(I) ||
3048  !TTI.supportsEfficientVectorElementLoadStore()) {
3049  SmallVector<const Value *, 4> Operands(I->operand_values());
3050  Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
3051  }
3052 
3053  return Cost;
3054 }
3055 
3056 // Estimate the cost of a call instruction CI if it were vectorized with factor VF.
3057 // Return the cost of the instruction, including scalarization overhead if it's
3058 // needed. The flag NeedToScalarize shows whether the call needs to be scalarized -
3059 // i.e. either a vector version isn't available or it is too expensive.
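// Example with made-up costs: at VF = 4, if a scalar call costs 10, the
// scalarized estimate is 4 * 10 plus the extract/insert overhead (say 6),
// i.e. 46. If TLI provides a vector variant whose call cost is 20, the cheaper
// vector cost is returned and NeedToScalarize is cleared.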
3060 static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
3061  const TargetTransformInfo &TTI,
3062  const TargetLibraryInfo *TLI,
3063  bool &NeedToScalarize) {
3064  Function *F = CI->getCalledFunction();
3065  StringRef FnName = CI->getCalledFunction()->getName();
3066  Type *ScalarRetTy = CI->getType();
3067  SmallVector<Type *, 4> Tys, ScalarTys;
3068  for (auto &ArgOp : CI->arg_operands())
3069  ScalarTys.push_back(ArgOp->getType());
3070 
3071  // Estimate cost of scalarized vector call. The source operands are assumed
3072  // to be vectors, so we need to extract individual elements from there,
3073  // execute VF scalar calls, and then gather the result into the vector return
3074  // value.
3075  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3076  if (VF == 1)
3077  return ScalarCallCost;
3078 
3079  // Compute corresponding vector type for return value and arguments.
3080  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3081  for (Type *ScalarTy : ScalarTys)
3082  Tys.push_back(ToVectorTy(ScalarTy, VF));
3083 
3084  // Compute costs of unpacking argument values for the scalar calls and
3085  // packing the return values to a vector.
3086  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
3087 
3088  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3089 
3090  // If we can't emit a vector call for this function, then the currently found
3091  // cost is the cost we need to return.
3092  NeedToScalarize = true;
3093  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3094  return Cost;
3095 
3096  // If the corresponding vector cost is cheaper, return its cost.
3097  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3098  if (VectorCallCost < Cost) {
3099  NeedToScalarize = false;
3100  return VectorCallCost;
3101  }
3102  return Cost;
3103 }
3104 
3105 // Estimate cost of an intrinsic call instruction CI if it were vectorized with
3106 // factor VF. Return the cost of the instruction, including scalarization
3107 // overhead if it's needed.
3108 static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
3109  const TargetTransformInfo &TTI,
3110  const TargetLibraryInfo *TLI) {
3111  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3112  assert(ID && "Expected intrinsic call!");
3113 
3114  FastMathFlags FMF;
3115  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3116  FMF = FPMO->getFastMathFlags();
3117 
3118  SmallVector<Value *, 4> Operands(CI->arg_operands());
3119  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3120 }
3121 
3122 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3123  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3124  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3125  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3126 }
3127 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3128  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3129  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3130  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3131 }
3132 
3133 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3134  // For every instruction `I` in MinBWs, truncate the operands, create a
3135  // truncated version of `I` and reextend its result. InstCombine runs
3136  // later and will remove any ext/trunc pairs.
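 // For example (shorthand IR, assuming the cost model proved 8 bits suffice
 // for a 32-bit add at VF = 4):
 //   %a.tr = trunc <4 x i32> %a to <4 x i8>
 //   %b.tr = trunc <4 x i32> %b to <4 x i8>
 //   %s.tr = add <4 x i8> %a.tr, %b.tr
 //   %s    = zext <4 x i8> %s.tr to <4 x i32>
 // InstCombine later folds away the redundant ext/trunc pairs.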
3137  SmallPtrSet<Value *, 4> Erased;
3138  for (const auto &KV : Cost->getMinimalBitwidths()) {
3139  // If the value wasn't vectorized, we must maintain the original scalar
3140  // type. The absence of the value from VectorLoopValueMap indicates that it
3141  // wasn't vectorized.
3142  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3143  continue;
3144  for (unsigned Part = 0; Part < UF; ++Part) {
3145  Value *I = getOrCreateVectorValue(KV.first, Part);
3146  if (Erased.find(I) != Erased.end() || I->use_empty() ||
3147  !isa<Instruction>(I))
3148  continue;
3149  Type *OriginalTy = I->getType();
3150  Type *ScalarTruncatedTy =
3151  IntegerType::get(OriginalTy->getContext(), KV.second);
3152  Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3153  OriginalTy->getVectorNumElements());
3154  if (TruncatedTy == OriginalTy)
3155  continue;
3156 
3157  IRBuilder<> B(cast<Instruction>(I));
3158  auto ShrinkOperand = [&](Value *V) -> Value * {
3159  if (auto *ZI = dyn_cast<ZExtInst>(V))
3160  if (ZI->getSrcTy() == TruncatedTy)
3161  return ZI->getOperand(0);
3162  return B.CreateZExtOrTrunc(V, TruncatedTy);
3163  };
3164 
3165  // The actual instruction modification depends on the instruction type,
3166  // unfortunately.
3167  Value *NewI = nullptr;
3168  if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3169  NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3170  ShrinkOperand(BO->getOperand(1)));
3171 
3172  // Any wrapping introduced by shrinking this operation shouldn't be
3173  // considered undefined behavior. So, we can't unconditionally copy
3174  // arithmetic wrapping flags to NewI.
3175  cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3176  } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3177  NewI =
3178  B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3179  ShrinkOperand(CI->getOperand(1)));
3180  } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3181  NewI = B.CreateSelect(SI->getCondition(),
3182  ShrinkOperand(SI->getTrueValue()),
3183  ShrinkOperand(SI->getFalseValue()));
3184  } else if (auto *CI = dyn_cast<CastInst>(I)) {
3185  switch (CI->getOpcode()) {
3186  default:
3187  llvm_unreachable("Unhandled cast!");
3188  case Instruction::Trunc:
3189  NewI = ShrinkOperand(CI->getOperand(0));
3190  break;
3191  case Instruction::SExt:
3192  NewI = B.CreateSExtOrTrunc(
3193  CI->getOperand(0),
3194  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3195  break;
3196  case Instruction::ZExt:
3197  NewI = B.CreateZExtOrTrunc(
3198  CI->getOperand(0),
3199  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3200  break;
3201  }
3202  } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3203  auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3204  auto *O0 = B.CreateZExtOrTrunc(
3205  SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3206  auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3207  auto *O1 = B.CreateZExtOrTrunc(
3208  SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3209 
3210  NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3211  } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3212  // Don't do anything with the operands, just extend the result.
3213  continue;
3214  } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3215  auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3216  auto *O0 = B.CreateZExtOrTrunc(
3217  IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3218  auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3219  NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3220  } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3221  auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3222  auto *O0 = B.CreateZExtOrTrunc(
3223  EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3224  NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3225  } else {
3226  // If we don't know what to do, be conservative and don't do anything.
3227  continue;
3228  }
3229 
3230  // Lastly, extend the result.
3231  NewI->takeName(cast<Instruction>(I));
3232  Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3233  I->replaceAllUsesWith(Res);
3234  cast<Instruction>(I)->eraseFromParent();
3235  Erased.insert(I);
3236  VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3237  }
3238  }
3239 
3240  // We'll have created a number of ZExts that are now dead (unused). Clean them up.
3241  for (const auto &KV : Cost->getMinimalBitwidths()) {
3242  // If the value wasn't vectorized, we must maintain the original scalar
3243  // type. The absence of the value from VectorLoopValueMap indicates that it
3244  // wasn't vectorized.
3245  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3246  continue;
3247  for (unsigned Part = 0; Part < UF; ++Part) {
3248  Value *I = getOrCreateVectorValue(KV.first, Part);
3249  ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3250  if (Inst && Inst->use_empty()) {
3251  Value *NewI = Inst->getOperand(0);
3252  Inst->eraseFromParent();
3253  VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3254  }
3255  }
3256  }
3257 }
3258 
3259 void InnerLoopVectorizer::fixVectorizedLoop() {
3260  // Insert truncates and extends for any truncated instructions as hints to
3261  // InstCombine.
3262  if (VF > 1)
3263  truncateToMinimalBitwidths();
3264 
3265  // Fix widened non-induction PHIs by setting up the PHI operands.
3266  if (OrigPHIsToFix.size()) {
3267  assert(EnableVPlanNativePath &&
3268  "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3269  fixNonInductionPHIs();
3270  }
3271 
3272  // At this point every instruction in the original loop is widened to a
3273  // vector form. Now we need to fix the recurrences in the loop. These PHI
3274  // nodes are currently empty because we did not want to introduce cycles.
3275  // This is the second stage of vectorizing recurrences.
3276  fixCrossIterationPHIs();
3277 
3278  // Update the dominator tree.
3279  //
3280  // FIXME: After creating the structure of the new loop, the dominator tree is
3281  // no longer up-to-date, and it remains that way until we update it
3282  // here. An out-of-date dominator tree is problematic for SCEV,
3283  // because SCEVExpander uses it to guide code generation. The
3284  // vectorizer use SCEVExpanders in several places. Instead, we should
3285  // keep the dominator tree up-to-date as we go.
3286  updateAnalysis();
3287 
3288  // Fix-up external users of the induction variables.
3289  for (auto &Entry : *Legal->getInductionVars())
3290  fixupIVUsers(Entry.first, Entry.second,
3292  IVEndValues[Entry.first], LoopMiddleBlock);
3293 
3294  fixLCSSAPHIs();
3295  for (Instruction *PI : PredicatedInstructions)
3296  sinkScalarOperands(&*PI);
3297 
3298  // Remove redundant induction instructions.
3299  cse(LoopVectorBody);
3300 }
3301 
3302 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3303  // In order to support recurrences we need to be able to vectorize Phi nodes.
3304  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3305  // stage #2: We now need to fix the recurrences by adding incoming edges to
3306  // the currently empty PHI nodes. At this point every instruction in the
3307  // original loop is widened to a vector form so we can use them to construct
3308  // the incoming edges.
3309  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3310  // Handle first-order recurrences and reductions that need to be fixed.
3311  if (Legal->isFirstOrderRecurrence(&Phi))
3312  fixFirstOrderRecurrence(&Phi);
3313  else if (Legal->isReductionVariable(&Phi))
3314  fixReduction(&Phi);
3315  }
3316 }
3317 
3318 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3319  // This is the second phase of vectorizing first-order recurrences. An
3320  // overview of the transformation is described below. Suppose we have the
3321  // following loop.
3322  //
3323  // for (int i = 0; i < n; ++i)
3324  // b[i] = a[i] - a[i - 1];
3325  //
3326  // There is a first-order recurrence on "a". For this loop, the shorthand
3327  // scalar IR looks like:
3328  //
3329  // scalar.ph:
3330  // s_init = a[-1]
3331  // br scalar.body
3332  //
3333  // scalar.body:
3334  // i = phi [0, scalar.ph], [i+1, scalar.body]
3335  // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3336  // s2 = a[i]
3337  // b[i] = s2 - s1
3338  // br cond, scalar.body, ...
3339  //
3340  // In this example, s1 is a recurrence because its value depends on the
3341  // previous iteration. In the first phase of vectorization, we created a
3342  // temporary value for s1. We now complete the vectorization and produce the
3343  // shorthand vector IR shown below (for VF = 4, UF = 1).
3344  //
3345  // vector.ph:
3346  // v_init = vector(..., ..., ..., a[-1])
3347  // br vector.body
3348  //
3349  // vector.body
3350  // i = phi [0, vector.ph], [i+4, vector.body]
3351  // v1 = phi [v_init, vector.ph], [v2, vector.body]
3352  // v2 = a[i, i+1, i+2, i+3];
3353  // v3 = vector(v1(3), v2(0, 1, 2))
3354  // b[i, i+1, i+2, i+3] = v2 - v3
3355  // br cond, vector.body, middle.block
3356  //
3357  // middle.block:
3358  // x = v2(3)
3359  // br scalar.ph
3360  //
3361  // scalar.ph:
3362  // s_init = phi [x, middle.block], [a[-1], otherwise]
3363  // br scalar.body
3364  //
3365  // After execution completes the vector loop, we extract the next value of
3366  // the recurrence (x) to use as the initial value in the scalar loop.
3367 
3368  // Get the original loop preheader and single loop latch.
3369  auto *Preheader = OrigLoop->getLoopPreheader();
3370  auto *Latch = OrigLoop->getLoopLatch();
3371 
3372  // Get the initial and previous values of the scalar recurrence.
3373  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3374  auto *Previous = Phi->getIncomingValueForBlock(Latch);
3375 
3376  // Create a vector from the initial value.
3377  auto *VectorInit = ScalarInit;
3378  if (VF > 1) {
3380  VectorInit = Builder.CreateInsertElement(
3381  UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3382  Builder.getInt32(VF - 1), "vector.recur.init");
3383  }
3384 
3385  // We constructed a temporary phi node in the first phase of vectorization.
3386  // This phi node will eventually be deleted.
3388  cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3389 
3390  // Create a phi node for the new recurrence. The current value will either be
3391  // the initial value inserted into a vector or loop-varying vector value.
3392  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3393  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3394 
3395  // Get the vectorized previous value of the last part UF - 1. It appears last
3396  // among all unrolled iterations, due to the order of their construction.
3397  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3398 
3399  // Set the insertion point after the previous value if it is an instruction.
3400  // Note that the previous value may have been constant-folded so it is not
3401  // guaranteed to be an instruction in the vector loop. Also, if the previous
3402  // value is a phi node, we should insert after all the phi nodes to avoid
3403  // breaking basic block verification.
3404  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3405  isa<PHINode>(PreviousLastPart))
3407  else
3409  &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3410 
3411  // We will construct a vector for the recurrence by combining the values for
3412  // the current and previous iterations. This is the required shuffle mask.
3413  SmallVector<Constant *, 8> ShuffleMask(VF);
3414  ShuffleMask[0] = Builder.getInt32(VF - 1);
3415  for (unsigned I = 1; I < VF; ++I)
3416  ShuffleMask[I] = Builder.getInt32(I + VF - 1);
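 // For instance, with VF = 4 the mask is <3, 4, 5, 6>: lane 0 takes the last
 // element of the previous vector (the value carried across the iteration
 // boundary) and lanes 1-3 take the first three elements of the current
 // vector, matching the v3 = vector(v1(3), v2(0, 1, 2)) shorthand above.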
3417 
3418  // The vector from which to take the initial value for the current iteration
3419  // (actual or unrolled). Initially, this is the vector phi node.
3420  Value *Incoming = VecPhi;
3421 
3422  // Shuffle the current and previous vector and update the vector parts.
3423  for (unsigned Part = 0; Part < UF; ++Part) {
3424  Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3425  Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3426  auto *Shuffle =
3427  VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3428  ConstantVector::get(ShuffleMask))
3429  : Incoming;
3430  PhiPart->replaceAllUsesWith(Shuffle);
3431  cast<Instruction>(PhiPart)->eraseFromParent();
3432  VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3433  Incoming = PreviousPart;
3434  }
3435 
3436  // Fix the latch value of the new recurrence in the vector loop.
3437  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3438 
3439  // Extract the last vector element in the middle block. This will be the
3440  // initial value for the recurrence when jumping to the scalar loop.
3441  auto *ExtractForScalar = Incoming;
3442  if (VF > 1) {
3444  ExtractForScalar = Builder.CreateExtractElement(
3445  ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3446  }
3447  // Extract the second-to-last element in the middle block if the
3448  // Phi is used outside the loop. We need to extract the phi itself
3449  // and not the last element (the phi update in the current iteration). This
3450  // will be the value when jumping to the exit block from the LoopMiddleBlock,
3451  // when the scalar loop is not run at all.
3452  Value *ExtractForPhiUsedOutsideLoop = nullptr;
3453  if (VF > 1)
3454  ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3455  Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3456  // When the loop is unrolled without being vectorized, initialize
3457  // ExtractForPhiUsedOutsideLoop with the unrolled part just prior to the one
3458  // feeding `Incoming`. This is analogous to the vectorized case above:
3459  // extracting the second-to-last element when VF > 1.
3460  else if (UF > 1)
3461  ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3462 
3463  // Fix the initial value of the original recurrence in the scalar loop.
3465  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3466  for (auto *BB : predecessors(LoopScalarPreHeader)) {
3467  auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3468  Start->addIncoming(Incoming, BB);
3469  }
3470 
3472  Phi->setName("scalar.recur");
3473 
3474  // Finally, fix users of the recurrence outside the loop. The users will need
3475  // either the last value of the scalar recurrence or the last value of the
3476  // vector recurrence we extracted in the middle block. Since the loop is in
3477  // LCSSA form, we just need to find all the phi nodes for the original scalar
3478  // recurrence in the exit block, and then add an edge for the middle block.
3479  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3480  if (LCSSAPhi.getIncomingValue(0) == Phi) {
3481  LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3482  }
3483  }
3484 }
3485 
3486 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3487  Constant *Zero = Builder.getInt32(0);
3488 
3489  // Get its reduction variable descriptor.
3490  assert(Legal->isReductionVariable(Phi) &&
3491  "Unable to find the reduction variable");
3492  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3493 
3494  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3495  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3496  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3497  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3498  RdxDesc.getMinMaxRecurrenceKind();
3499  setDebugLocFromInst(Builder, ReductionStartValue);
3500 
3501  // We need to generate a reduction vector from the incoming scalar.
3502  // To do so, we need to generate the 'identity' vector and override
3503  // one of the elements with the incoming scalar reduction. We need
3504  // to do it in the vector-loop preheader.
3506 
3507  // This is the vector-clone of the value that leaves the loop.
3508  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3509 
3510  // Find the reduction identity variable. Zero for addition, or, and xor;
3511  // one for multiplication; -1 for and.
3512  Value *Identity;
3513  Value *VectorStart;
3514  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3515  RK == RecurrenceDescriptor::RK_FloatMinMax) {
3516  // MinMax reductions have the start value as their identity.
3517  if (VF == 1) {
3518  VectorStart = Identity = ReductionStartValue;
3519  } else {
3520  VectorStart = Identity =
3521  Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3522  }
3523  } else {
3524  // Handle other reduction kinds:
3525  Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3526  RK, VecTy->getScalarType());
3527  if (VF == 1) {
3528  Identity = Iden;
3529  // This vector is the Identity vector where the first element is the
3530  // incoming scalar reduction.
3531  VectorStart = ReductionStartValue;
3532  } else {
3533  Identity = ConstantVector::getSplat(VF, Iden);
3534 
3535  // This vector is the Identity vector where the first element is the
3536  // incoming scalar reduction.
3537  VectorStart =
3538  Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3539  }
3540  }
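 // E.g. (illustrative): for an integer add reduction with start value 5 and
 // VF = 4, Identity is <0, 0, 0, 0> and VectorStart becomes <5, 0, 0, 0>, so
 // the start value is folded in exactly once; the remaining unroll parts are
 // seeded with the pure identity vector below.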
3541 
3542  // Fix the vector-loop phi.
3543 
3544  // Reductions do not have to start at zero. They can start with
3545  // any loop invariant values.
3546  BasicBlock *Latch = OrigLoop->getLoopLatch();
3547  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3548  for (unsigned Part = 0; Part < UF; ++Part) {
3549  Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3550  Value *Val = getOrCreateVectorValue(LoopVal, Part);
3551  // Make sure to add the reduction start value only to the
3552  // first unroll part.
3553  Value *StartVal = (Part == 0) ? VectorStart : Identity;
3554  cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3555  cast<PHINode>(VecRdxPhi)
3556  ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3557  }
3558 
3559  // Before each round, move the insertion point right between
3560  // the PHIs and the values we are going to write.
3561  // This allows us to write both PHINodes and the extractelement
3562  // instructions.
3564 
3565  setDebugLocFromInst(Builder, LoopExitInst);
3566 
3567  // If the vector reduction can be performed in a smaller type, we truncate
3568  // then extend the loop exit value to enable InstCombine to evaluate the
3569  // entire expression in the smaller type.
3570  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3571  Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3574  VectorParts RdxParts(UF);
3575  for (unsigned Part = 0; Part < UF; ++Part) {
3576  RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3577  Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3578  Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3579  : Builder.CreateZExt(Trunc, VecTy);
3580  for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3581  UI != RdxParts[Part]->user_end();)
3582  if (*UI != Trunc) {
3583  (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3584  RdxParts[Part] = Extnd;
3585  } else {
3586  ++UI;
3587  }
3588  }
3590  for (unsigned Part = 0; Part < UF; ++Part) {
3591  RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3592  VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3593  }
3594  }
3595 
3596  // Reduce all of the unrolled parts into a single vector.
3597  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3598  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3599  setDebugLocFromInst(Builder, ReducedPartRdx);
3600  for (unsigned Part = 1; Part < UF; ++Part) {
3601  Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3602  if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3603  // Floating point operations had to be 'fast' to enable the reduction.
3604  ReducedPartRdx = addFastMathFlag(
3606  ReducedPartRdx, "bin.rdx"));
3607  else
3608  ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3609  RdxPart);
3610  }
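 // For example, with UF = 2 and an add reduction this emits a single
 // "bin.rdx" add of the two part vectors; a min/max reduction instead emits a
 // compare-and-select between the parts. The result is then reduced to a
 // scalar by createTargetReduction below when VF > 1.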
3611 
3612  if (VF > 1) {
3613  bool NoNaN = Legal->hasFunNoNaNAttr();
3614  ReducedPartRdx =
3615  createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3616  // If the reduction can be performed in a smaller type, we need to extend
3617  // the reduction to the wider type before we branch to the original loop.
3618  if (Phi->getType() != RdxDesc.getRecurrenceType())
3619  ReducedPartRdx =
3620  RdxDesc.isSigned()
3621  ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3622  : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3623  }
3624 
3625  // Create a phi node that merges control-flow from the backedge-taken check
3626  // block and the middle block.
3627  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3628  LoopScalarPreHeader->getTerminator());
3629  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3630  BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3631  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3632 
3633  // Now, we need to fix the users of the reduction variable
3634  // inside and outside of the scalar remainder loop.
3635  // We know that the loop is in LCSSA form. We need to update the
3636  // PHI nodes in the exit blocks.
3637  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3638  // All PHINodes need to have a single entry edge, or two if
3639  // we already fixed them.
3640  assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3641 
3642  // We found a reduction value exit-PHI. Update it with the
3643  // incoming bypass edge.
3644  if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3645  LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3646  } // end of the LCSSA phi scan.
3647 
3648  // Fix the scalar loop reduction variable with the incoming reduction sum
3649  // from the vector body and from the backedge value.
3650  int IncomingEdgeBlockIdx =
3651  Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3652  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3653  // Pick the other block.
3654  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3655  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3656  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3657 }
3658 
3659 void InnerLoopVectorizer::fixLCSSAPHIs() {
3660  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3661  if (LCSSAPhi.getNumIncomingValues() == 1) {
3662  auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3663  // Non-instruction incoming values will have only one value.
3664  unsigned LastLane = 0;
3665  if (isa<Instruction>(IncomingValue))
3666  LastLane = Cost->isUniformAfterVectorization(
3667  cast<Instruction>(IncomingValue), VF)
3668  ? 0
3669  : VF - 1;
3670  // Can be a loop invariant incoming value or the last scalar value to be
3671  // extracted from the vectorized loop.
3673  Value *lastIncomingValue =
3674  getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3675  LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3676  }
3677  }
3678 }
3679 
3680 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3681  // The basic block and loop containing the predicated instruction.
3682  auto *PredBB = PredInst->getParent();
3683  auto *VectorLoop = LI->getLoopFor(PredBB);
3684 
3685  // Initialize a worklist with the operands of the predicated instruction.
3686  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3687 
3688  // Holds instructions that we need to analyze again. An instruction may be
3689  // reanalyzed if we don't yet know if we can sink it or not.
3690  SmallVector<Instruction *, 8> InstsToReanalyze;
3691 
3692  // Returns true if a given use occurs in the predicated block. Phi nodes use
3693  // their operands in their corresponding predecessor blocks.
3694  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3695  auto *I = cast<Instruction>(U.getUser());
3696  BasicBlock *BB = I->getParent();
3697  if (auto *Phi = dyn_cast<PHINode>(I))
3698  BB = Phi->getIncomingBlock(
3699  PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3700  return BB == PredBB;
3701  };
3702 
3703  // Iteratively sink the scalarized operands of the predicated instruction
3704  // into the block we created for it. When an instruction is sunk, its
3705  // operands are then added to the worklist. The algorithm ends after one pass
3706  // through the worklist fails to sink a single instruction.
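 // For example, if a scalarized conditional store was given its own
 // predicated block, an address computation feeding only that store, such as
 //   %gep = getelementptr i32, i32* %base, i64 %idx   ; used only by the store
 // can be moved into the predicated block, after which %idx's definition may
 // become sinkable too, which is why the worklist is revisited.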
3707  bool Changed;
3708  do {
3709  // Add the instructions that need to be reanalyzed to the worklist, and
3710  // reset the changed indicator.
3711  Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3712  InstsToReanalyze.clear();
3713  Changed = false;
3714 
3715  while (!Worklist.empty()) {
3716  auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3717 
3718  // We can't sink an instruction if it is a phi node, is already in the
3719  // predicated block, is not in the loop, or may have side effects.
3720  if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3721  !VectorLoop->contains(I) || I->mayHaveSideEffects())
3722  continue;
3723 
3724  // It's legal to sink the instruction if all its uses occur in the
3725  // predicated block. Otherwise, there's nothing to do yet, and we may
3726  // need to reanalyze the instruction.
3727  if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3728  InstsToReanalyze.push_back(I);
3729  continue;
3730  }
3731 
3732  // Move the instruction to the beginning of the predicated block, and add
3733  // its operands to the worklist.
3734  I->moveBefore(&*PredBB->getFirstInsertionPt());
3735  Worklist.insert(I->op_begin(), I->op_end());
3736 
3737  // The sinking may have enabled other instructions to be sunk, so we will
3738  // need to iterate.
3739  Changed = true;
3740  }
3741  } while (Changed);
3742 }
3743 
3744 void InnerLoopVectorizer::fixNonInductionPHIs() {
3745  for (PHINode *OrigPhi : OrigPHIsToFix) {
3746  PHINode *NewPhi =
3747  cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3748  unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3749 
3750  SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3751  predecessors(OrigPhi->getParent()));
3752  SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3753  predecessors(NewPhi->getParent()));
3754  assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3755  "Scalar and Vector BB should have the same number of predecessors");
3756 
3757  // The insertion point in Builder may be invalidated by the time we get
3758  // here. Force the Builder insertion point to something valid so that we do
3759  // not run into issues during insertion point restore in
3760  // getOrCreateVectorValue calls below.
3761  Builder.SetInsertPoint(NewPhi);
3762 
3763  // The predecessor order is preserved and we can rely on mapping between
3764  // scalar and vector block predecessors.
3765  for (unsigned i = 0; i < NumIncomingValues; ++i) {
3766  BasicBlock *NewPredBB = VectorBBPredecessors[i];
3767 
3768  // When looking up the new scalar/vector values to fix up, use incoming
3769  // values from original phi.
3770  Value *ScIncV =
3771  OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3772 
3773  // Scalar incoming value may need a broadcast
3774  Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3775  NewPhi->addIncoming(NewIncV, NewPredBB);
3776  }
3777  }
3778 }
3779 
3780 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
3781  unsigned VF) {
3782  PHINode *P = cast<PHINode>(PN);
3783  if (EnableVPlanNativePath) {
3784  // Currently we enter here in the VPlan-native path for non-induction
3785  // PHIs where all control flow is uniform. We simply widen these PHIs.
3786  // Create a vector phi with no operands - the vector phi operands will be
3787  // set at the end of vector code generation.
3788  Type *VecTy =
3789  (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3790  Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3791  VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3792  OrigPHIsToFix.push_back(P);
3793 
3794  return;
3795  }
3796 
3797  assert(PN->getParent() == OrigLoop->getHeader() &&
3798  "Non-header phis should have been handled elsewhere");
3799 
3800  // In order to support recurrences we need to be able to vectorize Phi nodes.
3801  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3802  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3803  // this value when we vectorize all of the instructions that use the PHI.
3805  for (unsigned Part = 0; Part < UF; ++Part) {
3806  // This is phase one of vectorizing PHIs.
3807  Type *VecTy =
3808  (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3809  Value *EntryPart = PHINode::Create(
3810  VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3811  VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3812  }
3813  return;
3814  }
3815 
3817 
3818  // This PHINode must be an induction variable.
3819  // Make sure that we know about it.
3820  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
3821 
3822  InductionDescriptor II = Legal->getInductionVars()->lookup(P);
3823  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3824 
3825  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
3826  // which can be found from the original scalar operations.
3827  switch (II.getKind()) {
3828  case InductionDescriptor::IK_NoInduction:
3829  llvm_unreachable("Unknown induction");
3830  case InductionDescriptor::IK_IntInduction:
3831  case InductionDescriptor::IK_FpInduction:
3832  llvm_unreachable("Integer/fp induction is handled elsewhere.");
3833  case InductionDescriptor::IK_PtrInduction: {
3834  // Handle the pointer induction variable case.
3835  assert(P->getType()->isPointerTy() && "Unexpected type.");
3836  // This is the normalized GEP that starts counting at zero.
3837  Value *PtrInd = Induction;
3838  PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
3839  // Determine the number of scalars we need to generate for each unroll
3840  // iteration. If the instruction is uniform, we only need to generate the
3841  // first lane. Otherwise, we generate all VF values.
3842  unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
3843  // These are the scalar results. Notice that we don't generate vector GEPs
3844  // because scalar GEPs result in better code.
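 // E.g. with VF = 4 and UF = 1, lanes 0..3 get "next.gep" addresses computed
 // from PtrInd + 0, PtrInd + 1, PtrInd + 2 and PtrInd + 3, each run through
 // emitTransformedIndex to apply the induction's start and step.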
3845  for (unsigned Part = 0; Part < UF; ++Part) {
3846  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
3847  Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
3848  Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
3849  Value *SclrGep =
3850  emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
3851  SclrGep->setName("next.gep");
3852  VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
3853  }
3854  }
3855  return;
3856  }
3857  }
3858 }
3859 
3860 /// A helper function for checking whether an integer division-related
3861 /// instruction may divide by zero (in which case it must be predicated if
3862 /// executed conditionally in the scalar code).
3863 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
3864 /// Non-zero divisors that are not compile-time constants will not be
3865 /// converted into multiplication, so we will still end up scalarizing
3866 /// the division, but can do so w/o predication.
3867 static bool mayDivideByZero(Instruction &I) {
3868  assert((I.getOpcode() == Instruction::UDiv ||
3869  I.getOpcode() == Instruction::SDiv ||
3870  I.getOpcode() == Instruction::URem ||
3871  I.getOpcode() == Instruction::SRem) &&
3872  "Unexpected instruction");
3873  Value *Divisor = I.getOperand(1);
3874  auto *CInt = dyn_cast<ConstantInt>(Divisor);
3875  return !CInt || CInt->isZero();
3876 }
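// For example, "udiv i32 %x, %d" with a loop-varying divisor %d returns true
// here (the divisor may be zero, so the instruction must be predicated if it
// executes conditionally), whereas "udiv i32 %x, 7" returns false and can be
// scalarized without predication.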
3877 
3878 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
3879  switch (I.getOpcode()) {
3880  case Instruction::Br:
3881  case Instruction::PHI:
3882  llvm_unreachable("This instruction is handled by a different recipe.");
3883  case Instruction::GetElementPtr: {
3884  // Construct a vector GEP by widening the operands of the scalar GEP as
3885  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
3886  // results in a vector of pointers when at least one operand of the GEP
3887  // is vector-typed. Thus, to keep the representation compact, we only use
3888  // vector-typed operands for loop-varying values.
3889  auto *GEP = cast<GetElementPtrInst>(&I);
3890 
3891  if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
3892  // If we are vectorizing, but the GEP has only loop-invariant operands,
3893  // the GEP we build (by only using vector-typed operands for
3894  // loop-varying values) would be a scalar pointer. Thus, to ensure we
3895  // produce a vector of pointers, we need to either arbitrarily pick an
3896  // operand to broadcast, or broadcast a clone of the original GEP.
3897  // Here, we broadcast a clone of the original.
3898  //
3899  // TODO: If at some point we decide to scalarize instructions having
3900  // loop-invariant operands, this special case will no longer be
3901  // required. We would add the scalarization decision to
3902  // collectLoopScalars() and teach getVectorValue() to broadcast
3903  // the lane-zero scalar value.
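 // Shorthand illustration (VF = 4): the scalar clone %g of the invariant GEP
 // is turned into <%g, %g, %g, %g> via an insertelement + shufflevector splat
 // (what CreateVectorSplat emits below), one splat per unroll part.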
3904  auto *Clone = Builder.Insert(GEP->clone());
3905  for (unsigned Part = 0; Part < UF; ++Part) {
3906  Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
3907  VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
3908  addMetadata(EntryPart, GEP);
3909  }
3910  } else {
3911  // If the GEP has at least one loop-varying operand, we are sure to
3912  // produce a vector of pointers. But if we are only unrolling, we want
3913  // to produce a scalar GEP for each unroll part. Thus, the GEP we
3914  // produce with the code below will be scalar (if VF == 1) or vector
3915  // (otherwise). Note that for the unroll-only case, we still maintain
3916  // values in the vector mapping with initVector, as we do for other
3917  // instructions.
3918  for (unsigned Part = 0; Part < UF; ++Part) {
3919  // The pointer operand of the new GEP. If it's loop-invariant, we
3920  // won't broadcast it.
3921  auto *Ptr =
3922  OrigLoop->isLoopInvariant(GEP->getPointerOperand())
3923  ? GEP->getPointerOperand()
3924  : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
3925 
3926  // Collect all the indices for the new GEP. If any index is
3927  // loop-invariant, we won't broadcast it.
3928  SmallVector<Value *, 4> Indices;
3929  for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
3930  if (OrigLoop->isLoopInvariant(U.get()))
3931  Indices.push_back(U.get());
3932  else
3933  Indices.push_back(getOrCreateVectorValue(U.get(), Part));
3934  }
3935 
3936  // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
3937  // but it should be a vector, otherwise.
3938  auto *NewGEP = GEP->isInBounds()
3939  ? Builder.CreateInBoundsGEP(Ptr, Indices)
3940  : Builder.CreateGEP(Ptr, Indices);
3941  assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
3942  "NewGEP is not a pointer vector");
3943  VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
3944  addMetadata(NewGEP, GEP);
3945  }
3946  }
3947 
3948  break;
3949  }
3950  case Instruction::UDiv:
3951  case Instruction::SDiv:
3952  case Instruction::SRem:
3953  case Instruction::URem:
3954  case Instruction::Add:
3955  case Instruction::FAdd:
3956  case Instruction::Sub:
3957  case Instruction::FSub:
3958  case Instruction::Mul:
3959  case Instruction::FMul:
3960  case Instruction::FDiv:
3961  case Instruction::FRem:
3962  case Instruction::Shl:
3963  case Instruction::LShr:
3964  case Instruction::AShr:
3965  case Instruction::And:
3966  case Instruction::Or:
3967  case Instruction::Xor: {
3968  // Just widen binops.
3969  auto *BinOp = cast<BinaryOperator>(&I);
3970  setDebugLocFromInst(Builder, BinOp);
3971 
3972  for (unsigned Part = 0; Part < UF; ++Part) {
3973  Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part);
3974  Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part);
3975  Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
3976 
3977  if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
3978  VecOp->copyIRFlags(BinOp);
3979 
3980  // Use this vector value for all users of the original instruction.
3981  VectorLoopValueMap.setVectorValue(&I, Part, V);
3982  addMetadata(V, BinOp);
3983  }
3984 
3985  break;
3986  }
3987  case Instruction::Select: {
3988  // Widen selects.
3989  // If the selector is loop invariant we can create a select
3990  // instruction with a scalar condition. Otherwise, use vector-select.
3991  auto *SE = PSE.getSE();
3992  bool InvariantCond =
3995 
3996  // The condition can be loop invariant but still defined inside the
3997  // loop. This means that we can't just use the original 'cond' value.
3998  // We have to take the 'vectorized' value and pick the first lane.
3999  // InstCombine will make this a no-op.
4000 
4001  auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4002 
4003  for (unsigned Part = 0; Part < UF; ++Part) {
4004  Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4005  Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4006  Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4007  Value *Sel =
4008  Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4009  VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4010  addMetadata(Sel, &I);
4011  }
4012 
4013  break;
4014  }
4015 
4016  case Instruction::ICmp:
4017  case Instruction::FCmp: {
4018  // Widen compares. Generate vector compares.
4019  bool FCmp = (I.getOpcode() == Instruction::FCmp);
4020  auto *Cmp = dyn_cast<CmpInst>(&I);
4022  for (unsigned Part = 0; Part < UF; ++Part) {
4023  Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4024  Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4025  Value *C = nullptr;
4026  if (FCmp) {
4027  // Propagate fast math flags.
4029  Builder.setFastMathFlags(Cmp->getFastMathFlags());
4030  C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4031  } else {
4032  C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4033  }
4034  VectorLoopValueMap.setVectorValue(&I, Part, C);
4035  addMetadata(C, &I);
4036  }
4037 
4038  break;
4039  }
4040 
4041  case Instruction::ZExt:
4042  case Instruction::SExt:
4043  case Instruction::FPToUI:
4044  case Instruction::FPToSI:
4045  case Instruction::FPExt:
4046  case Instruction::PtrToInt:
4047  case Instruction::IntToPtr:
4048  case Instruction::SIToFP:
4049  case Instruction::UIToFP:
4050  case Instruction::Trunc:
4051  case Instruction::FPTrunc:
4052  case Instruction::BitCast: {
4053  auto *CI = dyn_cast<CastInst>(&I);
4055 
4056  /// Vectorize casts.
4057  Type *DestTy =
4058  (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4059 
4060  for (unsigned Part = 0; Part < UF; ++Part) {
4061  Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4062  Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4063  VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4064  addMetadata(Cast, &I);
4065  }
4066  break;
4067  }
4068 
4069  case Instruction::Call: {
4070  // Ignore dbg intrinsics.
4071  if (isa<DbgInfoIntrinsic>(I))
4072  break;
4074 
4075  Module *M = I.getParent()->getParent()->getParent();
4076  auto *CI = cast<CallInst>(&I);
4077 
4078  StringRef FnName = CI->getCalledFunction()->getName();
4079  Function *F = CI->getCalledFunction();
4080  Type *RetTy = ToVectorTy(CI->getType(), VF);
4081  SmallVector<Type *, 4> Tys;
4082  for (Value *ArgOperand : CI->arg_operands())
4083  Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4084 
4085  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4086 
4087  // The flag shows whether we use an intrinsic or a regular call for the
4088  // vectorized version of the instruction.
4089  // Is it beneficial to perform the intrinsic call compared to the lib call?
4090  bool NeedToScalarize;
4091  unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
4092  bool UseVectorIntrinsic =
4093  ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
4094  assert((UseVectorIntrinsic || !NeedToScalarize) &&
4095  "Instruction should be scalarized elsewhere.");
4096 
4097  for (unsigned Part = 0; Part < UF; ++Part) {
4098  SmallVector<Value *, 4> Args;
4099  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4100  Value *Arg = CI->getArgOperand(i);
4101  // Some intrinsics have a scalar argument - don't replace it with a
4102  // vector.
4103  if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4104  Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4105  Args.push_back(Arg);
4106  }
4107 
4108  Function *VectorF;
4109  if (UseVectorIntrinsic) {
4110  // Use vector version of the intrinsic.
4111  Type *TysForDecl[] = {CI->getType()};
4112  if (VF > 1)
4113  TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4114  VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4115  } else {
4116  // Use vector version of the library call.
4117  StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4118  assert(!VFnName.empty() && "Vector function name is empty.");
4119  VectorF = M->getFunction(VFnName);
4120  if (!VectorF) {
4121  // Generate a declaration
4122  FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4123  VectorF =
4124  Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4125  VectorF->copyAttributesFrom(F);
4126  }
4127  }
4128  assert(VectorF && "Can't create vector function.");
4129 
4130  SmallVector<OperandBundleDef, 1> OpBundles;
4131  CI->getOperandBundlesAsDefs(OpBundles);
4132  CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4133 
4134  if (isa<FPMathOperator>(V))
4135  V->copyFastMathFlags(CI);
4136 
4137  VectorLoopValueMap.setVectorValue(&I, Part, V);
4138  addMetadata(V, &I);
4139  }
4140 
4141  break;
4142  }
4143 
4144  default:
4145  // This instruction is not vectorized by simple widening.
4146  LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4147  llvm_unreachable("Unhandled instruction!");
4148  } // end of switch.
4149 }
4150 
4151 void InnerLoopVectorizer::updateAnalysis() {
4152  // Forget the original basic block.
4154 
4155  // DT is not kept up-to-date for outer loop vectorization.
4156  if (EnableVPlanNativePath)
4157  return;
4158 
4159  // Update the dominator tree information.
4161  "Entry does not dominate exit.");
4162 
4169 }
4170 
4171 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4172  // We should not collect Scalars more than once per VF. Right now, this
4173  // function is called from collectUniformsAndScalars(), which already does
4174  // this check. Collecting Scalars for VF=1 does not make any sense.
4175  assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4176  "This function should not be visited twice for the same VF");
4177 
4178  SmallSetVector<Instruction *, 8> Worklist;
4179 
4180  // These sets are used to seed the analysis with pointers used by memory
4181  // accesses that will remain scalar.
4182  SmallSetVector<Instruction *, 8> ScalarPtrs;
4183  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4184 
4185  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4186  // The pointer operands of loads and stores will be scalar as long as the
4187  // memory access is not a gather or scatter operation. The value operand of a
4188  // store will remain scalar if the store is scalarized.
4189  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4190  InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4191  assert(WideningDecision != CM_Unknown &&
4192  "Widening decision should be ready at this moment");
4193  if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4194  if (Ptr == Store->getValueOperand())
4195  return WideningDecision == CM_Scalarize;
4196  assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4197  "Ptr is neither a value or pointer operand");
4198  return WideningDecision != CM_GatherScatter;
4199  };
4200 
4201  // A helper that returns true if the given value is a bitcast or
4202  // getelementptr instruction contained in the loop.
4203  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4204  return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4205  isa<GetElementPtrInst>(V)) &&
4206  !TheLoop->isLoopInvariant(V);
4207  };
4208 
4209  // A helper that evaluates a memory access's use of a pointer. If the use
4210  // will be a scalar use, and the pointer is only used by memory accesses, we
4211  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4212  // PossibleNonScalarPtrs.
4213  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4214  // We only care about bitcast and getelementptr instructions contained in
4215  // the loop.
4216  if (!isLoopVaryingBitCastOrGEP(Ptr))
4217  return;
4218 
4219  // If the pointer has already been identified as scalar (e.g., if it was
4220  // also identified as uniform), there's nothing to do.
4221  auto *I = cast<Instruction>(Ptr);
4222  if (Worklist.count(I))
4223  return;
4224 
4225  // If the use of the pointer will be a scalar use, and all users of the
4226  // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4227  // place the pointer in PossibleNonScalarPtrs.
4228  if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4229  return isa<LoadInst>(U) || isa<StoreInst>(U);
4230  }))
4231  ScalarPtrs.insert(I);
4232  else
4233  PossibleNonScalarPtrs.insert(I);
4234  };
4235 
4236  // We seed the scalars analysis with three classes of instructions: (1)
4237  // instructions marked uniform-after-vectorization, (2) bitcast and
4238  // getelementptr instructions used by memory accesses requiring a scalar use,
4239  // and (3) pointer induction variables and their update instructions (we
4240  // currently only scalarize these).
4241  //
4242  // (1) Add to the worklist all instructions that have been identified as
4243  // uniform-after-vectorization.
4244  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4245 
4246  // (2) Add to the worklist all bitcast and getelementptr instructions used by
4247  // memory accesses requiring a scalar use. The pointer operands of loads and
4248  // stores will be scalar as long as the memory access is not a gather or
4249  // scatter operation. The value operand of a store will remain scalar if the
4250  // store is scalarized.
4251  for (auto *BB : TheLoop->blocks())
4252  for (auto &I : *BB) {
4253  if (auto *Load = dyn_cast<LoadInst>(&I)) {
4254  evaluatePtrUse(Load, Load->getPointerOperand());
4255  } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4256  evaluatePtrUse(Store, Store->getPointerOperand());
4257  evaluatePtrUse(Store, Store->getValueOperand());
4258  }
4259  }
4260  for (auto *I : ScalarPtrs)
4261  if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4262  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4263  Worklist.insert(I);
4264  }
4265 
4266  // (3) Add to the worklist all pointer induction variables and their update
4267  // instructions.
4268  //
4269  // TODO: Once we are able to vectorize pointer induction variables we should
4270  // no longer insert them into the worklist here.
4271  auto *Latch = TheLoop->getLoopLatch();
4272  for (auto &Induction : *Legal->getInductionVars()) {
4273  auto *Ind = Induction.first;
4274  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4275  if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4276  continue;
4277  Worklist.insert(Ind);
4278  Worklist.insert(IndUpdate);
4279  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4280  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4281  << "\n");
4282  }
4283 
4284  // Insert the forced scalars.
4285  // FIXME: Currently widenPHIInstruction() often creates a dead vector
4286  // induction variable when the PHI user is scalarized.
4287  auto ForcedScalar = ForcedScalars.find(VF);
4288  if (ForcedScalar != ForcedScalars.end())
4289  for (auto *I : ForcedScalar->second)
4290  Worklist.insert(I);
4291 
4292  // Expand the worklist by looking through any bitcasts and getelementptr
4293  // instructions we've already identified as scalar. This is similar to the
4294  // expansion step in collectLoopUniforms(); however, here we're only
4295  // expanding to include additional bitcasts and getelementptr instructions.
4296  unsigned Idx = 0;
4297  while (Idx != Worklist.size()) {
4298  Instruction *Dst = Worklist[Idx++];
4299  if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4300  continue;
4301  auto *Src = cast<Instruction>(Dst->getOperand(0));
4302  if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4303  auto *J = cast<Instruction>(U);
4304  return !TheLoop->contains(J) || Worklist.count(J) ||
4305  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4306  isScalarUse(J, Src));
4307  })) {
4308  Worklist.insert(Src);
4309  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4310  }
4311  }
4312 
4313  // An induction variable will remain scalar if all users of the induction
4314  // variable and induction variable update remain scalar.
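  // For example, an induction variable used only by its own update and by
  // address computations that are themselves scalar does not need a vector
  // version, so both the PHI and the update can stay scalar.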
4315  for (auto &Induction : *Legal->getInductionVars()) {
4316  auto *Ind = Induction.first;
4317  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4318 
4319  // We already considered pointer induction variables, so there's no reason
4320  // to look at their users again.
4321  //
4322  // TODO: Once we are able to vectorize pointer induction variables we
4323  // should no longer skip over them here.
4324  if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4325  continue;
4326 
4327  // Determine if all users of the induction variable are scalar after
4328  // vectorization.
4329  auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4330  auto *I = cast<Instruction>(U);
4331  return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4332  });
4333  if (!ScalarInd)
4334  continue;
4335 
4336  // Determine if all users of the induction variable update instruction are
4337  // scalar after vectorization.
4338  auto ScalarIndUpdate =
4339  llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4340  auto *I = cast<Instruction>(U);
4341  return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4342  });
4343  if (!ScalarIndUpdate)
4344  continue;
4345 
4346  // The induction variable and its update instruction will remain scalar.
4347  Worklist.insert(Ind);
4348  Worklist.insert(IndUpdate);
4349  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4350  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4351  << "\n");
4352  }
4353 
4354  Scalars[VF].insert(Worklist.begin(), Worklist.end());
4355 }
4356 
4357 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4358  if (!blockNeedsPredication(I->getParent()))
4359  return false;
4360  switch(I->getOpcode()) {
4361  default:
4362  break;
4363  case Instruction::Load:
4364  case Instruction::Store: {
4365  if (!Legal->isMaskRequired(I))
4366  return false;
4367  auto *Ptr = getLoadStorePointerOperand(I);
4368  auto *Ty = getMemInstValueType(I);
4369  // We have already decided how to vectorize this instruction; get that
4370  // result.
4371  if (VF > 1) {
4372  InstWidening WideningDecision = getWideningDecision(I, VF);
4373  assert(WideningDecision != CM_Unknown &&
4374  "Widening decision should be ready at this moment");
4375  return WideningDecision == CM_Scalarize;
4376  }
4377  return isa<LoadInst>(I) ?
4378  !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
4379  : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4380  }
4381  case Instruction::UDiv:
4382  case Instruction::SDiv:
4383  case Instruction::SRem:
4384  case Instruction::URem:
4385  return mayDivideByZero(*I);
4386  }
4387  return false;
4388 }
4389 
4390 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4391  unsigned VF) {
4392  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4393  assert(getWideningDecision(I, VF) == CM_Unknown &&
4394  "Decision should not be set yet.");
4395  auto *Group = getInterleavedAccessGroup(I);
4396  assert(Group && "Must have a group.");
4397 
4398  // Check if masking is required.
4399  // A Group may need masking for one of two reasons: it resides in a block that
4400  // needs predication, or it was decided to use masking to deal with gaps.
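  // For example (illustrative), a group accessing members {0, 2} of a
  // 3-member interleave factor has a gap; without a scalar epilogue, the
  // final wide access could touch memory the original scalar loop never
  // accesses, so the group must be executed with a mask instead.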
4401  bool PredicatedAccessRequiresMasking =
4402  Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4403  bool AccessWithGapsRequiresMasking =
4404  Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
4405  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4406  return true;
4407 
4408  // If masked interleaving is required, we expect that the user/target had
4409  // enabled it, because otherwise the group either wouldn't have been created
4410  // or it would have been invalidated by the CostModel.
4411  assert(useMaskedInterleavedAccesses(TTI) &&
4412  "Masked interleave-groups for predicated accesses are not enabled.");
4413 
4414  auto *Ty = getMemInstValueType(I);
4415  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
4416  : TTI.isLegalMaskedStore(Ty);
4417 }
4418 
4419 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4420  unsigned VF) {
4421  // Get and ensure we have a valid memory instruction.
4422  LoadInst *LI = dyn_cast<LoadInst>(I);
4423  StoreInst *SI = dyn_cast<StoreInst>(I);
4424  assert((LI || SI) && "Invalid memory instruction");
4425 
4426  auto *Ptr = getLoadStorePointerOperand(I);
4427 
4428  // To be widened, the pointer must first of all be consecutive.
4429  if (!Legal->isConsecutivePtr(Ptr))
4430  return false;
4431 
4432  // If the instruction is a store located in a predicated block, it will be
4433  // scalarized.
4434  if (isScalarWithPredication(I))
4435  return false;
4436 
4437  // If the instruction's allocated size doesn't equal its type size, it
4438  // requires padding and will be scalarized.
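  // For example, an i1 value has a type size of 1 bit but an allocation size
  // of 8 bits, so a <VF x i1> vector is not layout-compatible with VF
  // consecutive i1 values in memory and the access is scalarized.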
4439  auto &DL = I->getModule()->getDataLayout();
4440  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4441  if (hasIrregularType(ScalarTy, DL, VF))
4442  return false;
4443 
4444  return true;
4445 }
4446 
4447 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4448  // We should not collect Uniforms more than once per VF. Right now,
4449  // this function is called from collectUniformsAndScalars(), which
4450  // already does this check. Collecting Uniforms for VF=1 does not make any
4451  // sense.
4452 
4453  assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4454  "This function should not be visited twice for the same VF");
4455 
4456  // Visit the list of Uniforms. If we find no uniform value, we will not
4457  // analyze it again; Uniforms.count(VF) will still return 1.
4458  Uniforms[VF].clear();
4459 
4460  // We now know that the loop is vectorizable!
4461  // Collect instructions inside the loop that will remain uniform after
4462  // vectorization.
4463 
4464  // Global values, params and instructions outside of current loop are out of
4465  // scope.
4466  auto isOutOfScope = [&](Value *V) -> bool {
4467  Instruction *I = dyn_cast<Instruction>(V);
4468  return (!I || !TheLoop->contains(I));
4469  };
4470 
4471  SetVector<Instruction *> Worklist;
4472  BasicBlock *Latch = TheLoop->getLoopLatch();
4473 
4474  // Start with the conditional branch. If the branch condition is an
4475  // instruction contained in the loop that is only used by the branch, it is
4476  // uniform.
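  // For example, a latch of the form (names illustrative)
  //   %cond = icmp ult i64 %iv.next, %n
  //   br i1 %cond, label %loop, label %exit
  // needs only one scalar copy of %cond per vector iteration, since it just
  // controls the back edge, so %cond is uniform.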
4477  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4478  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4479  Worklist.insert(Cmp);
4480  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4481  }
4482 
4483  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4484  // are pointers that are treated like consecutive pointers during
4485  // vectorization. The pointer operands of interleaved accesses are an
4486  // example.
4487  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4488 
4489  // Holds pointer operands of instructions that are possibly non-uniform.
4490  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4491 
4492  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4493  InstWidening WideningDecision = getWideningDecision(I, VF);
4494  assert(WideningDecision != CM_Unknown &&
4495  "Widening decision should be ready at this moment");
4496 
4497  return (WideningDecision == CM_Widen ||
4498  WideningDecision == CM_Widen_Reverse ||
4499  WideningDecision == CM_Interleave);
4500  };
4501  // Iterate over the instructions in the loop, and collect all
4502  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4503  // that a consecutive-like pointer operand will be scalarized, we collect it
4504  // in PossibleNonUniformPtrs instead. We use two sets here because a single
4505  // getelementptr instruction can be used by both vectorized and scalarized
4506  // memory instructions. For example, if a loop loads and stores from the same
4507  // location, but the store is conditional, the store will be scalarized, and
4508  // the getelementptr won't remain uniform.
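  // As an IR sketch of that example (illustrative):
  //   %gep = getelementptr inbounds i32, i32* %a, i64 %i
  //   %v   = load i32, i32* %gep        ; widened, consecutive
  //   ...
  //   store i32 %new, i32* %gep         ; predicated, scalarized
  // The widened load needs only a scalar %gep, but the scalarized store
  // needs an address per lane, so %gep lands in PossibleNonUniformPtrs.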
4509  for (auto *BB : TheLoop->blocks())
4510  for (auto &I : *BB) {
4511  // If there's no pointer operand, there's nothing to do.
4512  auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4513  if (!Ptr)
4514  continue;
4515 
4516  // True if all users of Ptr are memory accesses that have Ptr as their
4517  // pointer operand.
4518  auto UsersAreMemAccesses =
4519  llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4520  return getLoadStorePointerOperand(U) == Ptr;
4521  });
4522 
4523  // Ensure the memory instruction will not be scalarized or used by
4524  // gather/scatter, making its pointer operand non-uniform. If the pointer
4525  // operand is used by any instruction other than a memory access, we
4526  // conservatively assume the pointer operand may be non-uniform.
4527  if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4528  PossibleNonUniformPtrs.insert(Ptr);
4529 
4530  // If the memory instruction will be vectorized and its pointer operand
4531  // is consecutive-like, or interleaving - the pointer operand should
4532  // remain uniform.
4533  else
4534  ConsecutiveLikePtrs.insert(Ptr);
4535  }
4536 
4537  // Add to the Worklist all consecutive and consecutive-like pointers that
4538  // aren't also identified as possibly non-uniform.
4539  for (auto *V : ConsecutiveLikePtrs)
4540  if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4541  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4542  Worklist.insert(V);
4543  }
4544 
4545  // Expand Worklist in topological order: whenever a new instruction
4546  // is added, its users should already be inside Worklist. This ensures
4547  // a uniform instruction will only be used by uniform instructions.
4548  unsigned idx = 0;
4549  while (idx != Worklist.size()) {
4550  Instruction *I = Worklist[idx++];
4551 
4552  for (auto OV : I->operand_values()) {
4553  // isOutOfScope operands cannot be uniform instructions.
4554  if (isOutOfScope(OV))
4555  continue;
4556  // First-order recurrence PHIs should typically be considered
4557  // non-uniform.
4558  auto *OP = dyn_cast<PHINode>(OV);
4559  if (OP && Legal->isFirstOrderRecurrence(OP))
4560  continue;
4561  // If all the users of the operand are uniform, then add the
4562  // operand into the uniform worklist.
4563  auto *OI = cast<Instruction>(OV);
4564  if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4565  auto *J = cast<Instruction>(U);
4566  return Worklist.count(J) ||
4567  (OI == getLoadStorePointerOperand(J) &&
4568  isUniformDecision(J, VF));
4569  })) {
4570  Worklist.insert(OI);
4571  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4572  }
4573  }
4574  }
4575 
4576  // Returns true if Ptr is the pointer operand of a memory access instruction
4577  // I, and I is known to not require scalarization.
4578  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4579  return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4580  };
4581 
4582  // For an instruction to be added into Worklist above, all its users inside
4583  // the loop should also be in Worklist. However, this condition cannot be
4584  // true for phi nodes that form a cyclic dependence. We must process phi
4585  // nodes separately. An induction variable will remain uniform if all users
4586  // of the induction variable and induction variable update remain uniform.
4587  // The code below handles both pointer and non-pointer induction variables.
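  // For example, an induction variable whose only users are its own update
  // and the pointer operands of consecutive, widened memory accesses remains
  // uniform: only the first-lane value is needed to form those addresses.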
4588  for (auto &Induction : *Legal->getInductionVars()) {
4589  auto *Ind = Induction.first;
4590  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4591 
4592  // Determine if all users of the induction variable are uniform after
4593  // vectorization.
4594  auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4595  auto *I = cast<Instruction>(U);
4596  return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4597  isVectorizedMemAccessUse(I, Ind);
4598  });
4599  if (!UniformInd)
4600  continue;
4601 
4602  // Determine if all users of the induction variable update instruction are
4603  // uniform after vectorization.
4604  auto UniformIndUpdate =
4605  llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4606  auto *I = cast<Instruction>(U);
4607  return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4608  isVectorizedMemAccessUse(I, IndUpdate);
4609  });
4610  if (!UniformIndUpdate)
4611  continue;
4612 
4613  // The induction variable and its update instruction will remain uniform.
4614  Worklist.insert(Ind);
4615  Worklist.insert(IndUpdate);
4616  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4617  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4618  << "\n");
4619  }
4620 
4621  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4622 }
4623 
4624 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
4625  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4626  // TODO: It may be useful to do so, since it is still likely to be
4627  // dynamically uniform if the target can skip.
4628  LLVM_DEBUG(
4629  dbgs() << "LV: Not inserting runtime ptr check for divergent target");
4630 
4631  ORE->emit(
4632  createMissedAnalysis("CantVersionLoopWithDivergentTarget")
4633  << "runtime pointer checks needed. Not enabled for divergent target");
4634 
4635  return None;
4636  }
4637 
4638  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4639  if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
4640  return computeFeasibleMaxVF(OptForSize, TC);
4641 
4642  if (Legal->getRuntimePointerChecking()->Need) {
4643  ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4644  << "runtime pointer checks needed. Enable vectorization of this "
4645  "loop with '#pragma clang loop vectorize(enable)' when "
4646  "compiling with -Os/-Oz");
4647  LLVM_DEBUG(
4648  dbgs()
4649  << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
4650  return None;
4651  }
4652 
4653  if (!PSE.getUnionPredicate().getPredicates().empty()) {
4654  ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4655  << "runtime SCEV checks needed. Enable vectorization of this "
4656  "loop with '#pragma clang loop vectorize(enable)' when "
4657  "compiling with -Os/-Oz");
4658  LLVM_DEBUG(
4659  dbgs()
4660  << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n");
4661  return None;
4662  }
4663 
4664  // FIXME: Avoid specializing for stride==1 instead of bailing out.
4665  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4666  ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4667  << "runtime stride == 1 checks needed. Enable vectorization of "
4668  "this loop with '#pragma clang loop vectorize(enable)' when "
4669  "compiling with -Os/-Oz");
4670  LLVM_DEBUG(
4671  dbgs()
4672  << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n");
4673  return None;
4674  }
4675 
4676  // If we optimize the program for size, avoid creating the tail loop.
4677  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4678 
4679  if (TC == 1) {
4680  ORE->emit(createMissedAnalysis("SingleIterationLoop")
4681  << "loop trip count is one, irrelevant for vectorization");
4682  LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n");
4683  return None;
4684  }
4685 
4686  // Record that scalar epilogue is not allowed.
4687  LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4688 
4689  IsScalarEpilogueAllowed = !OptForSize;
4690 
4691  // We don't create an epilogue when optimizing for size.
4692  // Invalidate interleave groups that require an epilogue if we can't mask
4693  // the interleave-group.
4694  if (!useMaskedInterleavedAccesses(TTI))
4695  InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4696 
4697  unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
4698 
4699  if (TC > 0 && TC % MaxVF == 0) {
4700  LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4701  return MaxVF;
4702  }
4703 
4704  // If we don't know the precise trip count, or if the trip count that we
4705  // found modulo the vectorization factor is not zero, try to fold the tail
4706  // by masking.
4707  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
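  // For example, with MaxVF = 8 and an unknown or non-multiple trip count,
  // folding the tail executes ceil(TC / 8) masked vector iterations, with the
  // lanes past TC disabled in the last one, instead of emitting the scalar
  // remainder loop that -Os/-Oz disallows here.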
4708  if (Legal->canFoldTailByMasking()) {
4709  FoldTailByMasking = true;
4710  return MaxVF;
4711  }
4712 
4713  if (TC == 0) {
4714  ORE->emit(
4715  createMissedAnalysis("UnknownLoopCountComplexCFG")
4716  << "unable to calculate the loop count due to complex control flow");
4717  return None;
4718  }
4719 
4720  ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
4721  << "cannot optimize for size and vectorize at the same time. "
4722  "Enable vectorization of this loop with '#pragma clang loop "
4723  "vectorize(enable)' when compiling with -Os/-Oz");
4724  return None;
4725 }
4726 
4727 unsigned
4728 LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
4729  unsigned ConstTripCount) {
4730  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4731  unsigned SmallestType, WidestType;
4732  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4733  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4734 
4735  // Get the maximum safe dependence distance in bits computed by LAA.
4736  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4737  // the memory access that is most restrictive (involved in the smallest
4738  // dependence distance).
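  // For example, a smallest dependence distance of 16 bytes between i32
  // accesses permits at most 4 lanes, giving MaxSafeRegisterWidth =
  // 4 * 4 * 8 = 128 bits even if the target has wider vector registers.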
4739  unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4740 
4741  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4742 
4743  unsigned MaxVectorSize = WidestRegister / WidestType;
4744 
4745  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4746  << " / " << WidestType << " bits.\n");
4747  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4748  << WidestRegister << " bits.\n");
4749 
4750  assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4751  " into one vector!");
4752  if (MaxVectorSize == 0) {
4753  LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4754  MaxVectorSize = 1;
4755  return MaxVectorSize;
4756  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4757  isPowerOf2_32(ConstTripCount)) {
4758  // We need to clamp the VF to be the ConstTripCount. There is no point in
4759  // choosing a higher viable VF as done in the loop below.
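  // For example, with 256-bit registers and i32 elements MaxVectorSize is 8,
  // but a known trip count of 4 means wider vectors would never be filled,
  // so the VF is clamped to 4.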
4760  LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4761  << ConstTripCount << "\n");
4762  MaxVectorSize = ConstTripCount;
4763  return MaxVectorSize;
4764  }
4765 
4766  unsigned MaxVF = MaxVectorSize;
4767  if (TTI.shouldMaximizeVectorBandwidth(OptForSize) ||
4768  (MaximizeBandwidth && !OptForSize)) {
4769  // Collect all viable vectorization factors larger than the default MaxVF
4770  // (i.e. MaxVectorSize).
4771  SmallVector<unsigned, 8> VFs;
4772  unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4773  for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4774  VFs.push_back(VS);
4775 
4776  // For each VF calculate its register usage.
4777  auto RUs = calculateRegisterUsage(VFs);
4778 
4779  // Select the largest VF which doesn't require more registers than existing
4780  // ones.
4781  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
4782  for (int i = RUs.size() - 1; i >= 0; --i) {
4783  if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
4784  MaxVF = VFs[i];
4785  break;
4786  }
4787  }
4788  if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
4789  if (MaxVF < MinVF) {
4790  LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4791  << ") with target's minimum: " << MinVF << '\n');
4792  MaxVF = MinVF;
4793  }
4794  }
4795  }
4796  return MaxVF;
4797 }
4798 
4799 VectorizationFactor
4800 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
4801  float Cost = expectedCost(1).first;
4802  const float ScalarCost = Cost;
4803  unsigned Width = 1;
4804  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
4805 
4806  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
4807  if (ForceVectorization && MaxVF > 1) {
4808  // Ignore scalar width, because the user explicitly wants vectorization.
4809  // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4810  // evaluation.
4811  Cost = std::numeric_limits<float>::max();
4812  }
4813 
4814  for (unsigned i = 2; i <= MaxVF; i *= 2) {
4815  // Notice that the vector loop needs to be executed fewer times, so
4816  // we need to divide the cost of the vector loop by the width of
4817  // the vector elements.
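  // For example, if the scalar body costs 8 and the VF = 4 body costs 20,
  // the per-original-iteration vector cost is 20 / 4 = 5, which beats the
  // scalar cost of 8, so width 4 becomes the current best choice.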
4818  VectorizationCostTy C = expectedCost(i);
4819  float VectorCost = C.first / (float)i;
4820  LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
4821  << " costs: " << (int)VectorCost << ".\n");
4822  if (!C.second && !ForceVectorization) {
4823  LLVM_DEBUG(
4824  dbgs() << "LV: Not considering vector loop of width " << i
4825  << " because it will not generate any vector instructions.\n");
4826  continue;
4827  }
4828  if (VectorCost < Cost) {
4829  Cost = VectorCost;
4830  Width = i;
4831  }
4832  }
4833 
4834  if (!EnableCondStoresVectorization && NumPredStores) {
4835  ORE->emit(createMissedAnalysis("ConditionalStore")
4836  << "store that is conditionally executed prevents vectorization");
4837  LLVM_DEBUG(
4838  dbgs() << "LV: No vectorization. There are conditional stores.\n");
4839  Width = 1;
4840  Cost = ScalarCost;
4841  }
4842 
4843  LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
4844  << "LV: Vectorization seems to be not beneficial, "
4845  << "but was forced by a user.\n");
4846  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
4847  VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
4848  return Factor;
4849 }
4850 
4851 std::pair<unsigned, unsigned>
4852 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4853  unsigned MinWidth = -1U;
4854  unsigned MaxWidth = 8;
4855  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
4856 
4857  // For each block.
4858  for (BasicBlock *BB : TheLoop->blocks()) {
4859  // For each instruction in the loop.
4860  for (Instruction &I : BB->instructionsWithoutDebug()) {
4861  Type *T = I.getType();
4862 
4863  // Skip ignored values.
4864  if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
4865  continue;
4866 
4867  // Only examine Loads, Stores and PHINodes.
4868  if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4869  continue;
4870 
4871  // Examine PHI nodes that are reduction variables. Update the type to
4872  // account for the recurrence type.
4873  if (auto *PN = dyn_cast<PHINode>(&I)) {
4874  if (!Legal->isReductionVariable(PN))
4875  continue;
4876  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
4877  T = RdxDesc.getRecurrenceType();
4878  }
4879 
4880  // Examine the stored values.
4881  if (auto *ST = dyn_cast<StoreInst>(&I))
4882  T = ST->getValueOperand()->getType();
4883 
4884  // Ignore loaded pointer types and stored pointer types that are not
4885  // vectorizable.
4886  //
4887  // FIXME: The check here attempts to predict whether a load or store will
4888  // be vectorized. We only know this for certain after a VF has
4889  // been selected. Here, we assume that if an access can be
4890  // vectorized, it will be. We should also look at extending this
4891  // optimization to non-pointer types.
4892  //
4893  if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
4894  !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
4895  continue;
4896 
4897  MinWidth = std::min(MinWidth,
4898  (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
4899  MaxWidth = std::max(MaxWidth,
4900  (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
4901  }
4902  }
4903 
4904  return {MinWidth, MaxWidth};
4905 }
4906 
4907 unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
4908  unsigned VF,
4909  unsigned LoopCost) {
4910  // -- The interleave heuristics --
4911  // We interleave the loop in order to expose ILP and reduce the loop overhead.
4912  // There are many micro-architectural considerations that we can't predict
4913  // at this level. For example, frontend pressure (on decode or fetch) due to
4914  // code size, or the number and capabilities of the execution ports.
4915  //
4916  // We use the following heuristics to select the interleave count:
4917  // 1. If the code has reductions, then we interleave to break the cross
4918  // iteration dependency.
4919  // 2. If the loop is really small, then we interleave to reduce the loop
4920  // overhead.
4921  // 3. We don't interleave if we think that we will spill registers to memory
4922  // due to the increased register pressure.
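  // For example (heuristic 1), a reduction such as
  //   for (i) sum += a[i];
  // carries a dependence through 'sum'; interleaving by 2 keeps two partial
  // sums that are combined after the loop, exposing more ILP than a single
  // accumulator chain.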
4923 
4924  // When we optimize for size, we don't interleave.
4925  if (OptForSize)
4926  return 1;
4927 
4928  // The VF is already limited by the maximum safe dependence distance; do not interleave.
4929  if (Legal->getMaxSafeDepDistBytes() != -1U)
4930  return 1;
4931 
4932  // Do not interleave loops with a relatively small trip count.
4933  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4934  if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
4935  return 1;
4936 
4937  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
4938  LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4939  << " registers\n");
4940 
4941  if (VF == 1) {
4942  if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4943  TargetNumRegisters = ForceTargetNumScalarRegs;
4944  } else {
4945  if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4946  TargetNumRegisters = ForceTargetNumVectorRegs;
4947  }
4948 
4949  RegisterUsage R = calculateRegisterUsage({VF})[0];
4950  // We divide by these constants so assume that we have at least one
4951  // instruction that uses at least one register.
4952  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
4953 
4954  // We calculate the interleave count using the following formula.
4955  // Subtract the number of loop invariants from the number of available
4956  // registers. These registers are used by all of the interleaved instances.
4957  // Next, divide the remaining registers by the number of registers that is