LoopVectorize.cpp
1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
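// As an illustrative sketch (C-like pseudocode, not taken from this file; the
// pass itself operates on LLVM IR), a loop such as
//
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + c[i];
//
// conceptually becomes, for a vectorization factor VF of 4:
//
//   for (i = 0; i + 3 < n; i += 4)        // wide iterations, index += VF
//     a[i:i+3] = b[i:i+3] + c[i:i+3];     // one SIMD operation per statement
//   for (; i < n; i++)                    // scalar epilogue for the remainder
//     a[i] = b[i] + c[i];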
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 // of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 // widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 // of vectorization. It decides on the optimal vector width, which
26 // can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46 // Data for SIMD.
47 //
48 // Other ideas/concepts are from:
49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52 // Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanHCFGTransforms.h"
62 #include "VPlanPredicator.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
82 #include "llvm/Analysis/CFG.h"
88 #include "llvm/Analysis/LoopInfo.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/MathExtras.h"
141 #include <algorithm>
142 #include <cassert>
143 #include <cstdint>
144 #include <cstdlib>
145 #include <functional>
146 #include <iterator>
147 #include <limits>
148 #include <memory>
149 #include <string>
150 #include <tuple>
151 #include <utility>
152 #include <vector>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 /// @{
160 /// Metadata attribute names
161 static const char *const LLVMLoopVectorizeFollowupAll =
162  "llvm.loop.vectorize.followup_all";
163 static const char *const LLVMLoopVectorizeFollowupVectorized =
164  "llvm.loop.vectorize.followup_vectorized";
165 static const char *const LLVMLoopVectorizeFollowupEpilogue =
166  "llvm.loop.vectorize.followup_epilogue";
167 /// @}
168 
169 STATISTIC(LoopsVectorized, "Number of loops vectorized");
170 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171 
172 /// Loops with a known constant trip count below this number are vectorized only
173 /// if no scalar iteration overheads are incurred.
175  "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
176  cl::desc("Loops with a constant trip count that is smaller than this "
177  "value are vectorized only if no scalar iteration overheads "
178  "are incurred."));
179 
180 // Indicates that an epilogue is undesired and predication is preferred.
181 // This means that the vectorizer will try to fold the loop-tail (epilogue)
182 // into the loop and predicate the loop body accordingly.
184  "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
185  cl::desc("Indicate that an epilogue is undesired, predication should be "
186  "used instead."));
187 
189  "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
190  cl::desc("Maximize bandwidth when selecting vectorization factor which "
191  "will be determined by the smallest type in loop."));
192 
194  "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
195  cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
196 
197 /// An interleave-group may need masking if it resides in a block that needs
198 /// predication, or in order to mask away gaps.
200  "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
201  cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
202 
203 /// We don't interleave loops with a known constant trip count below this
204 /// number.
205 static const unsigned TinyTripCountInterleaveThreshold = 128;
206 
208  "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
209  cl::desc("A flag that overrides the target's number of scalar registers."));
210 
212  "force-target-num-vector-regs", cl::init(0), cl::Hidden,
213  cl::desc("A flag that overrides the target's number of vector registers."));
214 
216  "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
217  cl::desc("A flag that overrides the target's max interleave factor for "
218  "scalar loops."));
219 
221  "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
222  cl::desc("A flag that overrides the target's max interleave factor for "
223  "vectorized loops."));
224 
226  "force-target-instruction-cost", cl::init(0), cl::Hidden,
227  cl::desc("A flag that overrides the target's expected cost for "
228  "an instruction to a single constant value. Mostly "
229  "useful for getting consistent testing."));
230 
232  "small-loop-cost", cl::init(20), cl::Hidden,
233  cl::desc(
234  "The cost of a loop that is considered 'small' by the interleaver."));
235 
237  "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
238  cl::desc("Enable the use of the block frequency analysis to access PGO "
239  "heuristics minimizing code growth in cold regions and being more "
240  "aggressive in hot regions."));
241 
242 // Runtime interleave loops for load/store throughput.
244  "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
245  cl::desc(
246  "Enable runtime interleaving until load/store ports are saturated"));
247 
248 /// The number of stores in a loop that are allowed to need predication.
250  "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
251  cl::desc("Max number of stores to be predicated behind an if."));
252 
254  "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
255  cl::desc("Count the induction variable only once when interleaving"));
256 
258  "enable-cond-stores-vec", cl::init(true), cl::Hidden,
259  cl::desc("Enable if predication of stores during vectorization."));
260 
262  "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
263  cl::desc("The maximum interleave count to use when interleaving a scalar "
264  "reduction in a nested loop."));
265 
267  "enable-vplan-native-path", cl::init(false), cl::Hidden,
268  cl::desc("Enable VPlan-native vectorization path with "
269  "support for outer loop vectorization."));
270 
271 // FIXME: Remove this switch once we have divergence analysis. Currently we
272 // assume divergent non-backedge branches when this switch is true.
274  "enable-vplan-predication", cl::init(false), cl::Hidden,
275  cl::desc("Enable VPlan-native vectorization path predicator with "
276  "support for outer loop vectorization."));
277 
278 // This flag enables the stress testing of the VPlan H-CFG construction in the
279 // VPlan-native vectorization path. It must be used in conjunction with
280 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
281 // verification of the H-CFGs built.
283  "vplan-build-stress-test", cl::init(false), cl::Hidden,
284  cl::desc(
285  "Build VPlan for every supported loop nest in the function and bail "
286  "out right after the build (stress test the VPlan H-CFG construction "
287  "in the VPlan-native vectorization path)."));
288 
290  "interleave-loops", cl::init(true), cl::Hidden,
291  cl::desc("Enable loop interleaving in Loop vectorization passes"));
293  "vectorize-loops", cl::init(true), cl::Hidden,
294  cl::desc("Run the Loop vectorization passes"));
295 
296 /// A helper function for converting Scalar types to vector types.
297 /// If the incoming type is void, we return void. If the VF is 1, we return
298 /// the scalar type.
299 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
300  if (Scalar->isVoidTy() || VF == 1)
301  return Scalar;
302  return VectorType::get(Scalar, VF);
303 }
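// A few illustrative examples of the mapping performed by ToVectorTy, written
// in IR type notation:
//   ToVectorTy(i32,   4) --> <4 x i32>
//   ToVectorTy(float, 8) --> <8 x float>
//   ToVectorTy(i64,   1) --> i64     (VF == 1 yields the scalar type)
//   ToVectorTy(void,  4) --> void    (void is returned unchanged)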
304 
305 /// A helper function that returns the type of loaded or stored value.
307  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
308  "Expected Load or Store instruction");
309  if (auto *LI = dyn_cast<LoadInst>(I))
310  return LI->getType();
311  return cast<StoreInst>(I)->getValueOperand()->getType();
312 }
313 
314 /// A helper function that returns true if the given type is irregular. The
315 /// type is irregular if its allocated size doesn't equal the store size of an
316 /// element of the corresponding vector type at the given vectorization factor.
317 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
318  // Determine if an array of VF elements of type Ty is "bitcast compatible"
319  // with a <VF x Ty> vector.
320  if (VF > 1) {
321  auto *VectorTy = VectorType::get(Ty, VF);
322  return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
323  }
324 
325  // If the vectorization factor is one, we just check if an array of type Ty
326  // requires padding between elements.
327  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
328 }
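// Illustrative examples, assuming a typical DataLayout:
//   hasIrregularType(i64, DL, 4) is false: 4 * 8 bytes equals the 32-byte
//     store size of <4 x i64>, so an array of 4 i64s is bitcast compatible
//     with the vector type.
//   hasIrregularType(i1, DL, 8) is true: 8 * 1 byte (the alloc size of i1)
//     does not equal the 1-byte store size of <8 x i1>, so a widened access
//     cannot simply reinterpret the scalar layout as a vector.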
329 
330 /// A helper function that returns the reciprocal of the block probability of
331 /// predicated blocks. If we return X, we are assuming the predicated block
332 /// will execute once for every X iterations of the loop header.
333 ///
334 /// TODO: We should use actual block probability here, if available. Currently,
335 /// we always assume predicated blocks have a 50% chance of executing.
336 static unsigned getReciprocalPredBlockProb() { return 2; }
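// A sketch of how the value is typically applied by the cost model in this
// file (illustrative only):
//   unsigned ScalarCost = ...;                  // cost if the block always ran
//   ScalarCost /= getReciprocalPredBlockProb(); // discount: executes ~1 in 2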
337 
338 /// A helper function that adds a 'fast' flag to floating-point operations.
340  if (isa<FPMathOperator>(V))
341  cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
342  return V;
343 }
344 
346  if (isa<FPMathOperator>(V))
347  cast<Instruction>(V)->setFastMathFlags(FMF);
348  return V;
349 }
350 
351 /// A helper function that returns an integer or floating-point constant with
352 /// value C.
353 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
354  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
355  : ConstantFP::get(Ty, C);
356 }
357 
358 namespace llvm {
359 
360 /// InnerLoopVectorizer vectorizes loops which contain only one basic
361 /// block to a specified vectorization factor (VF).
362 /// This class performs the widening of scalars into vectors, or multiple
363 /// scalars. This class also implements the following features:
364 /// * It inserts an epilogue loop for handling loops that don't have iteration
365 /// counts that are known to be a multiple of the vectorization factor.
366 /// * It handles the code generation for reduction variables.
367 /// * Scalarization (implementation using scalars) of un-vectorizable
368 /// instructions.
369 /// InnerLoopVectorizer does not perform any vectorization-legality
370 /// checks, and relies on the caller to check for the different legality
371 /// aspects. The InnerLoopVectorizer relies on the
372 /// LoopVectorizationLegality class to provide information about the induction
373 /// and reduction variables that were found to a given vectorization factor.
375 public:
378  const TargetLibraryInfo *TLI,
380  OptimizationRemarkEmitter *ORE, unsigned VecWidth,
381  unsigned UnrollFactor, LoopVectorizationLegality *LVL,
383  : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
384  AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
385  Builder(PSE.getSE()->getContext()),
386  VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
387  virtual ~InnerLoopVectorizer() = default;
388 
389  /// Create a new empty loop. Unlink the old loop and connect the new one.
390  /// Return the pre-header block of the new loop.
392 
393  /// Widen a single instruction within the innermost loop.
395 
396  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
397  void fixVectorizedLoop();
398 
399  // Return true if any runtime check is added.
401 
402  /// A type for vectorized values in the new loop. Each value from the
403  /// original loop, when vectorized, is represented by UF vector values in the
404  /// new unrolled loop, where UF is the unroll factor.
406 
407  /// Vectorize a single PHINode in a block. This method handles the induction
408  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
409  /// arbitrary length vectors.
410  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
411 
412  /// A helper function to scalarize a single Instruction in the innermost loop.
413  /// Generates a sequence of scalar instances for each lane between \p MinLane
414  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
415  /// inclusive.
416  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
417  bool IfPredicateInstr);
418 
419  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
420  /// is provided, the integer induction variable will first be truncated to
421  /// the corresponding type.
422  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
423 
424  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
425  /// vector or scalar value on-demand if one is not yet available. When
426  /// vectorizing a loop, we visit the definition of an instruction before its
427  /// uses. When visiting the definition, we either vectorize or scalarize the
428  /// instruction, creating an entry for it in the corresponding map. (In some
429  /// cases, such as induction variables, we will create both vector and scalar
430  /// entries.) Then, as we encounter uses of the definition, we derive values
431  /// for each scalar or vector use unless such a value is already available.
432  /// For example, if we scalarize a definition and one of its uses is vector,
433  /// we build the required vector on-demand with an insertelement sequence
434  /// when visiting the use. Otherwise, if the use is scalar, we can use the
435  /// existing scalar definition.
436  ///
437  /// Return a value in the new loop corresponding to \p V from the original
438  /// loop at unroll index \p Part. If the value has already been vectorized,
439  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
440  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
441  /// a new vector value on-demand by inserting the scalar values into a vector
442  /// with an insertelement sequence. If the value has been neither vectorized
443  /// nor scalarized, it must be loop invariant, so we simply broadcast the
444  /// value into a vector.
445  Value *getOrCreateVectorValue(Value *V, unsigned Part);
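  // Illustrative example: if %x was scalarized into %x.0 ... %x.3 (VF = 4) and
  // a later user needs it as a vector, the on-demand packing emits roughly:
  //   %v.0 = insertelement <4 x i32> undef, i32 %x.0, i32 0
  //   %v.1 = insertelement <4 x i32> %v.0,  i32 %x.1, i32 1
  //   ... and so on for the remaining lanes.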
446 
447  /// Return a value in the new loop corresponding to \p V from the original
448  /// loop at unroll and vector indices \p Instance. If the value has been
449  /// vectorized but not scalarized, the necessary extractelement instruction
450  /// will be generated.
451  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
452 
453  /// Construct the vector value of a scalarized value \p V one lane at a time.
454  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
455 
456  /// Try to vectorize the interleaved access group that \p Instr belongs to,
457  /// optionally masking the vector operations if \p BlockInMask is non-null.
459  VectorParts *BlockInMask = nullptr);
460 
461  /// Vectorize Load and Store instructions, optionally masking the vector
462  /// operations if \p BlockInMask is non-null.
464  VectorParts *BlockInMask = nullptr);
465 
466  /// Set the debug location in the builder using the debug location in
467  /// the instruction.
468  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
469 
470  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
471  void fixNonInductionPHIs(void);
472 
473 protected:
475 
476  /// A small list of PHINodes.
478 
479  /// A type for scalarized values in the new loop. Each value from the
480  /// original loop, when scalarized, is represented by UF x VF scalar values
481  /// in the new unrolled loop, where UF is the unroll factor and VF is the
482  /// vectorization factor.
484 
485  /// Set up the values of the IVs correctly when exiting the vector loop.
486  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
487  Value *CountRoundDown, Value *EndValue,
488  BasicBlock *MiddleBlock);
489 
490  /// Create a new induction variable inside L.
491  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
492  Value *Step, Instruction *DL);
493 
494  /// Handle all cross-iteration phis in the header.
495  void fixCrossIterationPHIs();
496 
497  /// Fix a first-order recurrence. This is the second phase of vectorizing
498  /// this phi node.
499  void fixFirstOrderRecurrence(PHINode *Phi);
500 
501  /// Fix a reduction cross-iteration phi. This is the second phase of
502  /// vectorizing this phi node.
503  void fixReduction(PHINode *Phi);
504 
505  /// The loop exit block may have single-value PHI nodes with some
506  /// incoming value. While vectorizing we only handle real values
507  /// that were defined inside the loop, so we should have one value for
508  /// each predecessor of its parent basic block. See PR14725.
509  void fixLCSSAPHIs();
510 
511  /// Iteratively sink the scalarized operands of a predicated instruction into
512  /// the block that was created for it.
513  void sinkScalarOperands(Instruction *PredInst);
514 
515  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
516  /// represented as.
518 
519  /// Insert the new loop into the loop hierarchy and pass manager
520  /// and update the analysis passes.
521  void updateAnalysis();
522 
523  /// Create a broadcast instruction. This method generates a broadcast
524  /// instruction (shuffle) for loop invariant values and for the induction
525  /// value. If this is the induction variable then we extend it to N, N+1, ...
526  /// This is needed because each iteration in the loop corresponds to a SIMD
527  /// element.
528  virtual Value *getBroadcastInstrs(Value *V);
529 
530  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
531  /// to each vector element of Val. The sequence starts at StartIdx.
532  /// \p Opcode is relevant for FP induction variables.
533  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
534  Instruction::BinaryOps Opcode =
535  Instruction::BinaryOpsEnd);
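  // Illustrative example: with Val = <i32 2, i32 2, i32 2, i32 2>,
  // StartIdx = 0 and Step = 1, the result is <i32 2, i32 3, i32 4, i32 5>,
  // i.e. the sequence (0, 1, 2, 3) added element-wise to Val.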
536 
537  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
538  /// variable on which to base the steps, \p Step is the size of the step, and
539  /// \p EntryVal is the value from the original loop that maps to the steps.
540  /// Note that \p EntryVal doesn't have to be an induction variable - it
541  /// can also be a truncate instruction.
542  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
543  const InductionDescriptor &ID);
544 
545  /// Create a vector induction phi node based on an existing scalar one. \p
546  /// EntryVal is the value from the original loop that maps to the vector phi
547  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
548  /// truncate instruction, instead of widening the original IV, we widen a
549  /// version of the IV truncated to \p EntryVal's type.
551  Value *Step, Instruction *EntryVal);
552 
553  /// Returns true if an instruction \p I should be scalarized instead of
554  /// vectorized for the chosen vectorization factor.
556 
557  /// Returns true if we should generate a scalar version of \p IV.
558  bool needsScalarInduction(Instruction *IV) const;
559 
560  /// If there is a cast involved in the induction variable \p ID, which should
561  /// be ignored in the vectorized loop body, this function records the
562  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
563  /// cast. We had already proved that the casted Phi is equal to the uncasted
564  /// Phi in the vectorized loop (under a runtime guard), and therefore
565  /// there is no need to vectorize the cast - the same value can be used in the
566  /// vector loop for both the Phi and the cast.
567  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
568  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
569  ///
570  /// \p EntryVal is the value from the original loop that maps to the vector
571  /// phi node and is used to distinguish what is the IV currently being
572  /// processed - original one (if \p EntryVal is a phi corresponding to the
573  /// original IV) or the "newly-created" one based on the proof mentioned above
574  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
575  /// latter case \p EntryVal is a TruncInst and we must not record anything for
576  /// that IV, but it's error-prone to expect callers of this routine to care
577  /// about that, hence this explicit parameter.
579  const Instruction *EntryVal,
580  Value *VectorLoopValue,
581  unsigned Part,
582  unsigned Lane = UINT_MAX);
583 
584  /// Generate a shuffle sequence that will reverse the vector Vec.
585  virtual Value *reverseVector(Value *Vec);
586 
587  /// Returns (and creates if needed) the original loop trip count.
588  Value *getOrCreateTripCount(Loop *NewLoop);
589 
590  /// Returns (and creates if needed) the trip count of the widened loop.
592 
593  /// Returns a bitcasted value to the requested vector type.
594  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
596  const DataLayout &DL);
597 
598  /// Emit a bypass check to see if the vector trip count is zero, including if
599  /// it overflows.
601 
602  /// Emit a bypass check to see if all of the SCEV assumptions we've
603  /// had to make are correct.
604  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
605 
606  /// Emit bypass checks to check any memory assumptions we may have made.
607  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
608 
609  /// Compute the transformed value of Index at offset StartValue using step
610  /// StepValue.
611  /// For integer induction, returns StartValue + Index * StepValue.
612  /// For pointer induction, returns StartValue[Index * StepValue].
613  /// FIXME: The newly created binary instructions should contain nsw/nuw
614  /// flags, which can be found from the original scalar operations.
616  const DataLayout &DL,
617  const InductionDescriptor &ID) const;
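  // Worked example (illustrative): for an integer induction with
  // StartValue = 10 and StepValue = 3, Index = 4 is transformed to
  // 10 + 4 * 3 = 22; for a pointer induction with the same values the result
  // is &StartValue[4 * 3], i.e. a GEP 12 elements past StartValue.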
618 
619  /// Add additional metadata to \p To that was not present on \p Orig.
620  ///
621  /// Currently this is used to add the noalias annotations based on the
622  /// inserted memchecks. Use this for instructions that are *cloned* into the
623  /// vector loop.
624  void addNewMetadata(Instruction *To, const Instruction *Orig);
625 
626  /// Add metadata from one instruction to another.
627  ///
628  /// This includes both the original MDs from \p From and additional ones (\see
629  /// addNewMetadata). Use this for *newly created* instructions in the vector
630  /// loop.
632 
633  /// Similar to the previous function but it adds the metadata to a
634  /// vector of instructions.
636 
637  /// The original loop.
639 
640  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
641  /// dynamic knowledge to simplify SCEV expressions and converts them to a
642  /// more usable form.
644 
645  /// Loop Info.
647 
648  /// Dominator Tree.
650 
651  /// Alias Analysis.
653 
654  /// Target Library Info.
656 
657  /// Target Transform Info.
659 
660  /// Assumption Cache.
662 
663  /// Interface to emit optimization remarks.
665 
666  /// LoopVersioning. It's only set up (non-null) if memchecks were
667  /// used.
668  ///
669  /// This is currently only used to add no-alias metadata based on the
670  /// memchecks. The actual versioning is performed manually.
671  std::unique_ptr<LoopVersioning> LVer;
672 
673  /// The vectorization SIMD factor to use. Each vector will have this many
674  /// vector elements.
675  unsigned VF;
676 
677  /// The vectorization unroll factor to use. Each scalar is vectorized to this
678  /// many different vector instructions.
679  unsigned UF;
680 
681  /// The builder that we use
683 
684  // --- Vectorization state ---
685 
686  /// The vector-loop preheader.
688 
689  /// The scalar-loop preheader.
691 
694  /// The middle block between the vector and the scalar loops.
694 
695  /// The ExitBlock of the scalar loop.
697 
698  /// The vector loop body.
700 
701  /// The scalar loop body.
703 
704  /// A list of all bypass blocks. The first block is the entry of the loop.
706 
707  /// The new Induction variable which was added to the new block.
708  PHINode *Induction = nullptr;
709 
710  /// The induction variable of the old basic block.
711  PHINode *OldInduction = nullptr;
712 
713  /// Maps values from the original loop to their corresponding values in the
714  /// vectorized loop. A key value can map to either vector values, scalar
715  /// values or both kinds of values, depending on whether the key was
716  /// vectorized and scalarized.
718 
719  /// Store instructions that were predicated.
721 
722  /// Trip count of the original loop.
723  Value *TripCount = nullptr;
724 
725  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
726  Value *VectorTripCount = nullptr;
727 
728  /// The legality analysis.
730 
731  /// The profitability analysis.
733 
734  // Record whether runtime checks are added.
735  bool AddedSafetyChecks = false;
736 
737  // Holds the end values for each induction variable. We save the end values
738  // so we can later fix-up the external users of the induction variables.
740 
741  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
742  // fixed up at the end of vector code generation.
744 };
745 
747 public:
750  const TargetLibraryInfo *TLI,
752  OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
755  : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
756  UnrollFactor, LVL, CM) {}
757 
758 private:
759  Value *getBroadcastInstrs(Value *V) override;
760  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
761  Instruction::BinaryOps Opcode =
762  Instruction::BinaryOpsEnd) override;
763  Value *reverseVector(Value *Vec) override;
764 };
765 
766 } // end namespace llvm
767 
768 /// Look for a meaningful debug location on the instruction or its
769 /// operands.
771  if (!I)
772  return I;
773 
774  DebugLoc Empty;
775  if (I->getDebugLoc() != Empty)
776  return I;
777 
778  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
779  if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
780  if (OpInst->getDebugLoc() != Empty)
781  return OpInst;
782  }
783 
784  return I;
785 }
786 
788  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
789  const DILocation *DIL = Inst->getDebugLoc();
790  if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
791  !isa<DbgInfoIntrinsic>(Inst)) {
792  auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
793  if (NewDIL)
794  B.SetCurrentDebugLocation(NewDIL.getValue());
795  else
796  LLVM_DEBUG(dbgs()
797  << "Failed to create new discriminator: "
798  << DIL->getFilename() << " Line: " << DIL->getLine());
799  }
800  else
801  B.SetCurrentDebugLocation(DIL);
802  } else
803  B.SetCurrentDebugLocation(DebugLoc());
804 }
805 
806 /// Write a record \p DebugMsg about vectorization failure to the debug
807 /// output stream. If \p I is passed, it is an instruction that prevents
808 /// vectorization.
809 #ifndef NDEBUG
810 static void debugVectorizationFailure(const StringRef DebugMsg,
811  Instruction *I) {
812  dbgs() << "LV: Not vectorizing: " << DebugMsg;
813  if (I != nullptr)
814  dbgs() << " " << *I;
815  else
816  dbgs() << '.';
817  dbgs() << '\n';
818 }
819 #endif
820 
821 /// Create an analysis remark that explains why vectorization failed
822 ///
823 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
824 /// RemarkName is the identifier for the remark. If \p I is passed it is an
825 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for
826 /// the location of the remark. \return the remark object that can be
827 /// streamed to.
828 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
829  StringRef RemarkName, Loop *TheLoop, Instruction *I) {
830  Value *CodeRegion = TheLoop->getHeader();
831  DebugLoc DL = TheLoop->getStartLoc();
832 
833  if (I) {
834  CodeRegion = I->getParent();
835  // If there is no debug location attached to the instruction, fall back to
836  // using the loop's location.
837  if (I->getDebugLoc())
838  DL = I->getDebugLoc();
839  }
840 
841  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
842  R << "loop not vectorized: ";
843  return R;
844 }
845 
846 namespace llvm {
847 
849  const StringRef OREMsg, const StringRef ORETag,
852  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
854  ORETag, TheLoop, I) << OREMsg);
855 }
856 
857 } // end namespace llvm
858 
859 #ifndef NDEBUG
860 /// \return string containing a file name and a line # for the given loop.
861 static std::string getDebugLocString(const Loop *L) {
862  std::string Result;
863  if (L) {
864  raw_string_ostream OS(Result);
865  if (const DebugLoc LoopDbgLoc = L->getStartLoc())
866  LoopDbgLoc.print(OS);
867  else
868  // Just print the module name.
869  OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
870  OS.flush();
871  }
872  return Result;
873 }
874 #endif
875 
877  const Instruction *Orig) {
878  // If the loop was versioned with memchecks, add the corresponding no-alias
879  // metadata.
880  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
881  LVer->annotateInstWithNoAlias(To, Orig);
882 }
883 
885  Instruction *From) {
886  propagateMetadata(To, From);
887  addNewMetadata(To, From);
888 }
889 
891  Instruction *From) {
892  for (Value *V : To) {
893  if (Instruction *I = dyn_cast<Instruction>(V))
894  addMetadata(I, From);
895  }
896 }
897 
898 namespace llvm {
899 
900 // Loop vectorization cost-model hints how the scalar epilogue loop should be
901 // lowered.
903 
904  // The default: allowing scalar epilogues.
906 
907  // Vectorization with OptForSize: don't allow epilogues.
909 
910  // A special case of vectorization with OptForSize: loops with a very small
911  // trip count are considered for vectorization under OptForSize, thereby
912  // making sure the cost of their loop body is dominant, free of runtime
913  // guards and scalar iteration overheads.
915 
916  // Loop hint predicate indicating an epilogue is undesired.
918 };
919 
920 /// LoopVectorizationCostModel - estimates the expected speedups due to
921 /// vectorization.
922  /// In many cases vectorization is not profitable. This can happen for
923 /// a number of reasons. In this class we mainly attempt to predict the
924 /// expected speedup/slowdowns due to the supported instruction set. We use the
925 /// TargetTransformInfo to query the different backends for the cost of
926 /// different operations.
928 public:
932  const TargetTransformInfo &TTI,
933  const TargetLibraryInfo *TLI, DemandedBits *DB,
936  const LoopVectorizeHints *Hints,
938  : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
939  TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
940  Hints(Hints), InterleaveInfo(IAI) {}
941 
942  /// \return An upper bound for the vectorization factor, or None if
943  /// vectorization and interleaving should be avoided up front.
944  Optional<unsigned> computeMaxVF();
945 
946  /// \return True if runtime checks are required for vectorization, and false
947  /// otherwise.
948  bool runtimeChecksRequired();
949 
950  /// \return The most profitable vectorization factor and the cost of that VF.
951  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
952  /// then this vectorization factor will be selected if vectorization is
953  /// possible.
954  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
955 
956  /// Setup cost-based decisions for user vectorization factor.
957  void selectUserVectorizationFactor(unsigned UserVF) {
958  collectUniformsAndScalars(UserVF);
959  collectInstsToScalarize(UserVF);
960  }
961 
962  /// \return The size (in bits) of the smallest and widest types in the code
963  /// that needs to be vectorized. We ignore values that remain scalar such as
964  /// 64 bit loop indices.
965  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
966 
967  /// \return The desired interleave count.
968  /// If interleave count has been specified by metadata it will be returned.
969  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
970  /// are the selected vectorization factor and the cost of the selected VF.
971  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
972 
973  /// A memory access instruction may be vectorized in more than one way.
974  /// The form of the instruction after vectorization depends on cost.
975  /// This function takes cost-based decisions for Load/Store instructions
976  /// and collects them in a map. This decision map is used for building
977  /// the lists of loop-uniform and loop-scalar instructions.
978  /// The calculated cost is saved with the widening decision in order to
979  /// avoid redundant calculations.
980  void setCostBasedWideningDecision(unsigned VF);
981 
982  /// A struct that represents some properties of the register usage
983  /// of a loop.
984  struct RegisterUsage {
985  /// Holds the number of loop invariant values that are used in the loop.
987 
988  /// Holds the maximum number of concurrent live intervals in the loop.
989  unsigned MaxLocalUsers;
990  };
991 
992  /// \return Returns information about the register usages of the loop for the
993  /// given vectorization factors.
994  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
995 
996  /// Collect values we want to ignore in the cost model.
997  void collectValuesToIgnore();
998 
999  /// \returns The smallest bitwidth each instruction can be represented with.
1000  /// The vector equivalents of these instructions should be truncated to this
1001  /// type.
1003  return MinBWs;
1004  }
1005 
1006  /// \returns True if it is more profitable to scalarize instruction \p I for
1007  /// vectorization factor \p VF.
1008  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1009  assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1010 
1011  // Cost model is not run in the VPlan-native path - return conservative
1012  // result until this changes.
1014  return false;
1015 
1016  auto Scalars = InstsToScalarize.find(VF);
1017  assert(Scalars != InstsToScalarize.end() &&
1018  "VF not yet analyzed for scalarization profitability");
1019  return Scalars->second.find(I) != Scalars->second.end();
1020  }
1021 
1022  /// Returns true if \p I is known to be uniform after vectorization.
1023  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1024  if (VF == 1)
1025  return true;
1026 
1027  // Cost model is not run in the VPlan-native path - return conservative
1028  // result until this changes.
1030  return false;
1031 
1032  auto UniformsPerVF = Uniforms.find(VF);
1033  assert(UniformsPerVF != Uniforms.end() &&
1034  "VF not yet analyzed for uniformity");
1035  return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1036  }
1037 
1038  /// Returns true if \p I is known to be scalar after vectorization.
1039  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1040  if (VF == 1)
1041  return true;
1042 
1043  // Cost model is not run in the VPlan-native path - return conservative
1044  // result until this changes.
1046  return false;
1047 
1048  auto ScalarsPerVF = Scalars.find(VF);
1049  assert(ScalarsPerVF != Scalars.end() &&
1050  "Scalar values are not calculated for VF");
1051  return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1052  }
1053 
1054  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1055  /// for vectorization factor \p VF.
1056  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1057  return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1058  !isProfitableToScalarize(I, VF) &&
1059  !isScalarAfterVectorization(I, VF);
1060  }
1061 
1062  /// Decision that was taken during cost calculation for memory instruction.
1065  CM_Widen, // For consecutive accesses with stride +1.
1066  CM_Widen_Reverse, // For consecutive accesses with stride -1.
1069  CM_Scalarize
1070  };
1071 
1072  /// Save vectorization decision \p W and \p Cost taken by the cost model for
1073  /// instruction \p I and vector width \p VF.
1075  unsigned Cost) {
1076  assert(VF >= 2 && "Expected VF >=2");
1077  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1078  }
1079 
1080  /// Save vectorization decision \p W and \p Cost taken by the cost model for
1081  /// interleaving group \p Grp and vector width \p VF.
1083  InstWidening W, unsigned Cost) {
1084  assert(VF >= 2 && "Expected VF >=2");
1085  /// Broadcast this decision to all instructions inside the group.
1086  /// But the cost will be assigned to one instruction only.
1087  for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1088  if (auto *I = Grp->getMember(i)) {
1089  if (Grp->getInsertPos() == I)
1090  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1091  else
1092  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1093  }
1094  }
1095  }
1096 
1097  /// Return the cost model decision for the given instruction \p I and vector
1098  /// width \p VF. Return CM_Unknown if this instruction did not pass
1099  /// through the cost modeling.
1101  assert(VF >= 2 && "Expected VF >=2");
1102 
1103  // Cost model is not run in the VPlan-native path - return conservative
1104  // result until this changes.
1106  return CM_GatherScatter;
1107 
1108  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1109  auto Itr = WideningDecisions.find(InstOnVF);
1110  if (Itr == WideningDecisions.end())
1111  return CM_Unknown;
1112  return Itr->second.first;
1113  }
1114 
1115  /// Return the vectorization cost for the given instruction \p I and vector
1116  /// width \p VF.
1117  unsigned getWideningCost(Instruction *I, unsigned VF) {
1118  assert(VF >= 2 && "Expected VF >=2");
1119  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1120  assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1121  "The cost is not calculated");
1122  return WideningDecisions[InstOnVF].second;
1123  }
1124 
1125  /// Return True if instruction \p I is an optimizable truncate whose operand
1126  /// is an induction variable. Such a truncate will be removed by adding a new
1127  /// induction variable with the destination type.
1128  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1129  // If the instruction is not a truncate, return false.
1130  auto *Trunc = dyn_cast<TruncInst>(I);
1131  if (!Trunc)
1132  return false;
1133 
1134  // Get the source and destination types of the truncate.
1135  Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1136  Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1137 
1138  // If the truncate is free for the given types, return false. Replacing a
1139  // free truncate with an induction variable would add an induction variable
1140  // update instruction to each iteration of the loop. We exclude from this
1141  // check the primary induction variable since it will need an update
1142  // instruction regardless.
1143  Value *Op = Trunc->getOperand(0);
1144  if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1145  return false;
1146 
1147  // If the truncated value is not an induction variable, return false.
1148  return Legal->isInductionPhi(Op);
1149  }
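  // Illustrative example (C-like pseudocode): in
  //   for (int64_t i = 0; i < n; ++i)
  //     a[i] = (int32_t)i;
  // the truncate of the 64-bit induction variable can be removed by creating
  // a new 32-bit induction variable that is stepped in lock-step, provided
  // the truncate is not already free for the target.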
1150 
1151  /// Collects the instructions to scalarize for each predicated instruction in
1152  /// the loop.
1153  void collectInstsToScalarize(unsigned VF);
1154 
1155  /// Collect Uniform and Scalar values for the given \p VF.
1156  /// The sets depend on CM decision for Load/Store instructions
1157  /// that may be vectorized as interleave, gather-scatter or scalarized.
1158  void collectUniformsAndScalars(unsigned VF) {
1159  // Do the analysis once.
1160  if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1161  return;
1162  setCostBasedWideningDecision(VF);
1163  collectLoopUniforms(VF);
1164  collectLoopScalars(VF);
1165  }
1166 
1167  /// Returns true if the target machine supports masked store operation
1168  /// for the given \p DataType and kind of access to \p Ptr.
1170  return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
1171  }
1172 
1173  /// Returns true if the target machine supports masked load operation
1174  /// for the given \p DataType and kind of access to \p Ptr.
1176  return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
1177  }
1178 
1179  /// Returns true if the target machine supports masked scatter operation
1180  /// for the given \p DataType.
1182  return TTI.isLegalMaskedScatter(DataType);
1183  }
1184 
1185  /// Returns true if the target machine supports masked gather operation
1186  /// for the given \p DataType.
1188  return TTI.isLegalMaskedGather(DataType);
1189  }
1190 
1191  /// Returns true if the target machine can represent \p V as a masked gather
1192  /// or scatter operation.
1194  bool LI = isa<LoadInst>(V);
1195  bool SI = isa<StoreInst>(V);
1196  if (!LI && !SI)
1197  return false;
1198  auto *Ty = getMemInstValueType(V);
1199  return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1200  }
1201 
1202  /// Returns true if \p I is an instruction that will be scalarized with
1203  /// predication. Such instructions include conditional stores and
1204  /// instructions that may divide by zero.
1205  /// If a non-zero VF has been calculated, we check if I will be scalarized
1206  /// with predication for that VF.
1207  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1208 
1209  // Returns true if \p I is an instruction that will be predicated either
1210  // through scalar predication or masked load/store or masked gather/scatter.
1211  // Superset of instructions that return true for isScalarWithPredication.
1213  if (!blockNeedsPredication(I->getParent()))
1214  return false;
1215  // Loads and stores that need some form of masked operation are predicated
1216  // instructions.
1217  if (isa<LoadInst>(I) || isa<StoreInst>(I))
1218  return Legal->isMaskRequired(I);
1219  return isScalarWithPredication(I);
1220  }
1221 
1222  /// Returns true if \p I is a memory instruction with consecutive memory
1223  /// access that can be widened.
1224  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1225 
1226  /// Returns true if \p I is a memory instruction in an interleaved-group
1227  /// of memory accesses that can be vectorized with wide vector loads/stores
1228  /// and shuffles.
1229  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1230 
1231  /// Check if \p Instr belongs to any interleaved access group.
1233  return InterleaveInfo.isInterleaved(Instr);
1234  }
1235 
1236  /// Get the interleaved access group that \p Instr belongs to.
1239  return InterleaveInfo.getInterleaveGroup(Instr);
1240  }
1241 
1242  /// Returns true if an interleaved group requires a scalar iteration
1243  /// to handle accesses with gaps, and there is nothing preventing us from
1244  /// creating a scalar epilogue.
1245  bool requiresScalarEpilogue() const {
1246  return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1247  }
1248 
1249  /// Returns true if a scalar epilogue is allowed, i.e. not prohibited by
1250  /// optsize or a loop hint annotation.
1252  return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1253  }
1254 
1255  /// Returns true if all loop blocks should be masked to fold tail loop.
1256  bool foldTailByMasking() const { return FoldTailByMasking; }
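  // Illustrative example: with a trip count of 10 and VF = 4, folding the tail
  // runs ceil(10 / 4) = 3 masked vector iterations; in the last iteration the
  // mask disables the two lanes corresponding to iterations 10 and 11, so no
  // scalar remainder loop is needed.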
1257 
1259  return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1260  }
1261 
1262  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1263  /// with factor VF. Return the cost of the instruction, including
1264  /// scalarization overhead if it's needed.
1265  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1266 
1267  /// Estimate cost of a call instruction CI if it were vectorized with factor
1268  /// VF. Return the cost of the instruction, including scalarization overhead
1269  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1270  /// scalarized -
1271  /// i.e. either a vector version isn't available, or it is too expensive.
1272  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1273 
1274 private:
1275  unsigned NumPredStores = 0;
1276 
1277  /// \return An upper bound for the vectorization factor, larger than zero.
1278  /// One is returned if vectorization should best be avoided due to cost.
1279  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1280 
1281  /// The vectorization cost is a combination of the cost itself and a boolean
1282  /// indicating whether any of the contributing operations will actually
1283  /// operate on vector values after type legalization in the backend. If this
1284  /// latter value is false, then all operations will be scalarized (i.e. no
1285  /// vectorization has actually taken place).
1288  using VectorizationCostTy = std::pair<unsigned, bool>;
1289 
1290  /// Returns the expected execution cost. The unit of the cost does
1291  /// not matter because we use the 'cost' units to compare different
1292  /// vector widths. The cost that is returned is *not* normalized by
1293  /// the factor width.
1294  VectorizationCostTy expectedCost(unsigned VF);
1295 
1296  /// Returns the execution time cost of an instruction for a given vector
1297  /// width. Vector width of one means scalar.
1298  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1299 
1300  /// The cost-computation logic from getInstructionCost which provides
1301  /// the vector type as an output parameter.
1302  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1303 
1304  /// Calculate vectorization cost of memory instruction \p I.
1305  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1306 
1307  /// The cost computation for scalarized memory instruction.
1308  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1309 
1310  /// The cost computation for interleaving group of memory instructions.
1311  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1312 
1313  /// The cost computation for Gather/Scatter instruction.
1314  unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1315 
1316  /// The cost computation for widening instruction \p I with consecutive
1317  /// memory access.
1318  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1319 
1320  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1321  /// Load: scalar load + broadcast.
1322  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1323  /// element)
1324  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1325 
1326  /// Estimate the overhead of scalarizing an instruction. This is a
1327  /// convenience wrapper for the type-based getScalarizationOverhead API.
1328  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1329 
1330  /// Returns whether the instruction is a load or store and will be emitted
1331  /// as a vector operation.
1332  bool isConsecutiveLoadOrStore(Instruction *I);
1333 
1334  /// Returns true if an artificially high cost for emulated masked memrefs
1335  /// should be used.
1336  bool useEmulatedMaskMemRefHack(Instruction *I);
1337 
1338  /// Map of scalar integer values to the smallest bitwidth they can be legally
1339  /// represented as. The vector equivalents of these values should be truncated
1340  /// to this type.
1342 
1343  /// A type representing the costs for instructions if they were to be
1344  /// scalarized rather than vectorized. The entries are Instruction-Cost
1345  /// pairs.
1347 
1348  /// A set containing all BasicBlocks that are known to be present after
1349  /// vectorization as predicated blocks.
1350  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1351 
1352  /// Records whether it is allowed to have the original scalar loop execute at
1353  /// least once. This may be needed as a fallback loop in case runtime
1354  /// aliasing/dependence checks fail, or to handle the tail/remainder
1355  /// iterations when the trip count is unknown or is not a multiple of the VF,
1356  /// or as a peel-loop to handle gaps in interleave-groups.
1357  /// Under optsize and when the trip count is very small we don't allow any
1358  /// iterations to execute in the scalar loop.
1359  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1360 
1361  /// All blocks of loop are to be masked to fold tail of scalar iterations.
1362  bool FoldTailByMasking = false;
1363 
1364  /// A map holding scalar costs for different vectorization factors. The
1365  /// presence of a cost for an instruction in the mapping indicates that the
1366  /// instruction will be scalarized when vectorizing with the associated
1367  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1368  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1369 
1370  /// Holds the instructions known to be uniform after vectorization.
1371  /// The data is collected per VF.
1373 
1374  /// Holds the instructions known to be scalar after vectorization.
1375  /// The data is collected per VF.
1377 
1378  /// Holds the instructions (address computations) that are forced to be
1379  /// scalarized.
1381 
1382  /// Returns the expected difference in cost from scalarizing the expression
1383  /// feeding a predicated instruction \p PredInst. The instructions to
1384  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1385  /// non-negative return value implies the expression will be scalarized.
1386  /// Currently, only single-use chains are considered for scalarization.
1387  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1388  unsigned VF);
1389 
1390  /// Collect the instructions that are uniform after vectorization. An
1391  /// instruction is uniform if we represent it with a single scalar value in
1392  /// the vectorized loop corresponding to each vector iteration. Examples of
1393  /// uniform instructions include pointer operands of consecutive or
1394  /// interleaved memory accesses. Note that although uniformity implies an
1395  /// instruction will be scalar, the reverse is not true. In general, a
1396  /// scalarized instruction will be represented by VF scalar values in the
1397  /// vectorized loop, each corresponding to an iteration of the original
1398  /// scalar loop.
1399  void collectLoopUniforms(unsigned VF);
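  // Illustrative example: for a consecutive (unit-stride) access such as
  //   %gep = getelementptr i32, i32* %base, i64 %iv
  //   %val = load i32, i32* %gep
  // the load is widened into a single vector load per vector iteration, so
  // only one scalar copy of %gep is needed and it is uniform, whereas a
  // scalarized (gather-like) load would need one address per lane.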
1400 
1401  /// Collect the instructions that are scalar after vectorization. An
1402  /// instruction is scalar if it is known to be uniform or will be scalarized
1403  /// during vectorization. Non-uniform scalarized instructions will be
1404  /// represented by VF values in the vectorized loop, each corresponding to an
1405  /// iteration of the original scalar loop.
1406  void collectLoopScalars(unsigned VF);
1407 
1408  /// Keeps cost model vectorization decision and cost for instructions.
1409  /// Right now it is used for memory instructions only.
1411  std::pair<InstWidening, unsigned>>;
1412 
1413  DecisionList WideningDecisions;
1414 
1415  /// Returns true if \p V is expected to be vectorized and it needs to be
1416  /// extracted.
1417  bool needsExtract(Value *V, unsigned VF) const {
1418  Instruction *I = dyn_cast<Instruction>(V);
1419  if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1420  return false;
1421 
1422  // Assume we can vectorize V (and hence we need extraction) if the
1423  // scalars are not computed yet. This can happen, because it is called
1424  // via getScalarizationOverhead from setCostBasedWideningDecision, before
1425  // the scalars are collected. That should be a safe assumption in most
1426  // cases, because we check if the operands have vectorizable types
1427  // beforehand in LoopVectorizationLegality.
1428  return Scalars.find(VF) == Scalars.end() ||
1429  !isScalarAfterVectorization(I, VF);
1430  };
1431 
1432  /// Returns a range containing only operands needing to be extracted.
1433  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1434  unsigned VF) {
1436  Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1437  }
1438 
1439 public:
1440  /// The loop that we evaluate.
1442 
1443  /// Predicated scalar evolution analysis.
1445 
1446  /// Loop Info analysis.
1448 
1449  /// Vectorization legality.
1451 
1452  /// Vector target information.
1454 
1455  /// Target Library Info.
1457 
1458  /// Demanded bits analysis.
1460 
1461  /// Assumption cache.
1463 
1464  /// Interface to emit optimization remarks.
1466 
1468 
1469  /// Loop Vectorize Hint.
1471 
1472  /// The interleave access information contains groups of interleaved accesses
1473  /// with the same stride that are close to each other.
1475 
1476  /// Values to ignore in the cost model.
1478 
1479  /// Values to ignore in the cost model when VF > 1.
1481 };
1482 
1483 } // end namespace llvm
1484 
1485 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1486 // vectorization. The loop needs to be annotated with #pragma omp simd
1487 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1488 // vector length information is not provided, vectorization is not considered
1489 // explicit. Interleave hints are not allowed either. These limitations will be
1490 // relaxed in the future.
1491 // Please note that we are currently forced to abuse the pragma 'clang
1492 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1493 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1494 // provides *explicit vectorization hints* (LV can bypass legal checks and
1495 // assume that vectorization is legal). However, both hints are implemented
1496 // using the same metadata (llvm.loop.vectorize, processed by
1497 // LoopVectorizeHints). This will be fixed in the future when the native IR
1498 // representation for pragma 'omp simd' is introduced.
1499 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1500  OptimizationRemarkEmitter *ORE) {
1501  assert(!OuterLp->empty() && "This is not an outer loop");
1502  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1503 
1504  // Only outer loops with an explicit vectorization hint are supported.
1505  // Unannotated outer loops are ignored.
1506  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1507  return false;
1508 
1509  Function *Fn = OuterLp->getHeader()->getParent();
1510  if (!Hints.allowVectorization(Fn, OuterLp,
1511  true /*VectorizeOnlyWhenForced*/)) {
1512  LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1513  return false;
1514  }
1515 
1516  if (Hints.getInterleave() > 1) {
1517  // TODO: Interleave support is future work.
1518  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1519  "outer loops.\n");
1520  Hints.emitRemarkWithHints();
1521  return false;
1522  }
1523 
1524  return true;
1525 }
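// Editor's note (illustrative, not from the original source): an outer loop that
// this predicate accepts could be annotated in the C/C++ source roughly as
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (i = 0; i < N; ++i)      // outer loop, explicit VF of 4
//     for (j = 0; j < M; ++j)
//       A[i][j] = 0;
// Without the vectorize_width(#)/simdlen(#) clause the hint is not treated as
// explicit vectorization, and the outer loop is rejected above.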
1526 
1527 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1528  OptimizationRemarkEmitter *ORE,
1529  SmallVectorImpl<Loop *> &V) {
1530  // Collect inner loops and outer loops without irreducible control flow. For
1531  // now, only collect outer loops that have explicit vectorization hints. If we
1532  // are stress testing the VPlan H-CFG construction, we collect the outermost
1533  // loop of every loop nest.
1534  if (L.empty() || VPlanBuildStressTest ||
1535  (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1536  LoopBlocksRPO RPOT(&L);
1537  RPOT.perform(LI);
1538  if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1539  V.push_back(&L);
1540  // TODO: Collect inner loops inside marked outer loops in case
1541  // vectorization fails for the outer loop. Do not invoke
1542  // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1543  // already known to be reducible. We can use an inherited attribute for
1544  // that.
1545  return;
1546  }
1547  }
1548  for (Loop *InnerL : L)
1549  collectSupportedLoops(*InnerL, LI, ORE, V);
1550 }
1551 
1552 namespace {
1553 
1554 /// The LoopVectorize Pass.
1555 struct LoopVectorize : public FunctionPass {
1556  /// Pass identification, replacement for typeid
1557  static char ID;
1558 
1559  LoopVectorizePass Impl;
1560 
1561  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1562  bool VectorizeOnlyWhenForced = false)
1563  : FunctionPass(ID) {
1564  Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1565  Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1566  initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1567  }
1568 
1569  bool runOnFunction(Function &F) override {
1570  if (skipFunction(F))
1571  return false;
1572 
1573  auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1574  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1575  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1576  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1577  auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1578  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1579  auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1580  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1581  auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1582  auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1583  auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1584  auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1585  auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1586 
1587  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1588  [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1589 
1590  return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1591  GetLAA, *ORE, PSI);
1592  }
1593 
1594  void getAnalysisUsage(AnalysisUsage &AU) const override {
1595  AU.addRequired<AssumptionCacheTracker>();
1596  AU.addRequired<BlockFrequencyInfoWrapperPass>();
1597  AU.addRequired<DominatorTreeWrapperPass>();
1598  AU.addRequired<LoopInfoWrapperPass>();
1599  AU.addRequired<ScalarEvolutionWrapperPass>();
1600  AU.addRequired<TargetTransformInfoWrapperPass>();
1601  AU.addRequired<AAResultsWrapperPass>();
1602  AU.addRequired<LoopAccessLegacyAnalysis>();
1603  AU.addRequired<DemandedBitsWrapperPass>();
1604  AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1605 
1606  // We currently do not preserve loopinfo/dominator analyses with outer loop
1607  // vectorization. Until this is addressed, mark these analyses as preserved
1608  // only for non-VPlan-native path.
1609  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1610  if (!EnableVPlanNativePath) {
1611  AU.addPreserved<LoopInfoWrapperPass>();
1612  AU.addPreserved<DominatorTreeWrapperPass>();
1613  }
1614 
1615  AU.addPreserved<BasicAAWrapperPass>();
1616  AU.addPreserved<GlobalsAAWrapperPass>();
1617  AU.addRequired<ProfileSummaryInfoWrapperPass>();
1618  }
1619 };
1620 
1621 } // end anonymous namespace
1622 
1623 //===----------------------------------------------------------------------===//
1624 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1625 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1626 //===----------------------------------------------------------------------===//
1627 
1628 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1629  // We need to place the broadcast of invariant variables outside the loop,
1630  // but only if it's proven safe to do so. Else, broadcast will be inside
1631  // vector loop body.
1632  Instruction *Instr = dyn_cast<Instruction>(V);
1633  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1634  (!Instr ||
1635  DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1636  // Place the code for broadcasting invariant variables in the new preheader.
1637  IRBuilder<>::InsertPointGuard Guard(Builder);
1638  if (SafeToHoist)
1639  Builder.SetInsertPoint(&*LoopVectorPreHeader->getFirstInsertionPt());
1640 
1641  // Broadcast the scalar into all locations in the vector.
1642  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1643 
1644  return Shuf;
1645 }
1646 
1647 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1648  const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1649  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1650  "Expected either an induction phi-node or a truncate of it!");
1651  Value *Start = II.getStartValue();
1652 
1653  // Construct the initial value of the vector IV in the vector loop preheader
1654  auto CurrIP = Builder.saveIP();
1655  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1656  if (isa<TruncInst>(EntryVal)) {
1657  assert(Start->getType()->isIntegerTy() &&
1658  "Truncation requires an integer type");
1659  auto *TruncType = cast<IntegerType>(EntryVal->getType());
1660  Step = Builder.CreateTrunc(Step, TruncType);
1661  Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1662  }
1663  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1664  Value *SteppedStart =
1665  getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1666 
1667  // We create vector phi nodes for both integer and floating-point induction
1668  // variables. Here, we determine the kind of arithmetic we will perform.
1669  Instruction::BinaryOps AddOp;
1670  Instruction::BinaryOps MulOp;
1671  if (Step->getType()->isIntegerTy()) {
1672  AddOp = Instruction::Add;
1673  MulOp = Instruction::Mul;
1674  } else {
1675  AddOp = II.getInductionOpcode();
1676  MulOp = Instruction::FMul;
1677  }
1678 
1679  // Multiply the vectorization factor by the step using integer or
1680  // floating-point arithmetic as appropriate.
1681  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1682  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1683 
1684  // Create a vector splat to use in the induction update.
1685  //
1686  // FIXME: If the step is non-constant, we create the vector splat with
1687  // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1688  // handle a constant vector splat.
1689  Value *SplatVF = isa<Constant>(Mul)
1690  ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1691  : Builder.CreateVectorSplat(VF, Mul);
1692  Builder.restoreIP(CurrIP);
1693 
1694  // We may need to add the step a number of times, depending on the unroll
1695  // factor. The last of those goes into the PHI.
1696  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1697  &*LoopVectorBody->getFirstInsertionPt());
1698  VecInd->setDebugLoc(EntryVal->getDebugLoc());
1699  Instruction *LastInduction = VecInd;
1700  for (unsigned Part = 0; Part < UF; ++Part) {
1701  VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1702 
1703  if (isa<TruncInst>(EntryVal))
1704  addMetadata(LastInduction, EntryVal);
1705  recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1706 
1707  LastInduction = cast<Instruction>(addFastMathFlag(
1708  Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1709  LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1710  }
1711 
1712  // Move the last step to the end of the latch block. This ensures consistent
1713  // placement of all induction updates.
1714  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1715  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1716  auto *ICmp = cast<Instruction>(Br->getCondition());
1717  LastInduction->moveBefore(ICmp);
1718  LastInduction->setName("vec.ind.next");
1719 
1720  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1721  VecInd->addIncoming(LastInduction, LoopVectorLatch);
1722 }
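// Worked example (editor's sketch, values are illustrative): for an i32 induction
// starting at 0 with step 1, VF = 4 and UF = 2, the code above produces
//   %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
//                                 [ %vec.ind.next, %vector.body ]
//   %step.add     = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>   ; part 1
//   %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>  ; latch update
// so the vector IV advances by VF * UF = 8 per vector iteration.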
1723 
1724 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1725  return Cost->isScalarAfterVectorization(I, VF) ||
1726  Cost->isProfitableToScalarize(I, VF);
1727 }
1728 
1729 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1730  if (shouldScalarizeInstruction(IV))
1731  return true;
1732  auto isScalarInst = [&](User *U) -> bool {
1733  auto *I = cast<Instruction>(U);
1734  return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1735  };
1736  return llvm::any_of(IV->users(), isScalarInst);
1737 }
1738 
1739 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1740  const InductionDescriptor &ID, const Instruction *EntryVal,
1741  Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1742  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1743  "Expected either an induction phi-node or a truncate of it!");
1744 
1745  // This induction variable is not the phi from the original loop but the
1746  // newly-created IV based on the proof that the casted phi is equal to the
1747  // uncasted phi in the vectorized loop (possibly under a runtime guard). It
1748  // reuses the same InductionDescriptor as the original IV, but we don't have
1749  // to do any recording in this case: that is done when the original IV is
1750  // processed.
1751  if (isa<TruncInst>(EntryVal))
1752  return;
1753 
1754  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1755  if (Casts.empty())
1756  return;
1757  // Only the first Cast instruction in the Casts vector is of interest.
1758  // The rest of the Casts (if they exist) have no uses outside the
1759  // induction update chain itself.
1760  Instruction *CastInst = *Casts.begin();
1761  if (Lane < UINT_MAX)
1762  VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1763  else
1764  VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1765 }
1766 
1767 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1768  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1769  "Primary induction variable must have an integer type");
1770 
1771  auto II = Legal->getInductionVars()->find(IV);
1772  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1773 
1774  auto ID = II->second;
1775  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1776 
1777  // The scalar value to broadcast. This will be derived from the canonical
1778  // induction variable.
1779  Value *ScalarIV = nullptr;
1780 
1781  // The value from the original loop to which we are mapping the new induction
1782  // variable.
1783  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1784 
1785  // True if we have vectorized the induction variable.
1786  auto VectorizedIV = false;
1787 
1788  // Determine if we want a scalar version of the induction variable. This is
1789  // true if the induction variable itself is not widened, or if it has at
1790  // least one user in the loop that is not widened.
1791  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1792 
1793  // Generate code for the induction step. Note that induction steps are
1794  // required to be loop-invariant
1795  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1796  "Induction step should be loop invariant");
1797  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1798  Value *Step = nullptr;
1799  if (PSE.getSE()->isSCEVable(IV->getType())) {
1800  SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1801  Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1802  &*Builder.GetInsertPoint());
1803  } else {
1804  Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1805  }
1806 
1807  // Try to create a new independent vector induction variable. If we can't
1808  // create the phi node, we will splat the scalar induction variable in each
1809  // loop iteration.
1810  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1811  createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1812  VectorizedIV = true;
1813  }
1814 
1815  // If we haven't yet vectorized the induction variable, or if we will create
1816  // a scalar one, we need to define the scalar induction variable and step
1817  // values. If we were given a truncation type, truncate the canonical
1818  // induction variable and step. Otherwise, derive these values from the
1819  // induction descriptor.
1820  if (!VectorizedIV || NeedsScalarIV) {
1821  ScalarIV = Induction;
1822  if (IV != OldInduction) {
1823  ScalarIV = IV->getType()->isIntegerTy()
1824  ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1825  : Builder.CreateCast(Instruction::SIToFP, Induction,
1826  IV->getType());
1827  ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1828  ScalarIV->setName("offset.idx");
1829  }
1830  if (Trunc) {
1831  auto *TruncType = cast<IntegerType>(Trunc->getType());
1832  assert(Step->getType()->isIntegerTy() &&
1833  "Truncation requires an integer step");
1834  ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1835  Step = Builder.CreateTrunc(Step, TruncType);
1836  }
1837  }
1838 
1839  // If we haven't yet vectorized the induction variable, splat the scalar
1840  // induction variable, and build the necessary step vectors.
1841  // TODO: Don't do it unless the vectorized IV is really required.
1842  if (!VectorizedIV) {
1843  Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1844  for (unsigned Part = 0; Part < UF; ++Part) {
1845  Value *EntryPart =
1846  getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1847  VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1848  if (Trunc)
1849  addMetadata(EntryPart, Trunc);
1850  recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1851  }
1852  }
1853 
1854  // If an induction variable is only used for counting loop iterations or
1855  // calculating addresses, it doesn't need to be widened. Create scalar steps
1856  // that can be used by instructions we will later scalarize. Note that the
1857  // addition of the scalar steps will not increase the number of instructions
1858  // in the loop in the common case prior to InstCombine. We will be trading
1859  // one vector extract for each scalar step.
1860  if (NeedsScalarIV)
1861  buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1862 }
1863 
1864 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1865  Instruction::BinaryOps BinOp) {
1866  // Create and check the types.
1867  assert(Val->getType()->isVectorTy() && "Must be a vector");
1868  int VLen = Val->getType()->getVectorNumElements();
1869 
1870  Type *STy = Val->getType()->getScalarType();
1871  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1872  "Induction Step must be an integer or FP");
1873  assert(Step->getType() == STy && "Step has wrong type");
1874 
1875  SmallVector<Constant *, 8> Indices;
1876 
1877  if (STy->isIntegerTy()) {
1878  // Create a vector of consecutive numbers from zero to VF.
1879  for (int i = 0; i < VLen; ++i)
1880  Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1881 
1882  // Add the consecutive indices to the vector value.
1883  Constant *Cv = ConstantVector::get(Indices);
1884  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1885  Step = Builder.CreateVectorSplat(VLen, Step);
1886  assert(Step->getType() == Val->getType() && "Invalid step vec");
1887  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1888  // which can be found from the original scalar operations.
1889  Step = Builder.CreateMul(Cv, Step);
1890  return Builder.CreateAdd(Val, Step, "induction");
1891  }
1892 
1893  // Floating point induction.
1894  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1895  "Binary Opcode should be specified for FP induction");
1896  // Create a vector of consecutive numbers from zero to VF.
1897  for (int i = 0; i < VLen; ++i)
1898  Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1899 
1900  // Add the consecutive indices to the vector value.
1901  Constant *Cv = ConstantVector::get(Indices);
1902 
1903  Step = Builder.CreateVectorSplat(VLen, Step);
1904 
1905  // Floating point operations had to be 'fast' to enable the induction.
1906  FastMathFlags Flags;
1907  Flags.setFast();
1908 
1909  Value *MulOp = Builder.CreateFMul(Cv, Step);
1910  if (isa<Instruction>(MulOp))
1911  // Have to check, MulOp may be a constant
1912  cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1913 
1914  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1915  if (isa<Instruction>(BOp))
1916  cast<Instruction>(BOp)->setFastMathFlags(Flags);
1917  return BOp;
1918 }
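// Worked example (editor's sketch): for an integer type with VF = 4,
// getStepVector(<4 x i32> <10, 10, 10, 10>, /*StartIdx=*/0, /*Step=*/2) builds
// Cv = <0, 1, 2, 3>, multiplies it by the splatted step to get <0, 2, 4, 6>, and
// returns <10, 12, 14, 16> as the "induction" value.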
1919 
1920 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1921  Instruction *EntryVal,
1922  const InductionDescriptor &ID) {
1923  // We shouldn't have to build scalar steps if we aren't vectorizing.
1924  assert(VF > 1 && "VF should be greater than one");
1925 
1926  // Get the value type and ensure it and the step have the same integer type.
1927  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1928  assert(ScalarIVTy == Step->getType() &&
1929  "Val and Step should have the same type");
1930 
1931  // We build scalar steps for both integer and floating-point induction
1932  // variables. Here, we determine the kind of arithmetic we will perform.
1933  Instruction::BinaryOps AddOp;
1934  Instruction::BinaryOps MulOp;
1935  if (ScalarIVTy->isIntegerTy()) {
1936  AddOp = Instruction::Add;
1937  MulOp = Instruction::Mul;
1938  } else {
1939  AddOp = ID.getInductionOpcode();
1940  MulOp = Instruction::FMul;
1941  }
1942 
1943  // Determine the number of scalars we need to generate for each unroll
1944  // iteration. If EntryVal is uniform, we only need to generate the first
1945  // lane. Otherwise, we generate all VF values.
1946  unsigned Lanes =
1947  Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1948  : VF;
1949  // Compute the scalar steps and save the results in VectorLoopValueMap.
1950  for (unsigned Part = 0; Part < UF; ++Part) {
1951  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1952  auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1953  auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1954  auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1955  VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1956  recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1957  }
1958  }
1959 }
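// Worked example (editor's sketch): with VF = 4, UF = 2, an integer step of 1 and
// ScalarIV = %offset.idx, a non-uniform EntryVal receives the scalar values
//   part 0: %offset.idx + 0, + 1, + 2, + 3
//   part 1: %offset.idx + 4, + 5, + 6, + 7
// while a uniform EntryVal only gets lane 0 of each part (+ 0 and + 4).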
1960 
1961 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1962  assert(V != Induction && "The new induction variable should not be used.");
1963  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1964  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1965 
1966  // If we have a stride that is replaced by one, do it here. Defer this for
1967  // the VPlan-native path until we start running Legal checks in that path.
1968  if (!EnableVPlanNativePath && Legal->hasStride(V))
1969  V = ConstantInt::get(V->getType(), 1);
1970 
1971  // If we have a vector mapped to this value, return it.
1972  if (VectorLoopValueMap.hasVectorValue(V, Part))
1973  return VectorLoopValueMap.getVectorValue(V, Part);
1974 
1975  // If the value has not been vectorized, check if it has been scalarized
1976  // instead. If it has been scalarized, and we actually need the value in
1977  // vector form, we will construct the vector values on demand.
1978  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
1979  Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
1980 
1981  // If we've scalarized a value, that value should be an instruction.
1982  auto *I = cast<Instruction>(V);
1983 
1984  // If we aren't vectorizing, we can just copy the scalar map values over to
1985  // the vector map.
1986  if (VF == 1) {
1987  VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
1988  return ScalarValue;
1989  }
1990 
1991  // Get the last scalar instruction we generated for V and Part. If the value
1992  // is known to be uniform after vectorization, this corresponds to lane zero
1993  // of the Part unroll iteration. Otherwise, the last instruction is the one
1994  // we created for the last vector lane of the Part unroll iteration.
1995  unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
1996  auto *LastInst = cast<Instruction>(
1997  VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
1998 
1999  // Set the insert point after the last scalarized instruction. This ensures
2000  // the insertelement sequence will directly follow the scalar definitions.
2001  auto OldIP = Builder.saveIP();
2002  auto NewIP = std::next(BasicBlock::iterator(LastInst));
2003  Builder.SetInsertPoint(&*NewIP);
2004 
2005  // However, if we are vectorizing, we need to construct the vector values.
2006  // If the value is known to be uniform after vectorization, we can just
2007  // broadcast the scalar value corresponding to lane zero for each unroll
2008  // iteration. Otherwise, we construct the vector values using insertelement
2009  // instructions. Since the resulting vectors are stored in
2010  // VectorLoopValueMap, we will only generate the insertelements once.
2011  Value *VectorValue = nullptr;
2012  if (Cost->isUniformAfterVectorization(I, VF)) {
2013  VectorValue = getBroadcastInstrs(ScalarValue);
2014  VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2015  } else {
2016  // Initialize packing with insertelements to start from undef.
2017  Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2018  VectorLoopValueMap.setVectorValue(V, Part, Undef);
2019  for (unsigned Lane = 0; Lane < VF; ++Lane)
2020  packScalarIntoVectorValue(V, {Part, Lane});
2021  VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2022  }
2023  Builder.restoreIP(OldIP);
2024  return VectorValue;
2025  }
2026 
2027  // If this scalar is unknown, assume that it is a constant or that it is
2028  // loop invariant. Broadcast V and save the value for future uses.
2029  Value *B = getBroadcastInstrs(V);
2030  VectorLoopValueMap.setVectorValue(V, Part, B);
2031  return B;
2032 }
2033 
2034 Value *
2035 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2036  const VPIteration &Instance) {
2037  // If the value is not an instruction contained in the loop, it should
2038  // already be scalar.
2039  if (OrigLoop->isLoopInvariant(V))
2040  return V;
2041 
2042  assert(Instance.Lane > 0
2043  ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2044  : true && "Uniform values only have lane zero");
2045 
2046  // If the value from the original loop has not been vectorized, it is
2047  // represented by UF x VF scalar values in the new loop. Return the requested
2048  // scalar value.
2049  if (VectorLoopValueMap.hasScalarValue(V, Instance))
2050  return VectorLoopValueMap.getScalarValue(V, Instance);
2051 
2052  // If the value has not been scalarized, get its entry in VectorLoopValueMap
2053  // for the given unroll part. If this entry is not a vector type (i.e., the
2054  // vectorization factor is one), there is no need to generate an
2055  // extractelement instruction.
2056  auto *U = getOrCreateVectorValue(V, Instance.Part);
2057  if (!U->getType()->isVectorTy()) {
2058  assert(VF == 1 && "Value not scalarized has non-vector type");
2059  return U;
2060  }
2061 
2062  // Otherwise, the value from the original loop has been vectorized and is
2063  // represented by UF vector values. Extract and return the requested scalar
2064  // value from the appropriate vector lane.
2065  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2066 }
2067 
2068 void InnerLoopVectorizer::packScalarIntoVectorValue(
2069  Value *V, const VPIteration &Instance) {
2070  assert(V != Induction && "The new induction variable should not be used.");
2071  assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2072  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2073 
2074  Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2075  Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2076  VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2077  Builder.getInt32(Instance.Lane));
2078  VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2079 }
2080 
2081 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2082  assert(Vec->getType()->isVectorTy() && "Invalid type");
2083  SmallVector<Constant *, 8> ShuffleMask;
2084  for (unsigned i = 0; i < VF; ++i)
2085  ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2086 
2087  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2088  ConstantVector::get(ShuffleMask),
2089  "reverse");
2090 }
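// Editor's illustration: for VF = 4 the mask built above is <3, 2, 1, 0>, so a
// vector <a, b, c, d> is reversed to <d, c, b, a>.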
2091 
2092 // Return whether we allow using masked interleave-groups (for dealing with
2093 // strided loads/stores that reside in predicated blocks, or for dealing
2094 // with gaps).
2095 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2096  // If an override option has been passed in for interleaved accesses, use it.
2097  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2098  return EnableMaskedInterleavedMemAccesses;
2099 
2100  return TTI.enableMaskedInterleavedAccessVectorization();
2101 }
2102 
2103 // Try to vectorize the interleave group that \p Instr belongs to.
2104 //
2105 // E.g. Translate following interleaved load group (factor = 3):
2106 // for (i = 0; i < N; i+=3) {
2107 // R = Pic[i]; // Member of index 0
2108 // G = Pic[i+1]; // Member of index 1
2109 // B = Pic[i+2]; // Member of index 2
2110 // ... // do something to R, G, B
2111 // }
2112 // To:
2113 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2114 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
2115 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
2116 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
2117 //
2118 // Or translate following interleaved store group (factor = 3):
2119 // for (i = 0; i < N; i+=3) {
2120 // ... do something to R, G, B
2121 // Pic[i] = R; // Member of index 0
2122 // Pic[i+1] = G; // Member of index 1
2123 // Pic[i+2] = B; // Member of index 2
2124 // }
2125 // To:
2126 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2127 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2128 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2129 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2130 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2131 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2132  VectorParts *BlockInMask) {
2133  const InterleaveGroup<Instruction> *Group =
2134  Cost->getInterleavedAccessGroup(Instr);
2135  assert(Group && "Fail to get an interleaved access group.");
2136 
2137  // Skip if current instruction is not the insert position.
2138  if (Instr != Group->getInsertPos())
2139  return;
2140 
2141  const DataLayout &DL = Instr->getModule()->getDataLayout();
2142  Value *Ptr = getLoadStorePointerOperand(Instr);
2143 
2144  // Prepare for the vector type of the interleaved load/store.
2145  Type *ScalarTy = getMemInstValueType(Instr);
2146  unsigned InterleaveFactor = Group->getFactor();
2147  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2148  Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2149 
2150  // Prepare for the new pointers.
2152  SmallVector<Value *, 2> NewPtrs;
2153  unsigned Index = Group->getIndex(Instr);
2154 
2155  VectorParts Mask;
2156  bool IsMaskForCondRequired = BlockInMask;
2157  if (IsMaskForCondRequired) {
2158  Mask = *BlockInMask;
2159  // TODO: extend the masked interleaved-group support to reversed access.
2160  assert(!Group->isReverse() && "Reversed masked interleave-group "
2161  "not supported.");
2162  }
2163 
2164  // If the group is reverse, adjust the index to refer to the last vector lane
2165  // instead of the first. We adjust the index from the first vector lane,
2166  // rather than directly getting the pointer for lane VF - 1, because the
2167  // pointer operand of the interleaved access is supposed to be uniform. For
2168  // uniform instructions, we're only required to generate a value for the
2169  // first vector lane in each unroll iteration.
2170  if (Group->isReverse())
2171  Index += (VF - 1) * Group->getFactor();
2172 
2173  bool InBounds = false;
2174  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2175  InBounds = gep->isInBounds();
2176 
2177  for (unsigned Part = 0; Part < UF; Part++) {
2178  Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2179 
2180  // Note that the current instruction could be at any index in the group. The
2181  // address needs to be adjusted to that of the member at index 0.
2182  //
2183  // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2184  // b = A[i]; // Member of index 0
2185  // The current pointer points to A[i+1]; adjust it to A[i].
2186  //
2187  // E.g. A[i+1] = a; // Member of index 1
2188  // A[i] = b; // Member of index 0
2189  // A[i+2] = c; // Member of index 2 (Current instruction)
2190  // The current pointer points to A[i+2]; adjust it to A[i].
2191  NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2192  if (InBounds)
2193  cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2194 
2195  // Cast to the vector pointer type.
2196  NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2197  }
2198 
2199  setDebugLocFromInst(Builder, Instr);
2200  Value *UndefVec = UndefValue::get(VecTy);
2201 
2202  Value *MaskForGaps = nullptr;
2203  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2204  MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2205  assert(MaskForGaps && "Mask for Gaps is required but it is null");
2206  }
2207 
2208  // Vectorize the interleaved load group.
2209  if (isa<LoadInst>(Instr)) {
2210  // For each unroll part, create a wide load for the group.
2211  SmallVector<Value *, 2> NewLoads;
2212  for (unsigned Part = 0; Part < UF; Part++) {
2213  Instruction *NewLoad;
2214  if (IsMaskForCondRequired || MaskForGaps) {
2215  assert(useMaskedInterleavedAccesses(*TTI) &&
2216  "masked interleaved groups are not allowed.");
2217  Value *GroupMask = MaskForGaps;
2218  if (IsMaskForCondRequired) {
2219  auto *Undefs = UndefValue::get(Mask[Part]->getType());
2220  auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2221  Value *ShuffledMask = Builder.CreateShuffleVector(
2222  Mask[Part], Undefs, RepMask, "interleaved.mask");
2223  GroupMask = MaskForGaps
2224  ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2225  MaskForGaps)
2226  : ShuffledMask;
2227  }
2228  NewLoad =
2229  Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2230  GroupMask, UndefVec, "wide.masked.vec");
2231  }
2232  else
2233  NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2234  Group->getAlignment(), "wide.vec");
2235  Group->addMetadata(NewLoad);
2236  NewLoads.push_back(NewLoad);
2237  }
2238 
2239  // For each member in the group, shuffle out the appropriate data from the
2240  // wide loads.
2241  for (unsigned I = 0; I < InterleaveFactor; ++I) {
2242  Instruction *Member = Group->getMember(I);
2243 
2244  // Skip the gaps in the group.
2245  if (!Member)
2246  continue;
2247 
2248  Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2249  for (unsigned Part = 0; Part < UF; Part++) {
2250  Value *StridedVec = Builder.CreateShuffleVector(
2251  NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2252 
2253  // If this member has a different type, cast the result type.
2254  if (Member->getType() != ScalarTy) {
2255  VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2256  StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2257  }
2258 
2259  if (Group->isReverse())
2260  StridedVec = reverseVector(StridedVec);
2261 
2262  VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2263  }
2264  }
2265  return;
2266  }
2267 
2268  // The sub vector type for current instruction.
2269  VectorType *SubVT = VectorType::get(ScalarTy, VF);
2270 
2271  // Vectorize the interleaved store group.
2272  for (unsigned Part = 0; Part < UF; Part++) {
2273  // Collect the stored vector from each member.
2274  SmallVector<Value *, 4> StoredVecs;
2275  for (unsigned i = 0; i < InterleaveFactor; i++) {
2276  // Interleaved store group doesn't allow a gap, so each index has a member
2277  Instruction *Member = Group->getMember(i);
2278  assert(Member && "Fail to get a member from an interleaved store group");
2279 
2280  Value *StoredVec = getOrCreateVectorValue(
2281  cast<StoreInst>(Member)->getValueOperand(), Part);
2282  if (Group->isReverse())
2283  StoredVec = reverseVector(StoredVec);
2284 
2285  // If this member has a different type, cast it to the unified type.
2286 
2287  if (StoredVec->getType() != SubVT)
2288  StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2289 
2290  StoredVecs.push_back(StoredVec);
2291  }
2292 
2293  // Concatenate all vectors into a wide vector.
2294  Value *WideVec = concatenateVectors(Builder, StoredVecs);
2295 
2296  // Interleave the elements in the wide vector.
2297  Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2298  Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2299  "interleaved.vec");
2300 
2301  Instruction *NewStoreInstr;
2302  if (IsMaskForCondRequired) {
2303  auto *Undefs = UndefValue::get(Mask[Part]->getType());
2304  auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2305  Value *ShuffledMask = Builder.CreateShuffleVector(
2306  Mask[Part], Undefs, RepMask, "interleaved.mask");
2307  NewStoreInstr = Builder.CreateMaskedStore(
2308  IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2309  }
2310  else
2311  NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2312  Group->getAlignment());
2313 
2314  Group->addMetadata(NewStoreInstr);
2315  }
2316 }
2317 
2318 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2319  VectorParts *BlockInMask) {
2320  // Attempt to issue a wide load.
2321  LoadInst *LI = dyn_cast<LoadInst>(Instr);
2322  StoreInst *SI = dyn_cast<StoreInst>(Instr);
2323 
2324  assert((LI || SI) && "Invalid Load/Store instruction");
2325 
2327  Cost->getWideningDecision(Instr, VF);
2329  "CM decision should be taken at this point");
2331  return vectorizeInterleaveGroup(Instr);
2332 
2333  Type *ScalarDataTy = getMemInstValueType(Instr);
2334  Type *DataTy = VectorType::get(ScalarDataTy, VF);
2335  Value *Ptr = getLoadStorePointerOperand(Instr);
2336  unsigned Alignment = getLoadStoreAlignment(Instr);
2337  // An alignment of 0 means target ABI alignment. We need to use the scalar's
2338  // target ABI alignment in such a case.
2339  const DataLayout &DL = Instr->getModule()->getDataLayout();
2340  if (!Alignment)
2341  Alignment = DL.getABITypeAlignment(ScalarDataTy);
2342  unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2343 
2344  // Determine if the pointer operand of the access is either consecutive or
2345  // reverse consecutive.
2346  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2347  bool ConsecutiveStride =
2348  Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2349  bool CreateGatherScatter =
2350  (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2351 
2352  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2353  // gather/scatter. Otherwise Decision should have been to Scalarize.
2354  assert((ConsecutiveStride || CreateGatherScatter) &&
2355  "The instruction should be scalarized");
2356 
2357  // Handle consecutive loads/stores.
2358  if (ConsecutiveStride)
2359  Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2360 
2361  VectorParts Mask;
2362  bool isMaskRequired = BlockInMask;
2363  if (isMaskRequired)
2364  Mask = *BlockInMask;
2365 
2366  bool InBounds = false;
2367  if (auto *gep = dyn_cast<GetElementPtrInst>(
2368  getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2369  InBounds = gep->isInBounds();
2370 
2371  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2372  // Calculate the pointer for the specific unroll-part.
2373  GetElementPtrInst *PartPtr = nullptr;
2374 
2375  if (Reverse) {
2376  // If the address is consecutive but reversed, then the
2377  // wide store needs to start at the last vector element.
2378  PartPtr = cast<GetElementPtrInst>(
2379  Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2380  PartPtr->setIsInBounds(InBounds);
2381  PartPtr = cast<GetElementPtrInst>(
2382  Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2383  PartPtr->setIsInBounds(InBounds);
2384  if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2385  Mask[Part] = reverseVector(Mask[Part]);
2386  } else {
2387  PartPtr = cast<GetElementPtrInst>(
2388  Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2389  PartPtr->setIsInBounds(InBounds);
2390  }
2391 
2392  return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2393  };
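// Worked example for the Reverse case above (editor's sketch): with VF = 4, part 0
// computes gep(Ptr, 0) followed by gep(..., 1 - 4), i.e. &Ptr[-3], so the wide load
// covers Ptr[-3..0] and is then reversed to match scalar iterations i, i-1, i-2,
// i-3; part 1 computes gep(Ptr, -4) then gep(..., -3), covering Ptr[-7..-4].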
2394 
2395  // Handle Stores:
2396  if (SI) {
2397  setDebugLocFromInst(Builder, SI);
2398 
2399  for (unsigned Part = 0; Part < UF; ++Part) {
2400  Instruction *NewSI = nullptr;
2401  Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2402  if (CreateGatherScatter) {
2403  Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2404  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2405  NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2406  MaskPart);
2407  } else {
2408  if (Reverse) {
2409  // If we store to reverse consecutive memory locations, then we need
2410  // to reverse the order of elements in the stored value.
2411  StoredVal = reverseVector(StoredVal);
2412  // We don't want to update the value in the map as it might be used in
2413  // another expression. So don't call resetVectorValue(StoredVal).
2414  }
2415  auto *VecPtr = CreateVecPtr(Part, Ptr);
2416  if (isMaskRequired)
2417  NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2418  Mask[Part]);
2419  else
2420  NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2421  }
2422  addMetadata(NewSI, SI);
2423  }
2424  return;
2425  }
2426 
2427  // Handle loads.
2428  assert(LI && "Must have a load instruction");
2429  setDebugLocFromInst(Builder, LI);
2430  for (unsigned Part = 0; Part < UF; ++Part) {
2431  Value *NewLI;
2432  if (CreateGatherScatter) {
2433  Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2434  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2435  NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2436  nullptr, "wide.masked.gather");
2437  addMetadata(NewLI, LI);
2438  } else {
2439  auto *VecPtr = CreateVecPtr(Part, Ptr);
2440  if (isMaskRequired)
2441  NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2442  UndefValue::get(DataTy),
2443  "wide.masked.load");
2444  else
2445  NewLI =
2446  Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2447 
2448  // Add metadata to the load, but setVectorValue to the reverse shuffle.
2449  addMetadata(NewLI, LI);
2450  if (Reverse)
2451  NewLI = reverseVector(NewLI);
2452  }
2453  VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2454  }
2455 }
2456 
2457 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2458  const VPIteration &Instance,
2459  bool IfPredicateInstr) {
2460  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2461 
2462  setDebugLocFromInst(Builder, Instr);
2463 
2464  // Does this instruction return a value ?
2465  bool IsVoidRetTy = Instr->getType()->isVoidTy();
2466 
2467  Instruction *Cloned = Instr->clone();
2468  if (!IsVoidRetTy)
2469  Cloned->setName(Instr->getName() + ".cloned");
2470 
2471  // Replace the operands of the cloned instructions with their scalar
2472  // equivalents in the new loop.
2473  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2474  auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2475  Cloned->setOperand(op, NewOp);
2476  }
2477  addNewMetadata(Cloned, Instr);
2478 
2479  // Place the cloned scalar in the new loop.
2480  Builder.Insert(Cloned);
2481 
2482  // Add the cloned scalar to the scalar map entry.
2483  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2484 
2485  // If we just cloned a new assumption, add it the assumption cache.
2486  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2487  if (II->getIntrinsicID() == Intrinsic::assume)
2488  AC->registerAssumption(II);
2489 
2490  // End if-block.
2491  if (IfPredicateInstr)
2492  PredicatedInstructions.push_back(Cloned);
2493 }
2494 
2495 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2496  Value *End, Value *Step,
2497  Instruction *DL) {
2498  BasicBlock *Header = L->getHeader();
2499  BasicBlock *Latch = L->getLoopLatch();
2500  // As we're just creating this loop, it's possible no latch exists
2501  // yet. If so, use the header as this will be a single block loop.
2502  if (!Latch)
2503  Latch = Header;
2504 
2505  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2506  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2507  setDebugLocFromInst(Builder, OldInst);
2508  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2509 
2511  setDebugLocFromInst(Builder, OldInst);
2512 
2513  // Create i+1 and fill the PHINode.
2514  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2515  Induction->addIncoming(Start, L->getLoopPreheader());
2516  Induction->addIncoming(Next, Latch);
2517  // Create the compare.
2518  Value *ICmp = Builder.CreateICmpEQ(Next, End);
2519  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2520 
2521  // Now we have two terminators. Remove the old one from the block.
2522  Latch->getTerminator()->eraseFromParent();
2523 
2524  return Induction;
2525 }
2526 
2527 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2528  if (TripCount)
2529  return TripCount;
2530 
2531  assert(L && "Create Trip Count for null loop.");
2532  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2533  // Find the loop boundaries.
2534  ScalarEvolution *SE = PSE.getSE();
2535  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2536  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2537  "Invalid loop count");
2538 
2539  Type *IdxTy = Legal->getWidestInductionType();
2540  assert(IdxTy && "No type for induction");
2541 
2542  // The exit count might have the type of i64 while the phi is i32. This can
2543  // happen if we have an induction variable that is sign extended before the
2544  // compare. The only way that we get a backedge taken count is that the
2545  // induction variable was signed and as such will not overflow. In such a case
2546  // truncation is legal.
2547  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2548  IdxTy->getPrimitiveSizeInBits())
2549  BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2550  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2551 
2552  // Get the total trip count from the count by adding 1.
2553  const SCEV *ExitCount = SE->getAddExpr(
2554  BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2555 
2556  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2557 
2558  // Expand the trip count and place the new instructions in the preheader.
2559  // Notice that the pre-header does not change, only the loop body.
2560  SCEVExpander Exp(*SE, DL, "induction");
2561 
2562  // Count holds the overall loop count (N).
2563  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2564  L->getLoopPreheader()->getTerminator());
2565 
2566  if (TripCount->getType()->isPointerTy())
2567  TripCount =
2568  CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2569  L->getLoopPreheader()->getTerminator());
2570 
2571  return TripCount;
2572 }
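// Editor's note (illustrative): for a canonical loop "for (i = 0; i < n; ++i)" the
// backedge-taken count is n - 1, so the trip count expanded here is (n - 1) + 1,
// i.e. n, possibly truncated or zero-extended to the widest induction type first.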
2573 
2574 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2575  if (VectorTripCount)
2576  return VectorTripCount;
2577 
2578  Value *TC = getOrCreateTripCount(L);
2579  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2580 
2581  Type *Ty = TC->getType();
2582  Constant *Step = ConstantInt::get(Ty, VF * UF);
2583 
2584  // If the tail is to be folded by masking, round the number of iterations N
2585  // up to a multiple of Step instead of rounding down. This is done by first
2586  // adding Step-1 and then rounding down. Note that it's ok if this addition
2587  // overflows: the vector induction variable will eventually wrap to zero given
2588  // that it starts at zero and its Step is a power of two; the loop will then
2589  // exit, with the last early-exit vector comparison also producing all-true.
2590  if (Cost->foldTailByMasking()) {
2591  assert(isPowerOf2_32(VF * UF) &&
2592  "VF*UF must be a power of 2 when folding tail by masking");
2593  TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2594  }
2595 
2596  // Now we need to generate the expression for the part of the loop that the
2597  // vectorized body will execute. This is equal to N - (N % Step) if scalar
2598  // iterations are not required for correctness, or N - Step, otherwise. Step
2599  // is equal to the vectorization factor (number of SIMD elements) times the
2600  // unroll factor (number of SIMD instructions).
2601  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2602 
2603  // If there is a non-reversed interleaved group that may speculatively access
2604  // memory out-of-bounds, we need to ensure that there will be at least one
2605  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2606  // the trip count, we set the remainder to be equal to the step. If the step
2607  // does not evenly divide the trip count, no adjustment is necessary since
2608  // there will already be scalar iterations. Note that the minimum iterations
2609  // check ensures that N >= Step.
2610  if (VF > 1 && Cost->requiresScalarEpilogue()) {
2611  auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2612  R = Builder.CreateSelect(IsZero, Step, R);
2613  }
2614 
2615  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2616 
2617  return VectorTripCount;
2618 }
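// Worked example (editor's sketch): with VF = 4 and UF = 2 (Step = 8) and a trip
// count of 10, n.mod.vf = 10 urem 8 = 2 and n.vec = 8, leaving two scalar
// iterations for the epilogue. If the tail is folded by masking instead, the count
// is first rounded up to 10 + 7 = 17, so n.mod.vf = 1 and n.vec = 16, and the
// masked vector loop covers all ten iterations.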
2619 
2620 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2621  const DataLayout &DL) {
2622  // Verify that V is a vector type with same number of elements as DstVTy.
2623  unsigned VF = DstVTy->getNumElements();
2624  VectorType *SrcVecTy = cast<VectorType>(V->getType());
2625  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2626  Type *SrcElemTy = SrcVecTy->getElementType();
2627  Type *DstElemTy = DstVTy->getElementType();
2628  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2629  "Vector elements must have same size");
2630 
2631  // Do a direct cast if element types are castable.
2632  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2633  return Builder.CreateBitOrPointerCast(V, DstVTy);
2634  }
2635  // V cannot be directly casted to desired vector type.
2636  // May happen when V is a floating point vector but DstVTy is a vector of
2637  // pointers or vice-versa. Handle this using a two-step bitcast using an
2638  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2639  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2640  "Only one type should be a pointer type");
2641  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2642  "Only one type should be a floating point type");
2643  Type *IntTy =
2644  IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2645  VectorType *VecIntTy = VectorType::get(IntTy, VF);
2646  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2647  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2648 }
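// Editor's illustration (assumes a target with 32-bit pointers so element sizes
// match): casting <4 x float> to <4 x i8*> is performed as
// <4 x float> -> <4 x i32> -> <4 x i8*>, since a single bitcast between FP and
// pointer vector types is not legal.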
2649 
2650 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2651  BasicBlock *Bypass) {
2652  Value *Count = getOrCreateTripCount(L);
2653  BasicBlock *BB = L->getLoopPreheader();
2654  IRBuilder<> Builder(BB->getTerminator());
2655 
2656  // Generate code to check if the loop's trip count is less than VF * UF, or
2657  // equal to it in case a scalar epilogue is required; this implies that the
2658  // vector trip count is zero. This check also covers the case where adding one
2659  // to the backedge-taken count overflowed leading to an incorrect trip count
2660  // of zero. In this case we will also jump to the scalar loop.
2661  auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2662  : ICmpInst::ICMP_ULT;
2663 
2664  // If tail is to be folded, vector loop takes care of all iterations.
2665  Value *CheckMinIters = Builder.getFalse();
2666  if (!Cost->foldTailByMasking())
2667  CheckMinIters = Builder.CreateICmp(
2668  P, Count, ConstantInt::get(Count->getType(), VF * UF),
2669  "min.iters.check");
2670 
2671  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2672  // Update dominator tree immediately if the generated block is a
2673  // LoopBypassBlock because SCEV expansions to generate loop bypass
2674  // checks may query it before the current function is finished.
2675  DT->addNewBlock(NewBB, BB);
2676  if (L->getParentLoop())
2677  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2678  ReplaceInstWithInst(BB->getTerminator(),
2679  BranchInst::Create(Bypass, NewBB, CheckMinIters));
2680  LoopBypassBlocks.push_back(BB);
2681 }
2682 
2683 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2684  BasicBlock *BB = L->getLoopPreheader();
2685 
2686  // Generate the code to check that the SCEV assumptions we made hold.
2687  // We want the new basic block to start at the first instruction in a
2688  // sequence of instructions that form a check.
2689  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2690  "scev.check");
2691  Value *SCEVCheck =
2692  Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2693 
2694  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2695  if (C->isZero())
2696  return;
2697 
2698  assert(!BB->getParent()->hasOptSize() &&
2699  "Cannot SCEV check stride or overflow when optimizing for size");
2700 
2701  // Create a new block containing the stride check.
2702  BB->setName("vector.scevcheck");
2703  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2704  // Update dominator tree immediately if the generated block is a
2705  // LoopBypassBlock because SCEV expansions to generate loop bypass
2706  // checks may query it before the current function is finished.
2707  DT->addNewBlock(NewBB, BB);
2708  if (L->getParentLoop())
2709  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2710  ReplaceInstWithInst(BB->getTerminator(),
2711  BranchInst::Create(Bypass, NewBB, SCEVCheck));
2712  LoopBypassBlocks.push_back(BB);
2713  AddedSafetyChecks = true;
2714 }
2715 
2716 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2717  // VPlan-native path does not do any analysis for runtime checks currently.
2718  if (EnableVPlanNativePath)
2719  return;
2720 
2721  BasicBlock *BB = L->getLoopPreheader();
2722 
2723  // Generate the code that checks in runtime if arrays overlap. We put the
2724  // checks into a separate block to make the more common case of few elements
2725  // faster.
2726  Instruction *FirstCheckInst;
2727  Instruction *MemRuntimeCheck;
2728  std::tie(FirstCheckInst, MemRuntimeCheck) =
2729  Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2730  if (!MemRuntimeCheck)
2731  return;
2732 
2733  assert(!BB->getParent()->hasOptSize() &&
2734  "Cannot emit memory checks when optimizing for size");
2735 
2736  // Create a new block containing the memory check.
2737  BB->setName("vector.memcheck");
2738  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2739  // Update dominator tree immediately if the generated block is a
2740  // LoopBypassBlock because SCEV expansions to generate loop bypass
2741  // checks may query it before the current function is finished.
2742  DT->addNewBlock(NewBB, BB);
2743  if (L->getParentLoop())
2744  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2745  ReplaceInstWithInst(BB->getTerminator(),
2746  BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2747  LoopBypassBlocks.push_back(BB);
2748  AddedSafetyChecks = true;
2749 
2750  // We currently don't use LoopVersioning for the actual loop cloning but we
2751  // still use it to add the noalias metadata.
2752  LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2753  PSE.getSE());
2754  LVer->prepareNoAliasMetadata();
2755 }
2756 
2757 Value *InnerLoopVectorizer::emitTransformedIndex(
2758  IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2759  const InductionDescriptor &ID) const {
2760 
2761  SCEVExpander Exp(*SE, DL, "induction");
2762  auto Step = ID.getStep();
2763  auto StartValue = ID.getStartValue();
2764  assert(Index->getType() == Step->getType() &&
2765  "Index type does not match StepValue type");
2766 
2767  // Note: the IR at this point is broken. We cannot use SE to create any new
2768  // SCEV and then expand it, hoping that SCEV's simplification will give us
2769  // more optimal code. Unfortunately, attempting to do so on invalid IR may
2770  // lead to various SCEV crashes. So all we can do is to use builder and rely
2771  // on InstCombine for future simplifications. Here we handle some trivial
2772  // cases only.
2773  auto CreateAdd = [&B](Value *X, Value *Y) {
2774  assert(X->getType() == Y->getType() && "Types don't match!");
2775  if (auto *CX = dyn_cast<ConstantInt>(X))
2776  if (CX->isZero())
2777  return Y;
2778  if (auto *CY = dyn_cast<ConstantInt>(Y))
2779  if (CY->isZero())
2780  return X;
2781  return B.CreateAdd(X, Y);
2782  };
2783 
2784  auto CreateMul = [&B](Value *X, Value *Y) {
2785  assert(X->getType() == Y->getType() && "Types don't match!");
2786  if (auto *CX = dyn_cast<ConstantInt>(X))
2787  if (CX->isOne())
2788  return Y;
2789  if (auto *CY = dyn_cast<ConstantInt>(Y))
2790  if (CY->isOne())
2791  return X;
2792  return B.CreateMul(X, Y);
2793  };
2794 
2795  switch (ID.getKind()) {
2796  case InductionDescriptor::IK_IntInduction: {
2797  assert(Index->getType() == StartValue->getType() &&
2798  "Index type does not match StartValue type");
2799  if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2800  return B.CreateSub(StartValue, Index);
2801  auto *Offset = CreateMul(
2802  Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2803  return CreateAdd(StartValue, Offset);
2804  }
2805  case InductionDescriptor::IK_PtrInduction: {
2806  assert(isa<SCEVConstant>(Step) &&
2807  "Expected constant step for pointer induction");
2808  return B.CreateGEP(
2809  StartValue->getType()->getPointerElementType(), StartValue,
2810  CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2811  &*B.GetInsertPoint())));
2812  }
2813  case InductionDescriptor::IK_FpInduction: {
2814  assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2815  auto InductionBinOp = ID.getInductionBinOp();
2816  assert(InductionBinOp &&
2817  (InductionBinOp->getOpcode() == Instruction::FAdd ||
2818  InductionBinOp->getOpcode() == Instruction::FSub) &&
2819  "Original bin op should be defined for FP induction");
2820 
2821  Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2822 
2823  // Floating point operations had to be 'fast' to enable the induction.
2824  FastMathFlags Flags;
2825  Flags.setFast();
2826 
2827  Value *MulExp = B.CreateFMul(StepValue, Index);
2828  if (isa<Instruction>(MulExp))
2829  // We have to check, the MulExp may be a constant.
2830  cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2831 
2832  Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2833  "induction");
2834  if (isa<Instruction>(BOp))
2835  cast<Instruction>(BOp)->setFastMathFlags(Flags);
2836 
2837  return BOp;
2838  }
2839  case InductionDescriptor::IK_NoInduction:
2840  return nullptr;
2841  }
2842  llvm_unreachable("invalid enum");
2843 }
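// Worked example (editor's sketch): for an integer induction with start value 7
// and step 3, transforming Index = 5 yields 7 + 5 * 3 = 22; for a pointer
// induction the same Index instead produces a GEP of StartValue by 5 * 3 elements.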
2844 
2845 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2846  /*
2847  In this function we generate a new loop. The new loop will contain
2848  the vectorized instructions while the old loop will continue to run the
2849  scalar remainder.
2850 
2851  [ ] <-- loop iteration number check.
2852  / |
2853  / v
2854  | [ ] <-- vector loop bypass (may consist of multiple blocks).
2855  | / |
2856  | / v
2857  || [ ] <-- vector pre header.
2858  |/ |
2859  | v
2860  | [ ] \
2861  | [ ]_| <-- vector loop.
2862  | |
2863  | v
2864  | -[ ] <--- middle-block.
2865  | / |
2866  | / v
2867  -|- >[ ] <--- new preheader.
2868  | |
2869  | v
2870  | [ ] \
2871  | [ ]_| <-- old scalar loop to handle remainder.
2872  \ |
2873  \ v
2874  >[ ] <-- exit block.
2875  ...
2876  */
2877 
2878  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2879  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2880  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2881  MDNode *OrigLoopID = OrigLoop->getLoopID();
2882  assert(VectorPH && "Invalid loop structure");
2883  assert(ExitBlock && "Must have an exit block");
2884 
2885  // Some loops have a single integer induction variable, while other loops
2886  // don't. One example is C++ iterators that often have multiple pointer
2887  // induction variables. In the code below we also support a case where we
2888  // don't have a single induction variable.
2889  //
2890  // We try to obtain an induction variable from the original loop as hard
2891  // as possible. However if we don't find one that:
2892  // - is an integer
2893  // - counts from zero, stepping by one
2894  // - is the size of the widest induction variable type
2895  // then we create a new one.
2896  OldInduction = Legal->getPrimaryInduction();
2897  Type *IdxTy = Legal->getWidestInductionType();
2898 
2899  // Split the single block loop into the two loop structure described above.
2900  BasicBlock *VecBody =
2901  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2902  BasicBlock *MiddleBlock =
2903  VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2904  BasicBlock *ScalarPH =
2905  MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2906 
2907  // Create and register the new vector loop.
2908  Loop *Lp = LI->AllocateLoop();
2909  Loop *ParentLoop = OrigLoop->getParentLoop();
2910 
2911  // Insert the new loop into the loop nest and register the new basic blocks
2912  // before calling any utilities such as SCEV that require valid LoopInfo.
2913  if (ParentLoop) {
2914  ParentLoop->addChildLoop(Lp);
2915  ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2916  ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2917  } else {
2918  LI->addTopLevelLoop(Lp);
2919  }
2920  Lp->addBasicBlockToLoop(VecBody, *LI);
2921 
2922  // Find the loop boundaries.
2923  Value *Count = getOrCreateTripCount(Lp);
2924 
2925  Value *StartIdx = ConstantInt::get(IdxTy, 0);
2926 
2927  // Now, compare the new count to zero. If it is zero, skip the vector loop
2928  // and jump to the scalar loop. This check also covers the case where the
2929  // backedge-taken count is uint##_max: adding one to it will overflow,
2930  // leading to an incorrect trip count of zero. In this (rare) case we will
2931  // also jump to the scalar loop.
2932  emitMinimumIterationCountCheck(Lp, ScalarPH);
2933 
2934  // Generate the code to check any assumptions that we've made for SCEV
2935  // expressions.
2936  emitSCEVChecks(Lp, ScalarPH);
2937 
2938  // Generate the code that checks at runtime whether arrays overlap. We put
2939  // the checks into a separate block to make the more common case of few
2940  // elements faster.
2941  emitMemRuntimeChecks(Lp, ScalarPH);
2942 
2943  // Generate the induction variable.
2944  // The loop step is equal to the vectorization factor (num of SIMD elements)
2945  // times the unroll factor (num of SIMD instructions).
2946  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2947  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2948  Induction =
2949  createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2951 
2952  // We are going to resume the execution of the scalar loop.
2953  // Go over all of the induction variables that we found and fix the
2954  // PHIs that are left in the scalar version of the loop.
2955  // The starting values of PHI nodes depend on the counter of the last
2956  // iteration in the vectorized loop.
2957  // If we come from a bypass edge then we need to start from the original
2958  // start value.
2959 
2960  // This variable saves the new starting index for the scalar loop. It is used
2961  // to test if there are any tail iterations left once the vector loop has
2962  // completed.
2964  for (auto &InductionEntry : *List) {
2965  PHINode *OrigPhi = InductionEntry.first;
2966  InductionDescriptor II = InductionEntry.second;
2967 
2968  // Create phi nodes to merge from the backedge-taken check block.
2969  PHINode *BCResumeVal = PHINode::Create(
2970  OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
2971  // Copy original phi DL over to the new one.
2972  BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2973  Value *&EndValue = IVEndValues[OrigPhi];
2974  if (OrigPhi == OldInduction) {
2975  // We know what the end value is.
2976  EndValue = CountRoundDown;
2977  } else {
2979  Type *StepType = II.getStep()->getType();
2980  Instruction::CastOps CastOp =
2981  CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
2982  Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
2983  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2984  EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
2985  EndValue->setName("ind.end");
2986  }
2987 
2988  // The new PHI merges the original incoming value, in case of a bypass,
2989  // or the value at the end of the vectorized loop.
2990  BCResumeVal->addIncoming(EndValue, MiddleBlock);
2991 
2992  // Fix the scalar body counter (PHI node).
2993  // The old induction's phi node in the scalar body needs the truncated
2994  // value.
2995  for (BasicBlock *BB : LoopBypassBlocks)
2996  BCResumeVal->addIncoming(II.getStartValue(), BB);
2997  OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
2998  }
2999 
3000  // We need the OrigLoop (scalar loop part) latch terminator to help
3001  // produce correct debug info for the middle block BB instructions.
3002  // The legality check stage guarantees that the loop will have a single
3003  // latch.
3004  assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3005  "Scalar loop latch terminator isn't a branch");
3006  BranchInst *ScalarLatchBr =
3007  cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3008 
3009  // Add a check in the middle block to see if we have completed
3010  // all of the iterations in the first vector loop.
3011  // If (N - N%VF) == N, then we *don't* need to run the remainder.
3012  // If tail is to be folded, we know we don't need to run the remainder.
3013  Value *CmpN = Builder.getTrue();
3014  if (!Cost->foldTailByMasking()) {
3015  CmpN =
3016  CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3017  CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3018 
3019  // Here we use the same DebugLoc as the scalar loop latch branch instead
3020  // of the corresponding compare because they may have ended up with
3021  // different line numbers and we want to avoid awkward line stepping while
3022  // debugging. E.g., if the compare got a line number inside the loop.
3023  cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3024  }
3025 
3026  BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
3027  BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3028  ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
3029 
3030  // Get ready to start creating new instructions into the vectorized body.
3032 
3033  // Save the state.
3035  LoopScalarPreHeader = ScalarPH;
3036  LoopMiddleBlock = MiddleBlock;
3037  LoopExitBlock = ExitBlock;
3038  LoopVectorBody = VecBody;
3039  LoopScalarBody = OldBasicBlock;
3040 
3041  Optional<MDNode *> VectorizedLoopID =
3043  LLVMLoopVectorizeFollowupVectorized});
3044  if (VectorizedLoopID.hasValue()) {
3045  Lp->setLoopID(VectorizedLoopID.getValue());
3046 
3047  // Do not setAlreadyVectorized if loop attributes have been defined
3048  // explicitly.
3049  return LoopVectorPreHeader;
3050  }
3051 
3052  // Keep all loop hints from the original loop on the vector loop (we'll
3053  // replace the vectorizer-specific hints below).
3054  if (MDNode *LID = OrigLoop->getLoopID())
3055  Lp->setLoopID(LID);
3056 
3057  LoopVectorizeHints Hints(Lp, true, *ORE);
3058  Hints.setAlreadyVectorized();
3059 
3060  return LoopVectorPreHeader;
3061 }
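
// Purely illustrative C-level sketch (not code the pass emits) of the skeleton
// assembled above, for a hypothetical loop over A[0..N) with VF = 4 and UF = 2,
// i.e. a vector step of 8. The names and numbers are made up for exposition.
static void skeletonShapeSketch(int *A, int N) {
  int VecTripCount = N - (N % 8);         // CountRoundDown
  int I = 0;
  if (VecTripCount != 0) {                // minimum-iteration-count check
    for (; I < VecTripCount; I += 8)      // vector.body: 8 elements per iteration
      for (int L = 0; L < 8; ++L)         // stands in for the widened body
        A[I + L] += 1;
  }
  // middle.block: if (N - N % 8) == N, the remainder below runs zero times.
  for (; I < N; ++I)                      // old scalar loop handling the remainder
    A[I] += 1;
}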
3062 
3063 // Fix up external users of the induction variable. At this point, we are
3064 // in LCSSA form, with all external PHIs that use the IV having one input value,
3065 // coming from the remainder loop. We need those PHIs to also have a correct
3066 // value for the IV when arriving directly from the middle block.
3068  const InductionDescriptor &II,
3069  Value *CountRoundDown, Value *EndValue,
3070  BasicBlock *MiddleBlock) {
3071  // There are two kinds of external IV usages - those that use the value
3072  // computed in the last iteration (the PHI) and those that use the penultimate
3073  // value (the value that feeds into the phi from the loop latch).
3074  // We allow both, but they, obviously, have different values.
3075 
3076  assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3077 
3078  DenseMap<Value *, Value *> MissingVals;
3079 
3080  // An external user of the last iteration's value should see the value that
3081  // the remainder loop uses to initialize its own IV.
3083  for (User *U : PostInc->users()) {
3084  Instruction *UI = cast<Instruction>(U);
3085  if (!OrigLoop->contains(UI)) {
3086  assert(isa<PHINode>(UI) && "Expected LCSSA form");
3087  MissingVals[UI] = EndValue;
3088  }
3089  }
3090 
3091  // An external user of the penultimate value needs to see EndValue - Step.
3092  // The simplest way to get this is to recompute it from the constituent SCEVs,
3093  // that is, Start + (Step * (CRD - 1)).
3094  for (User *U : OrigPhi->users()) {
3095  auto *UI = cast<Instruction>(U);
3096  if (!OrigLoop->contains(UI)) {
3097  const DataLayout &DL =
3099  assert(isa<PHINode>(UI) && "Expected LCSSA form");
3100 
3101  IRBuilder<> B(MiddleBlock->getTerminator());
3102  Value *CountMinusOne = B.CreateSub(
3103  CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3104  Value *CMO =
3105  !II.getStep()->getType()->isIntegerTy()
3106  ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3107  II.getStep()->getType())
3108  : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3109  CMO->setName("cast.cmo");
3110  Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3111  Escape->setName("ind.escape");
3112  MissingVals[UI] = Escape;
3113  }
3114  }
3115 
3116  for (auto &I : MissingVals) {
3117  PHINode *PHI = cast<PHINode>(I.first);
3118  // One corner case we have to handle is two IVs "chasing" each other,
3119  // that is, %IV2 = phi [...], [ %IV1, %latch ]
3120  // In this case, if IV1 has an external use, we need to avoid adding both
3121  // "last value of IV1" and "penultimate value of IV2". So, verify that we
3122  // don't already have an incoming value for the middle block.
3123  if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3124  PHI->addIncoming(I.second, MiddleBlock);
3125  }
3126 }
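
// Worked example with made-up values for the two external-use cases above:
// an IV with start 0 and step 2, and a vector trip count CountRoundDown = 8.
//   * A user of the last value (the PHI) sees EndValue = 0 + 2 * 8 = 16,
//     i.e. the value the remainder loop starts from.
//   * A user of the penultimate value sees the escape value
//     Start + Step * (CRD - 1) = 0 + 2 * (8 - 1) = 14.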
3127 
3128 namespace {
3129 
3130 struct CSEDenseMapInfo {
3131  static bool canHandle(const Instruction *I) {
3132  return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3133  isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3134  }
3135 
3136  static inline Instruction *getEmptyKey() {
3138  }
3139 
3140  static inline Instruction *getTombstoneKey() {
3142  }
3143 
3144  static unsigned getHashValue(const Instruction *I) {
3145  assert(canHandle(I) && "Unknown instruction!");
3147  I->value_op_end()));
3148  }
3149 
3150  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3151  if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3152  LHS == getTombstoneKey() || RHS == getTombstoneKey())
3153  return LHS == RHS;
3154  return LHS->isIdenticalTo(RHS);
3155  }
3156 };
3157 
3158 } // end anonymous namespace
3159 
3160 /// Perform CSE of induction variable instructions.
3161 static void cse(BasicBlock *BB) {
3162  // Perform simple CSE.
3164  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3165  Instruction *In = &*I++;
3166 
3167  if (!CSEDenseMapInfo::canHandle(In))
3168  continue;
3169 
3170  // Check if we can replace this instruction with any of the
3171  // visited instructions.
3172  if (Instruction *V = CSEMap.lookup(In)) {
3173  In->replaceAllUsesWith(V);
3174  In->eraseFromParent();
3175  continue;
3176  }
3177 
3178  CSEMap[In] = In;
3179  }
3180 }
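
// Hypothetical usage sketch (not a declaration the pass owns) of a map keyed
// on instruction *identity* through CSEDenseMapInfo. Keys must satisfy
// CSEDenseMapInfo::canHandle(); two identical extractelement instructions then
// hash and compare as equal, so the second one can be folded into the first.
static void cseMapUsageSketch(Instruction *First, Instruction *Second) {
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> Map;
  Map[First] = First;                               // remember the first occurrence
  if (Instruction *Existing = Map.lookup(Second)) { // identical => cache hit
    Second->replaceAllUsesWith(Existing);
    Second->eraseFromParent();
  }
}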
3181 
3183  unsigned VF,
3184  bool &NeedToScalarize) {
3185  Function *F = CI->getCalledFunction();
3186  StringRef FnName = CI->getCalledFunction()->getName();
3187  Type *ScalarRetTy = CI->getType();
3188  SmallVector<Type *, 4> Tys, ScalarTys;
3189  for (auto &ArgOp : CI->arg_operands())
3190  ScalarTys.push_back(ArgOp->getType());
3191 
3192  // Estimate cost of scalarized vector call. The source operands are assumed
3193  // to be vectors, so we need to extract individual elements from there,
3194  // execute VF scalar calls, and then gather the result into the vector return
3195  // value.
3196  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3197  if (VF == 1)
3198  return ScalarCallCost;
3199 
3200  // Compute corresponding vector type for return value and arguments.
3201  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3202  for (Type *ScalarTy : ScalarTys)
3203  Tys.push_back(ToVectorTy(ScalarTy, VF));
3204 
3205  // Compute costs of unpacking argument values for the scalar calls and
3206  // packing the return values to a vector.
3207  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3208 
3209  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3210 
3211  // If we can't emit a vector call for this function, then the currently found
3212  // cost is the cost we need to return.
3213  NeedToScalarize = true;
3214  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3215  return Cost;
3216 
3217  // If the corresponding vector cost is cheaper, return its cost.
3218  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3219  if (VectorCallCost < Cost) {
3220  NeedToScalarize = false;
3221  return VectorCallCost;
3222  }
3223  return Cost;
3224 }
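
// Worked example with made-up numbers for the comparison above: with VF = 4,
// a ScalarCallCost of 10 and a ScalarizationCost of 12, the scalarized
// estimate is 10 * 4 + 12 = 52. If TLI provides a vector variant whose
// VectorCallCost is, say, 20, the vector call wins (20 < 52) and
// NeedToScalarize is cleared; otherwise the scalarized cost of 52 stands.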
3225 
3227  unsigned VF) {
3229  assert(ID && "Expected intrinsic call!");
3230 
3231  FastMathFlags FMF;
3232  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3233  FMF = FPMO->getFastMathFlags();
3234 
3236  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3237 }
3238 
3240  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3241  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3242  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3243 }
3245  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3246  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3247  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3248 }
3249 
3251  // For every instruction `I` in MinBWs, truncate the operands, create a
3252  // truncated version of `I` and reextend its result. InstCombine runs
3253  // later and will remove any ext/trunc pairs.
3254  SmallPtrSet<Value *, 4> Erased;
3255  for (const auto &KV : Cost->getMinimalBitwidths()) {
3256  // If the value wasn't vectorized, we must maintain the original scalar
3257  // type. The absence of the value from VectorLoopValueMap indicates that it
3258  // wasn't vectorized.
3259  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3260  continue;
3261  for (unsigned Part = 0; Part < UF; ++Part) {
3262  Value *I = getOrCreateVectorValue(KV.first, Part);
3263  if (Erased.find(I) != Erased.end() || I->use_empty() ||
3264  !isa<Instruction>(I))
3265  continue;
3266  Type *OriginalTy = I->getType();
3267  Type *ScalarTruncatedTy =
3268  IntegerType::get(OriginalTy->getContext(), KV.second);
3269  Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3270  OriginalTy->getVectorNumElements());
3271  if (TruncatedTy == OriginalTy)
3272  continue;
3273 
3274  IRBuilder<> B(cast<Instruction>(I));
3275  auto ShrinkOperand = [&](Value *V) -> Value * {
3276  if (auto *ZI = dyn_cast<ZExtInst>(V))
3277  if (ZI->getSrcTy() == TruncatedTy)
3278  return ZI->getOperand(0);
3279  return B.CreateZExtOrTrunc(V, TruncatedTy);
3280  };
3281 
3282  // The actual instruction modification depends on the instruction type,
3283  // unfortunately.
3284  Value *NewI = nullptr;
3285  if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3286  NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3287  ShrinkOperand(BO->getOperand(1)));
3288 
3289  // Any wrapping introduced by shrinking this operation shouldn't be
3290  // considered undefined behavior. So, we can't unconditionally copy
3291  // arithmetic wrapping flags to NewI.
3292  cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3293  } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3294  NewI =
3295  B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3296  ShrinkOperand(CI->getOperand(1)));
3297  } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3298  NewI = B.CreateSelect(SI->getCondition(),
3299  ShrinkOperand(SI->getTrueValue()),
3300  ShrinkOperand(SI->getFalseValue()));
3301  } else if (auto *CI = dyn_cast<CastInst>(I)) {
3302  switch (CI->getOpcode()) {
3303  default:
3304  llvm_unreachable("Unhandled cast!");
3305  case Instruction::Trunc:
3306  NewI = ShrinkOperand(CI->getOperand(0));
3307  break;
3308  case Instruction::SExt:
3309  NewI = B.CreateSExtOrTrunc(
3310  CI->getOperand(0),
3311  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3312  break;
3313  case Instruction::ZExt:
3314  NewI = B.CreateZExtOrTrunc(
3315  CI->getOperand(0),
3316  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3317  break;
3318  }
3319  } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3320  auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3321  auto *O0 = B.CreateZExtOrTrunc(
3322  SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3323  auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3324  auto *O1 = B.CreateZExtOrTrunc(
3325  SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3326 
3327  NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3328  } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3329  // Don't do anything with the operands, just extend the result.
3330  continue;
3331  } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3332  auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3333  auto *O0 = B.CreateZExtOrTrunc(
3334  IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3335  auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3336  NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3337  } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3338  auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3339  auto *O0 = B.CreateZExtOrTrunc(
3340  EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3341  NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3342  } else {
3343  // If we don't know what to do, be conservative and don't do anything.
3344  continue;
3345  }
3346 
3347  // Lastly, extend the result.
3348  NewI->takeName(cast<Instruction>(I));
3349  Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3350  I->replaceAllUsesWith(Res);
3351  cast<Instruction>(I)->eraseFromParent();
3352  Erased.insert(I);
3353  VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3354  }
3355  }
3356 
3357  // We'll have created a bunch of ZExts that are now parentless. Clean up.
3358  for (const auto &KV : Cost->getMinimalBitwidths()) {
3359  // If the value wasn't vectorized, we must maintain the original scalar
3360  // type. The absence of the value from VectorLoopValueMap indicates that it
3361  // wasn't vectorized.
3362  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3363  continue;
3364  for (unsigned Part = 0; Part < UF; ++Part) {
3365  Value *I = getOrCreateVectorValue(KV.first, Part);
3366  ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3367  if (Inst && Inst->use_empty()) {
3368  Value *NewI = Inst->getOperand(0);
3369  Inst->eraseFromParent();
3370  VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3371  }
3372  }
3373  }
3374 }
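
// Shorthand illustration (hypothetical IR, VF = 4) of the rewrite above when
// the cost model has proven an i32 add needs only 8 bits:
//
//   before:  %a = add <4 x i32> %x, %y
//
//   after:   %xt = trunc <4 x i32> %x to <4 x i8>
//            %yt = trunc <4 x i32> %y to <4 x i8>
//            %at = add <4 x i8> %xt, %yt
//            %a  = zext <4 x i8> %at to <4 x i32>
//
// InstCombine later removes whichever trunc/ext pairs turn out to be redundant.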
3375 
3377  // Insert truncates and extends for any truncated instructions as hints to
3378  // InstCombine.
3379  if (VF > 1)
3381 
3382  // Fix widened non-induction PHIs by setting up the PHI operands.
3383  if (OrigPHIsToFix.size()) {
3385  "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3387  }
3388 
3389  // At this point every instruction in the original loop is widened to a
3390  // vector form. Now we need to fix the recurrences in the loop. These PHI
3391  // nodes are currently empty because we did not want to introduce cycles.
3392  // This is the second stage of vectorizing recurrences.
3394 
3395  // Update the dominator tree.
3396  //
3397  // FIXME: After creating the structure of the new loop, the dominator tree is
3398  // no longer up-to-date, and it remains that way until we update it
3399  // here. An out-of-date dominator tree is problematic for SCEV,
3400  // because SCEVExpander uses it to guide code generation. The
3401  // vectorizer uses SCEVExpanders in several places. Instead, we should
3402  // keep the dominator tree up-to-date as we go.
3403  updateAnalysis();
3404 
3405  // Fix-up external users of the induction variables.
3406  for (auto &Entry : *Legal->getInductionVars())
3407  fixupIVUsers(Entry.first, Entry.second,
3410 
3411  fixLCSSAPHIs();
3413  sinkScalarOperands(&*PI);
3414 
3415  // Remove redundant induction instructions.
3417 }
3418 
3420  // In order to support recurrences we need to be able to vectorize Phi nodes.
3421  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3422  // stage #2: We now need to fix the recurrences by adding incoming edges to
3423  // the currently empty PHI nodes. At this point every instruction in the
3424  // original loop is widened to a vector form so we can use them to construct
3425  // the incoming edges.
3426  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3427  // Handle first-order recurrences and reductions that need to be fixed.
3428  if (Legal->isFirstOrderRecurrence(&Phi))
3430  else if (Legal->isReductionVariable(&Phi))
3431  fixReduction(&Phi);
3432  }
3433 }
3434 
3436  // This is the second phase of vectorizing first-order recurrences. An
3437  // overview of the transformation is described below. Suppose we have the
3438  // following loop.
3439  //
3440  // for (int i = 0; i < n; ++i)
3441  // b[i] = a[i] - a[i - 1];
3442  //
3443  // There is a first-order recurrence on "a". For this loop, the shorthand
3444  // scalar IR looks like:
3445  //
3446  // scalar.ph:
3447  // s_init = a[-1]
3448  // br scalar.body
3449  //
3450  // scalar.body:
3451  // i = phi [0, scalar.ph], [i+1, scalar.body]
3452  // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3453  // s2 = a[i]
3454  // b[i] = s2 - s1
3455  // br cond, scalar.body, ...
3456  //
3457  // In this example, s1 is a recurrence because its value depends on the
3458  // previous iteration. In the first phase of vectorization, we created a
3459  // temporary value for s1. We now complete the vectorization and produce the
3460  // shorthand vector IR shown below (for VF = 4, UF = 1).
3461  //
3462  // vector.ph:
3463  // v_init = vector(..., ..., ..., a[-1])
3464  // br vector.body
3465  //
3466  // vector.body
3467  // i = phi [0, vector.ph], [i+4, vector.body]
3468  // v1 = phi [v_init, vector.ph], [v2, vector.body]
3469  // v2 = a[i, i+1, i+2, i+3];
3470  // v3 = vector(v1(3), v2(0, 1, 2))
3471  // b[i, i+1, i+2, i+3] = v2 - v3
3472  // br cond, vector.body, middle.block
3473  //
3474  // middle.block:
3475  // x = v2(3)
3476  // br scalar.ph
3477  //
3478  // scalar.ph:
3479  // s_init = phi [x, middle.block], [a[-1], otherwise]
3480  // br scalar.body
3481  //
3482  // After the vector loop completes execution, we extract the next value of
3483  // the recurrence (x) to use as the initial value in the scalar loop.
3484 
3485  // Get the original loop preheader and single loop latch.
3486  auto *Preheader = OrigLoop->getLoopPreheader();
3487  auto *Latch = OrigLoop->getLoopLatch();
3488 
3489  // Get the initial and previous values of the scalar recurrence.
3490  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3491  auto *Previous = Phi->getIncomingValueForBlock(Latch);
3492 
3493  // Create a vector from the initial value.
3494  auto *VectorInit = ScalarInit;
3495  if (VF > 1) {
3497  VectorInit = Builder.CreateInsertElement(
3498  UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3499  Builder.getInt32(VF - 1), "vector.recur.init");
3500  }
3501 
3502  // We constructed a temporary phi node in the first phase of vectorization.
3503  // This phi node will eventually be deleted.
3505  cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3506 
3507  // Create a phi node for the new recurrence. The current value will either be
3508  // the initial value inserted into a vector or loop-varying vector value.
3509  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3510  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3511 
3512  // Get the vectorized previous value of the last part UF - 1. It appears last
3513  // among all unrolled iterations, due to the order of their construction.
3514  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3515 
3516  // Set the insertion point after the previous value if it is an instruction.
3517  // Note that the previous value may have been constant-folded so it is not
3518  // guaranteed to be an instruction in the vector loop. Also, if the previous
3519  // value is a phi node, we should insert after all the phi nodes to avoid
3520  // breaking basic block verification.
3521  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3522  isa<PHINode>(PreviousLastPart))
3524  else
3526  &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3527 
3528  // We will construct a vector for the recurrence by combining the values for
3529  // the current and previous iterations. This is the required shuffle mask.
3530  SmallVector<Constant *, 8> ShuffleMask(VF);
3531  ShuffleMask[0] = Builder.getInt32(VF - 1);
3532  for (unsigned I = 1; I < VF; ++I)
3533  ShuffleMask[I] = Builder.getInt32(I + VF - 1);
3534 
3535  // The vector from which to take the initial value for the current iteration
3536  // (actual or unrolled). Initially, this is the vector phi node.
3537  Value *Incoming = VecPhi;
3538 
3539  // Shuffle the current and previous vector and update the vector parts.
3540  for (unsigned Part = 0; Part < UF; ++Part) {
3541  Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3542  Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3543  auto *Shuffle =
3544  VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3545  ConstantVector::get(ShuffleMask))
3546  : Incoming;
3547  PhiPart->replaceAllUsesWith(Shuffle);
3548  cast<Instruction>(PhiPart)->eraseFromParent();
3549  VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3550  Incoming = PreviousPart;
3551  }
3552 
3553  // Fix the latch value of the new recurrence in the vector loop.
3554  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3555 
3556  // Extract the last vector element in the middle block. This will be the
3557  // initial value for the recurrence when jumping to the scalar loop.
3558  auto *ExtractForScalar = Incoming;
3559  if (VF > 1) {
3561  ExtractForScalar = Builder.CreateExtractElement(
3562  ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3563  }
3564  // Extract the second-to-last element in the middle block if the
3565  // Phi is used outside the loop. We need to extract the phi itself
3566  // and not the last element (the phi update in the current iteration). This
3567  // will be the value when jumping to the exit block from the LoopMiddleBlock,
3568  // when the scalar loop is not run at all.
3569  Value *ExtractForPhiUsedOutsideLoop = nullptr;
3570  if (VF > 1)
3571  ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3572  Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3573  // When the loop is unrolled without vectorizing, initialize
3574  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
3575  // value of `Incoming`. This is analogous to the vectorized case above:
3576  // extracting the second-to-last element when VF > 1.
3577  else if (UF > 1)
3578  ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3579 
3580  // Fix the initial value of the original recurrence in the scalar loop.
3582  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3583  for (auto *BB : predecessors(LoopScalarPreHeader)) {
3584  auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3585  Start->addIncoming(Incoming, BB);
3586  }
3587 
3589  Phi->setName("scalar.recur");
3590 
3591  // Finally, fix users of the recurrence outside the loop. The users will need
3592  // either the last value of the scalar recurrence or the last value of the
3593  // vector recurrence we extracted in the middle block. Since the loop is in
3594  // LCSSA form, we just need to find all the phi nodes for the original scalar
3595  // recurrence in the exit block, and then add an edge for the middle block.
3596  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3597  if (LCSSAPhi.getIncomingValue(0) == Phi) {
3598  LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3599  }
3600  }
3601 }
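
// Concrete mask for the shuffle constructed above (illustrative, VF = 4):
// ShuffleMask = <3, 4, 5, 6>, i.e. lane VF-1 of `Incoming` followed by lanes
// 0..2 of the current part, realigning the recurrence by one element:
//   Incoming (previous values)  = <s0, s1, s2, s3>
//   PreviousPart (current part) = <s4, s5, s6, s7>
//   Shuffle                     = <s3, s4, s5, s6>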
3602 
3604  Constant *Zero = Builder.getInt32(0);
3605 
3606  // Get its reduction variable descriptor.
3608  "Unable to find the reduction variable");
3609  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3610 
3612  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3613  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3615  RdxDesc.getMinMaxRecurrenceKind();
3616  setDebugLocFromInst(Builder, ReductionStartValue);
3617 
3618  // We need to generate a reduction vector from the incoming scalar.
3619  // To do so, we need to generate the 'identity' vector and override
3620  // one of the elements with the incoming scalar reduction. We need
3621  // to do it in the vector-loop preheader.
3623 
3624  // This is the vector-clone of the value that leaves the loop.
3625  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3626 
3627  // Find the reduction identity variable. Zero for addition, or and xor;
3628  // one for multiplication; -1 for and.
3629  Value *Identity;
3630  Value *VectorStart;
3633  // MinMax reductions have the start value as their identity.
3634  if (VF == 1) {
3635  VectorStart = Identity = ReductionStartValue;
3636  } else {
3637  VectorStart = Identity =
3638  Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3639  }
3640  } else {
3641  // Handle other reduction kinds:
3643  RK, VecTy->getScalarType());
3644  if (VF == 1) {
3645  Identity = Iden;
3646  // This vector is the Identity vector where the first element is the
3647  // incoming scalar reduction.
3648  VectorStart = ReductionStartValue;
3649  } else {
3650  Identity = ConstantVector::getSplat(VF, Iden);
3651 
3652  // This vector is the Identity vector where the first element is the
3653  // incoming scalar reduction.
3654  VectorStart =
3655  Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3656  }
3657  }
3658 
3659  // Fix the vector-loop phi.
3660 
3661  // Reductions do not have to start at zero. They can start with
3662  // any loop invariant values.
3663  BasicBlock *Latch = OrigLoop->getLoopLatch();
3664  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3665  for (unsigned Part = 0; Part < UF; ++Part) {
3666  Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3667  Value *Val = getOrCreateVectorValue(LoopVal, Part);
3668  // Make sure to add the reduction start value only to the
3669  // first unroll part.
3670  Value *StartVal = (Part == 0) ? VectorStart : Identity;
3671  cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3672  cast<PHINode>(VecRdxPhi)
3673  ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3674  }
3675 
3676  // Before each round, move the insertion point right between
3677  // the PHIs and the values we are going to write.
3678  // This allows us to write both PHINodes and the extractelement
3679  // instructions.
3681 
3682  setDebugLocFromInst(Builder, LoopExitInst);
3683 
3684  // If tail is folded by masking, the vector value to leave the loop should be
3685  // a select choosing between the vectorized LoopExitInst and the vectorized
3686  // Phi, rather than the LoopExitInst itself.
3687  if (Cost->foldTailByMasking()) {
3688  for (unsigned Part = 0; Part < UF; ++Part) {
3689  Value *VecLoopExitInst =
3690  VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3691  Value *Sel = nullptr;
3692  for (User *U : VecLoopExitInst->users()) {
3693  if (isa<SelectInst>(U)) {
3694  assert(!Sel && "Reduction exit feeding two selects");
3695  Sel = U;
3696  } else
3697  assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3698  }
3699  assert(Sel && "Reduction exit feeds no select");
3700  VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3701  }
3702  }
3703 
3704  // If the vector reduction can be performed in a smaller type, we truncate
3705  // then extend the loop exit value to enable InstCombine to evaluate the
3706  // entire expression in the smaller type.
3707  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3708  Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3711  VectorParts RdxParts(UF);
3712  for (unsigned Part = 0; Part < UF; ++Part) {
3713  RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3714  Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3715  Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3716  : Builder.CreateZExt(Trunc, VecTy);
3717  for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3718  UI != RdxParts[Part]->user_end();)
3719  if (*UI != Trunc) {
3720  (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3721  RdxParts[Part] = Extnd;
3722  } else {
3723  ++UI;
3724  }
3725  }
3727  for (unsigned Part = 0; Part < UF; ++Part) {
3728  RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3729  VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3730  }
3731  }
3732 
3733  // Reduce all of the unrolled parts into a single vector.
3734  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3736 
3737  // The middle block terminator has already been assigned a DebugLoc here (the
3738  // OrigLoop's single latch terminator). We want the whole middle block to
3739  // appear to execute on this line because: (a) it is all compiler generated,
3740  // (b) these instructions are always executed after evaluating the latch
3741  // conditional branch, and (c) other passes may add new predecessors which
3742  // terminate on this line. This is the easiest way to ensure we don't
3743  // accidentally cause an extra step back into the loop while debugging.
3745  for (unsigned Part = 1; Part < UF; ++Part) {
3746  Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3747  if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3748  // Floating point operations had to be 'fast' to enable the reduction.
3749  ReducedPartRdx = addFastMathFlag(
3751  ReducedPartRdx, "bin.rdx"),
3752  RdxDesc.getFastMathFlags());
3753  else
3754  ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3755  RdxPart);
3756  }
3757 
3758  if (VF > 1) {
3759  bool NoNaN = Legal->hasFunNoNaNAttr();
3760  ReducedPartRdx =
3761  createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3762  // If the reduction can be performed in a smaller type, we need to extend
3763  // the reduction to the wider type before we branch to the original loop.
3764  if (Phi->getType() != RdxDesc.getRecurrenceType())
3765  ReducedPartRdx =
3766  RdxDesc.isSigned()
3767  ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3768  : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3769  }
3770 
3771  // Create a phi node that merges control-flow from the backedge-taken check
3772  // block and the middle block.
3773  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3775  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3776  BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3777  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3778 
3779  // Now, we need to fix the users of the reduction variable
3780  // inside and outside of the scalar remainder loop.
3781  // We know that the loop is in LCSSA form. We need to update the
3782  // PHI nodes in the exit blocks.
3783  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3784  // All PHINodes need to have a single entry edge, or two if
3785  // we already fixed them.
3786  assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3787 
3788  // We found a reduction value exit-PHI. Update it with the
3789  // incoming bypass edge.
3790  if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3791  LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3792  } // end of the LCSSA phi scan.
3793 
3794  // Fix the scalar loop reduction variable with the incoming reduction sum
3795  // from the vector body and from the backedge value.
3796  int IncomingEdgeBlockIdx =
3798  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3799  // Pick the other block.
3800  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3801  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3802  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3803 }
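
// Illustrative numbers for the fix-up above (hypothetical integer add
// reduction starting at 7, VF = 4, UF = 2): the identity is 0, so part 0 of
// the vector phi is seeded with VectorStart = <7, 0, 0, 0> and part 1 with
// the plain identity <0, 0, 0, 0>. After the vector loop the two parts are
// combined element-wise ("bin.rdx") and the result is horizontally reduced by
// createTargetReduction before being merged with the bypass start value.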
3804 
3806  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3807  if (LCSSAPhi.getNumIncomingValues() == 1) {
3808  auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3809  // Non-instruction incoming values will have only one value.
3810  unsigned LastLane = 0;
3811  if (isa<Instruction>(IncomingValue))
3812  LastLane = Cost->isUniformAfterVectorization(
3813  cast<Instruction>(IncomingValue), VF)
3814  ? 0
3815  : VF - 1;
3816  // Can be a loop invariant incoming value or the last scalar value to be
3817  // extracted from the vectorized loop.
3819  Value *lastIncomingValue =
3820  getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3821  LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3822  }
3823  }
3824 }
3825 
3827  // The basic block and loop containing the predicated instruction.
3828  auto *PredBB = PredInst->getParent();
3829  auto *VectorLoop = LI->getLoopFor(PredBB);
3830 
3831  // Initialize a worklist with the operands of the predicated instruction.
3832  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3833 
3834  // Holds instructions that we need to analyze again. An instruction may be
3835  // reanalyzed if we don't yet know if we can sink it or not.
3836  SmallVector<Instruction *, 8> InstsToReanalyze;
3837 
3838  // Returns true if a given use occurs in the predicated block. Phi nodes use
3839  // their operands in their corresponding predecessor blocks.
3840  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3841  auto *I = cast<Instruction>(U.getUser());
3842  BasicBlock *BB = I->getParent();
3843  if (auto *Phi = dyn_cast<PHINode>(I))
3844  BB = Phi->getIncomingBlock(
3845  PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3846  return BB == PredBB;
3847  };
3848 
3849  // Iteratively sink the scalarized operands of the predicated instruction
3850  // into the block we created for it. When an instruction is sunk, its
3851  // operands are then added to the worklist. The algorithm ends when a pass
3852  // through the worklist doesn't sink a single instruction.
3853  bool Changed;
3854  do {
3855  // Add the instructions that need to be reanalyzed to the worklist, and
3856  // reset the changed indicator.
3857  Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3858  InstsToReanalyze.clear();
3859  Changed = false;
3860 
3861  while (!Worklist.empty()) {
3862  auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3863 
3864  // We can't sink an instruction if it is a phi node, is already in the
3865  // predicated block, is not in the loop, or may have side effects.
3866  if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3867  !VectorLoop->contains(I) || I->mayHaveSideEffects())
3868  continue;
3869 
3870  // It's legal to sink the instruction if all its uses occur in the
3871  // predicated block. Otherwise, there's nothing to do yet, and we may
3872  // need to reanalyze the instruction.
3873  if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3874  InstsToReanalyze.push_back(I);
3875  continue;
3876  }
3877 
3878  // Move the instruction to the beginning of the predicated block, and add
3879  // its operands to the worklist.
3880  I->moveBefore(&*PredBB->getFirstInsertionPt());
3881  Worklist.insert(I->op_begin(), I->op_end());
3882 
3883  // The sinking may have enabled other instructions to be sunk, so we will
3884  // need to iterate.
3885  Changed = true;
3886  }
3887  } while (Changed);
3888 }
3889 
3891  for (PHINode *OrigPhi : OrigPHIsToFix) {
3892  PHINode *NewPhi =
3893  cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3894  unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3895 
3896  SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3897  predecessors(OrigPhi->getParent()));
3898  SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3899  predecessors(NewPhi->getParent()));
3900  assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3901  "Scalar and Vector BB should have the same number of predecessors");
3902 
3903  // The insertion point in Builder may be invalidated by the time we get
3904  // here. Force the Builder insertion point to something valid so that we do
3905  // not run into issues during insertion point restore in
3906  // getOrCreateVectorValue calls below.
3907  Builder.SetInsertPoint(NewPhi);
3908 
3909  // The predecessor order is preserved and we can rely on mapping between
3910  // scalar and vector block predecessors.
3911  for (unsigned i = 0; i < NumIncomingValues; ++i) {
3912  BasicBlock *NewPredBB = VectorBBPredecessors[i];
3913 
3914  // When looking up the new scalar/vector values to fix up, use incoming
3915  // values from original phi.
3916  Value *ScIncV =
3917  OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3918 
3919  // Scalar incoming value may need a broadcast
3920  Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3921  NewPhi->addIncoming(NewIncV, NewPredBB);
3922  }
3923  }
3924 }
3925 
3927  unsigned VF) {
3928  PHINode *P = cast<PHINode>(PN);
3929  if (EnableVPlanNativePath) {
3930  // Currently we enter here in the VPlan-native path for non-induction
3931  // PHIs where all control flow is uniform. We simply widen these PHIs.
3932  // Create a vector phi with no operands - the vector phi operands will be
3933  // set at the end of vector code generation.
3934  Type *VecTy =
3935  (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3936  Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3937  VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3938  OrigPHIsToFix.push_back(P);
3939 
3940  return;
3941  }
3942 
3943  assert(PN->getParent() == OrigLoop->getHeader() &&
3944  "Non-header phis should have been handled elsewhere");
3945 
3946  // In order to support recurrences we need to be able to vectorize Phi nodes.
3947  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3948  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3949  // this value when we vectorize all of the instructions that use the PHI.
3951  for (unsigned Part = 0; Part < UF; ++Part) {
3952  // This is phase one of vectorizing PHIs.
3953  Type *VecTy =
3954  (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3955  Value *EntryPart = PHINode::Create(
3956  VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3957  VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3958  }
3959  return;
3960  }
3961 
3963 
3964  // This PHINode must be an induction variable.
3965  // Make sure that we know about it.
3966  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
3967 
3969  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3970 
3971  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
3972  // which can be found from the original scalar operations.
3973  switch (II.getKind()) {
3975  llvm_unreachable("Unknown induction");
3978  llvm_unreachable("Integer/fp induction is handled elsewhere.");
3980  // Handle the pointer induction variable case.
3981  assert(P->getType()->isPointerTy() && "Unexpected type.");
3982  // This is the normalized GEP that starts counting at zero.
3983  Value *PtrInd = Induction;
3984  PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
3985  // Determine the number of scalars we need to generate for each unroll
3986  // iteration. If the instruction is uniform, we only need to generate the
3987  // first lane. Otherwise, we generate all VF values.
3988  unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
3989  // These are the scalar results. Notice that we don't generate vector GEPs
3990  // because scalar GEPs result in better code.
3991  for (unsigned Part = 0; Part < UF; ++Part) {
3992  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
3993  Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
3994  Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
3995  Value *SclrGep =
3996  emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
3997  SclrGep->setName("next.gep");
3998  VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
3999  }
4000  }
4001  return;
4002  }
4003  }
4004 }
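
// Index arithmetic used above for a non-uniform pointer induction, with
// made-up factors VF = 4 and UF = 2: the scalar GEP for (Part, Lane) uses
// PtrInd + Lane + Part * VF, i.e. offsets 0..3 for part 0 and 4..7 for
// part 1, each passed through emitTransformedIndex to apply the induction's
// start value and step.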
4005 
4006 /// A helper function for checking whether an integer division-related
4007 /// instruction may divide by zero (in which case it must be predicated if
4008 /// executed conditionally in the scalar code).
4009 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4010 /// Non-zero divisors that are not compile-time constants will not be
4011 /// converted into multiplication, so we will still end up scalarizing
4012 /// the division, but can do so w/o predication.
4014  assert((I.getOpcode() == Instruction::UDiv ||
4015  I.getOpcode() == Instruction::SDiv ||
4016  I.getOpcode() == Instruction::URem ||
4017  I.getOpcode() == Instruction::SRem) &&
4018  "Unexpected instruction");
4019  Value *Divisor = I.getOperand(1);
4020  auto *CInt = dyn_cast<ConstantInt>(Divisor);
4021  return !CInt || CInt->isZero();
4022 }
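
// Two illustrative (hypothetical) scalar sources for the predicate above:
//   if (c[i]) x = a[i] / b[i]; // b[i] may be zero  -> predicate when widened
//   if (c[i]) x = a[i] / 8;    // constant non-zero -> scalarize w/o predication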
4023 
4025  switch (I.getOpcode()) {
4026  case Instruction::Br:
4027  case Instruction::PHI:
4028  llvm_unreachable("This instruction is handled by a different recipe.");
4029  case Instruction::GetElementPtr: {
4030  // Construct a vector GEP by widening the operands of the scalar GEP as
4031  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4032  // results in a vector of pointers when at least one operand of the GEP
4033  // is vector-typed. Thus, to keep the representation compact, we only use
4034  // vector-typed operands for loop-varying values.
4035  auto *GEP = cast<GetElementPtrInst>(&I);
4036 
4037  if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
4038  // If we are vectorizing, but the GEP has only loop-invariant operands,
4039  // the GEP we build (by only using vector-typed operands for
4040  // loop-varying values) would be a scalar pointer. Thus, to ensure we
4041  // produce a vector of pointers, we need to either arbitrarily pick an
4042  // operand to broadcast, or broadcast a clone of the original GEP.
4043  // Here, we broadcast a clone of the original.
4044  //
4045  // TODO: If at some point we decide to scalarize instructions having
4046  // loop-invariant operands, this special case will no longer be
4047  // required. We would add the scalarization decision to
4048  // collectLoopScalars() and teach getVectorValue() to broadcast
4049  // the lane-zero scalar value.
4050  auto *Clone = Builder.Insert(GEP->clone());
4051  for (unsigned Part = 0; Part < UF; ++Part) {
4052  Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4053  VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
4054  addMetadata(EntryPart, GEP);
4055  }
4056  } else {
4057  // If the GEP has at least one loop-varying operand, we are sure to
4058  // produce a vector of pointers. But if we are only unrolling, we want
4059  // to produce a scalar GEP for each unroll part. Thus, the GEP we
4060  // produce with the code below will be scalar (if VF == 1) or vector
4061  // (otherwise). Note that for the unroll-only case, we still maintain
4062  // values in the vector mapping with initVector, as we do for other
4063  // instructions.
4064  for (unsigned Part = 0; Part < UF; ++Part) {
4065  // The pointer operand of the new GEP. If it's loop-invariant, we
4066  // won't broadcast it.
4067  auto *Ptr =
4068  OrigLoop->isLoopInvariant(GEP->getPointerOperand())
4069  ? GEP->getPointerOperand()
4070  : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4071 
4072  // Collect all the indices for the new GEP. If any index is
4073  // loop-invariant, we won't broadcast it.
4074  SmallVector<Value *, 4> Indices;
4075  for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
4076  if (OrigLoop->isLoopInvariant(U.get()))
4077  Indices.push_back(U.get());
4078  else
4079  Indices.push_back(getOrCreateVectorValue(U.get(), Part));
4080  }
4081 
4082  // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4083  // but it should be a vector, otherwise.
4084  auto *NewGEP =
4085  GEP->isInBounds()
4086  ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4087  Indices)
4088  : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4089  assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4090  "NewGEP is not a pointer vector");
4091  VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
4092  addMetadata(NewGEP, GEP);
4093  }
4094  }
4095 
4096  break;
4097  }
4098  case Instruction::UDiv:
4099  case Instruction::SDiv:
4100  case Instruction::SRem:
4101  case Instruction::URem:
4102  case Instruction::Add:
4103  case Instruction::FAdd:
4104  case Instruction::Sub:
4105  case Instruction::FSub:
4106  case Instruction::FNeg:
4107  case Instruction::Mul:
4108  case Instruction::FMul:
4109  case Instruction::FDiv:
4110  case Instruction::FRem:
4111  case Instruction::Shl:
4112  case Instruction::LShr:
4113  case Instruction::AShr:
4114  case Instruction::And:
4115  case Instruction::Or:
4116  case Instruction::Xor: {
4117  // Just widen unops and binops.
4119 
4120  for (unsigned Part = 0; Part < UF; ++Part) {
4122  for (Value *Op : I.operands())
4123  Ops.push_back(getOrCreateVectorValue(Op, Part));
4124 
4125  Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4126 
4127  if (auto *VecOp = dyn_cast<Instruction>(V))
4128  VecOp->copyIRFlags(&I);
4129 
4130  // Use this vector value for all users of the original instruction.
4131  VectorLoopValueMap.setVectorValue(&I, Part, V);
4132  addMetadata(V, &I);
4133  }
4134 
4135  break;
4136  }
4137  case Instruction::Select: {
4138  // Widen selects.
4139  // If the selector is loop invariant we can create a select
4140  // instruction with a scalar condition. Otherwise, use vector-select.
4141  auto *SE = PSE.getSE();
4142  bool InvariantCond =
4145 
4146  // The condition can be loop invariant but still defined inside the
4147  // loop. This means that we can't just use the original 'cond' value.
4148  // We have to take the 'vectorized' value and pick the first lane.
4149  // Instcombine will make this a no-op.
4150 
4151  auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4152 
4153  for (unsigned Part = 0; Part < UF; ++Part) {
4154  Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4155  Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4156  Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4157  Value *Sel =
4158  Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4159  VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4160  addMetadata(Sel, &I);
4161  }
4162 
4163  break;
4164  }
4165 
4166  case Instruction::ICmp:
4167  case Instruction::FCmp: {
4168  // Widen compares. Generate vector compares.
4169  bool FCmp = (I.getOpcode() == Instruction::FCmp);
4170  auto *Cmp = dyn_cast<CmpInst>(&I);
4172  for (unsigned Part = 0; Part < UF; ++Part) {
4173  Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4174  Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4175  Value *C = nullptr;
4176  if (FCmp) {
4177  // Propagate fast math flags.
4179  Builder.setFastMathFlags(Cmp->getFastMathFlags());
4180  C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4181  } else {
4182  C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4183  }
4184  VectorLoopValueMap.setVectorValue(&I, Part, C);
4185  addMetadata(C, &I);
4186  }
4187 
4188  break;
4189  }
4190 
4191  case Instruction::ZExt:
4192  case Instruction::SExt:
4193  case Instruction::FPToUI:
4194  case Instruction::FPToSI:
4195  case Instruction::FPExt:
4196  case Instruction::PtrToInt:
4197  case Instruction::IntToPtr:
4198  case Instruction::SIToFP:
4199  case Instruction::UIToFP:
4200  case Instruction::Trunc:
4201  case Instruction::FPTrunc:
4202  case Instruction::BitCast: {
4203  auto *CI = dyn_cast<CastInst>(&I);
4205 
4206  /// Vectorize casts.
4207  Type *DestTy =
4208  (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4209 
4210  for (unsigned Part = 0; Part < UF; ++Part) {
4211  Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4212  Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4213  VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4214  addMetadata(Cast, &I);
4215  }
4216  break;
4217  }
4218 
4219  case Instruction::Call: {
4220  // Ignore dbg intrinsics.
4221  if (isa<DbgInfoIntrinsic>(I))
4222  break;
4224 
4225  Module *M = I.getParent()->getParent()->getParent();
4226  auto *CI = cast<CallInst>(&I);
4227 
4228  StringRef FnName = CI->getCalledFunction()->getName();
4229  Function *F = CI->getCalledFunction();
4230  Type *RetTy = ToVectorTy(CI->getType(), VF);
4232  for (Value *ArgOperand : CI->arg_operands())
4233  Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4234 
4236 
4237  // This flag shows whether we use an intrinsic or an ordinary library call
4238  // for the vectorized version of the instruction, i.e. whether the
4239  // intrinsic call is more beneficial than the library call.
4240  bool NeedToScalarize;
4241  unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4242  bool UseVectorIntrinsic =
4243  ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4244  assert((UseVectorIntrinsic || !NeedToScalarize) &&
4245  "Instruction should be scalarized elsewhere.");
4246 
4247  for (unsigned Part = 0; Part < UF; ++Part) {
4249  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4250  Value *Arg = CI->getArgOperand(i);
4251  // Some intrinsics have a scalar argument - don't replace it with a
4252  // vector.
4253  if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4254  Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4255  Args.push_back(Arg);
4256  }
4257 
4258  Function *VectorF;
4259  if (UseVectorIntrinsic) {
4260  // Use vector version of the intrinsic.
4261  Type *TysForDecl[] = {CI->getType()};
4262  if (VF > 1)
4263  TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4264  VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4265  } else {
4266  // Use vector version of the library call.
4267  StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4268  assert(!VFnName.empty() && "Vector function name is empty.");
4269  VectorF = M->getFunction(VFnName);
4270  if (!VectorF) {
4271  // Generate a declaration
4272  FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4273  VectorF =
4274  Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4275  VectorF->copyAttributesFrom(F);
4276  }
4277  }
4278  assert(VectorF && "Can't create vector function.");
4279 
4281  CI->getOperandBundlesAsDefs(OpBundles);
4282  CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4283 
4284  if (isa<FPMathOperator>(V))
4285  V->copyFastMathFlags(CI);
4286 
4287  VectorLoopValueMap.setVectorValue(&I, Part, V);
4288  addMetadata(V, &I);
4289  }
4290 
4291  break;
4292  }
4293 
4294  default:
4295  // This instruction is not vectorized by simple widening.
4296  LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4297  llvm_unreachable("Unhandled instruction!");
4298  } // end of switch.
4299 }
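
// Shorthand view (hypothetical IR, VF = 4, UF = 1) of the simple widening the
// switch above applies to, e.g., a binary operator:
//
//   scalar:  %r = fmul fast float %a, %b
//   vector:  %r.v = fmul fast <4 x float> %a.v, %b.v
//
// Operands are looked up (or broadcast) via getOrCreateVectorValue, and the
// scalar instruction's IR flags and metadata are copied onto the widened one.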
4300 
4302  // Forget the original basic block.
4304 
4305  // DT is not kept up-to-date for outer loop vectorization
4307  return;
4308 
4309  // Update the dominator tree information.
4311  "Entry does not dominate exit.");
4312 
4319 }
4320 
4321 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4322  // We should not collect Scalars more than once per VF. Right now, this
4323  // function is called from collectUniformsAndScalars(), which already does
4324  // this check. Collecting Scalars for VF=1 does not make any sense.
4325  assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4326  "This function should not be visited twice for the same VF");
4327 
4329 
4330  // These sets are used to seed the analysis with pointers used by memory
4331  // accesses that will remain scalar.
4333  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4334 
4335  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4336  // The pointer operands of loads and stores will be scalar as long as the
4337  // memory access is not a gather or scatter operation. The value operand of a
4338  // store will remain scalar if the store is scalarized.
4339  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4340  InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4341  assert(WideningDecision != CM_Unknown &&
4342  "Widening decision should be ready at this moment");
4343  if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4344  if (Ptr == Store->getValueOperand())
4345  return WideningDecision == CM_Scalarize;
4346  assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4347  "Ptr is neither a value or pointer operand");
4348  return WideningDecision != CM_GatherScatter;
4349  };
4350 
4351  // A helper that returns true if the given value is a bitcast or
4352  // getelementptr instruction contained in the loop.
4353  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4354  return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4355  isa<GetElementPtrInst>(V)) &&
4356  !TheLoop->isLoopInvariant(V);
4357  };
4358 
4359  // A helper that evaluates a memory access's use of a pointer. If the use
4360  // will be a scalar use, and the pointer is only used by memory accesses, we
4361  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4362  // PossibleNonScalarPtrs.
4363  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4364  // We only care about bitcast and getelementptr instructions contained in
4365  // the loop.
4366  if (!isLoopVaryingBitCastOrGEP(Ptr))
4367  return;
4368 
4369  // If the pointer has already been identified as scalar (e.g., if it was
4370  // also identified as uniform), there's nothing to do.
4371  auto *I = cast<Instruction>(Ptr);
4372  if (Worklist.count(I))
4373  return;
4374 
4375  // If the use of the pointer will be a scalar use, and all users of the
4376  // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4377  // place the pointer in PossibleNonScalarPtrs.
4378  if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4379  return isa<LoadInst>(U) || isa<StoreInst>(U);
4380  }))
4381  ScalarPtrs.insert(I);
4382  else
4383  PossibleNonScalarPtrs.insert(I);
4384  };
4385 
4386  // We seed the scalars analysis with three classes of instructions: (1)
4387  // instructions marked uniform-after-vectorization, (2) bitcast and
4388  // getelementptr instructions used by memory accesses requiring a scalar use,
4389  // and (3) pointer induction variables and their update instructions (we
4390  // currently only scalarize these).
4391  //
4392  // (1) Add to the worklist all instructions that have been identified as
4393  // uniform-after-vectorization.
4394  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4395 
4396  // (2) Add to the worklist all bitcast and getelementptr instructions used by
4397  // memory accesses requiring a scalar use. The pointer operands of loads and
4398  // stores will be scalar as long as the memory access is not a gather or
4399  // scatter operation. The value operand of a store will remain scalar if the
4400  // store is scalarized.
4401  for (auto *BB : TheLoop->blocks())
4402  for (auto &I : *BB) {
4403  if (auto *Load = dyn_cast<LoadInst>(&I)) {
4404  evaluatePtrUse(Load, Load->getPointerOperand());
4405  } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4406  evaluatePtrUse(Store, Store->getPointerOperand());
4407  evaluatePtrUse(Store, Store->getValueOperand());
4408  }
4409  }
4410  for (auto *I : ScalarPtrs)
4411  if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4412  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4413  Worklist.insert(I);
4414  }
4415 
4416  // (3) Add to the worklist all pointer induction variables and their update
4417  // instructions.
4418  //
4419  // TODO: Once we are able to vectorize pointer induction variables we should
4420  // no longer insert them into the worklist here.
4421  auto *Latch = TheLoop->getLoopLatch();
4422  for (auto &Induction : *Legal->getInductionVars()) {
4423  auto *Ind = Induction.first;
4424  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4425  if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4426  continue;
4427  Worklist.insert(Ind);
4428  Worklist.insert(IndUpdate);
4429  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4430  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4431  << "\n");
4432  }
4433 
4434  // Insert the forced scalars.
4435  // FIXME: Currently widenPHIInstruction() often creates a dead vector
4436  // induction variable when the PHI user is scalarized.
4437  auto ForcedScalar = ForcedScalars.find(VF);
4438  if (ForcedScalar != ForcedScalars.end())
4439  for (auto *I : ForcedScalar->second)
4440  Worklist.insert(I);
4441 
4442  // Expand the worklist by looking through any bitcasts and getelementptr
4443  // instructions we've already identified as scalar. This is similar to the
4444  // expansion step in collectLoopUniforms(); however, here we're only
4445  // expanding to include additional bitcasts and getelementptr instructions.
4446  unsigned Idx = 0;
4447  while (Idx != Worklist.size()) {
4448  Instruction *Dst = Worklist[Idx++];
4449  if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4450  continue;
4451  auto *Src = cast<Instruction>(Dst->getOperand(0));
4452  if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4453  auto *J = cast<Instruction>(U);
4454  return !TheLoop->contains(J) || Worklist.count(J) ||
4455  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4456  isScalarUse(J, Src));
4457  })) {
4458  Worklist.insert(Src);
4459  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4460  }
4461  }
4462 
4463  // An induction variable will remain scalar if all users of the induction
4464  // variable and induction variable update remain scalar.
4465  for (auto &Induction : *Legal->getInductionVars()) {
4466  auto *Ind = Induction.first;
4467  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4468 
4469  // We already considered pointer induction variables, so there's no reason
4470  // to look at their users again.
4471  //
4472  // TODO: Once we are able to vectorize pointer induction variables we
4473  // should no longer skip over them here.
4474  if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4475  continue;
4476 
4477  // Determine if all users of the induction variable are scalar after
4478  // vectorization.
4479  auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4480  auto *I = cast<Instruction>(U);
4481  return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4482  });
4483  if (!ScalarInd)
4484  continue;
4485 
4486  // Determine if all users of the induction variable update instruction are
4487  // scalar after vectorization.
4488  auto ScalarIndUpdate =
4489  llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4490  auto *I = cast<Instruction>(U);
4491  return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4492  });
4493  if (!ScalarIndUpdate)
4494  continue;
4495 
4496  // The induction variable and its update instruction will remain scalar.
4497  Worklist.insert(Ind);
4498  Worklist.insert(IndUpdate);
4499  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4500  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4501  << "\n");
4502  }
4503 
4504  Scalars[VF].insert(Worklist.begin(), Worklist.end());
4505 }
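
// An illustrative sketch, not part of this file: collectLoopScalars() above
// repeatedly grows a worklist, admitting a value once every in-loop user of
// that value is already known to be scalar. The toy fixed-point loop below
// restates that pattern over a hypothetical use graph (plain ints, no IR).
#include <set>
#include <vector>

struct ToyValue {
  int Id;
  std::vector<int> UserIds; // Ids of values that use this one.
};

static std::set<int> expandScalarSet(const std::vector<ToyValue> &Values,
                                     std::set<int> Scalars) {
  bool Changed = true;
  while (Changed) {
    Changed = false;
    for (const ToyValue &V : Values) {
      if (Scalars.count(V.Id))
        continue;
      bool AllUsersScalar = true;
      for (int U : V.UserIds)
        AllUsersScalar &= (Scalars.count(U) != 0);
      // Admit V only when all of its users are already scalar.
      if (AllUsersScalar) {
        Scalars.insert(V.Id);
        Changed = true;
      }
    }
  }
  return Scalars;
}
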
4506 
4507 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4508  if (!blockNeedsPredication(I->getParent()))
4509  return false;
4510  switch(I->getOpcode()) {
4511  default:
4512  break;
4513  case Instruction::Load:
4514  case Instruction::Store: {
4515  if (!Legal->isMaskRequired(I))
4516  return false;
4517  auto *Ptr = getLoadStorePointerOperand(I);
4518  auto *Ty = getMemInstValueType(I);
4519  // We have already decided how to vectorize this instruction, get that
4520  // result.
4521  if (VF > 1) {
4522  InstWidening WideningDecision = getWideningDecision(I, VF);
4523  assert(WideningDecision != CM_Unknown &&
4524  "Widening decision should be ready at this moment");
4525  return WideningDecision == CM_Scalarize;
4526  }
4527  return isa<LoadInst>(I) ?
4528  !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
4529  : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4530  }
4531  case Instruction::UDiv:
4532  case Instruction::SDiv:
4533  case Instruction::SRem:
4534  case Instruction::URem:
4535  return mayDivideByZero(*I);
4536  }
4537  return false;
4538 }
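
// An illustrative sketch, not part of this file: why UDiv/SDiv/URem/SRem in a
// predicated block are reported as scalar-with-predication. In the guarded
// scalar loop below the division only runs when the guard holds, so b[i] == 0
// in a non-taken iteration is harmless; a naively widened division would
// evaluate every lane and could trap. Function and variable names are
// hypothetical.
static void guardedDiv(int *a, const int *b, const int *c, int n) {
  for (int i = 0; i < n; ++i)
    if (c[i])             // predicated block
      a[i] = a[i] / b[i]; // must not execute for lanes where c[i] is false
}
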
4539 
4540 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4541  unsigned VF) {
4542  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4543  assert(getWideningDecision(I, VF) == CM_Unknown &&
4544  "Decision should not be set yet.");
4545  auto *Group = getInterleavedAccessGroup(I);
4546  assert(Group && "Must have a group.");
4547 
4548  // If the instruction's allocated size doesn't equal its type size, it
4549  // requires padding and will be scalarized.
4550  auto &DL = I->getModule()->getDataLayout();
4551  auto *ScalarTy = getMemInstValueType(I);
4552  if (hasIrregularType(ScalarTy, DL, VF))
4553  return false;
4554 
4555  // Check if masking is required.
4556  // A Group may need masking for one of two reasons: it resides in a block that
4557  // needs predication, or it was decided to use masking to deal with gaps.
4558  bool PredicatedAccessRequiresMasking =
4559  Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4560  bool AccessWithGapsRequiresMasking =
4561  Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4562  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4563  return true;
4564 
4565  // If masked interleaving is required, we expect that the user/target had
4566  // enabled it, because otherwise it either wouldn't have been created or
4567  // it should have been invalidated by the CostModel.
4569  "Masked interleave-groups for predicated accesses are not enabled.");
4570 
4571  auto *Ty = getMemInstValueType(I);
4572  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
4573  : TTI.isLegalMaskedStore(Ty);
4574 }
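
// An illustrative sketch, not part of this file: what an interleaved access
// group with a gap looks like at the source level. The loads of P[i].x and
// P[i].y form a stride-3 group; the never-read member z is a gap, which is
// why widening the group may need either a scalar epilogue or masked loads,
// as checked above. The struct and names are hypothetical.
struct Pixel {
  int x, y, z;
};

static int sumXY(const Pixel *P, int n) {
  int S = 0;
  for (int i = 0; i < n; ++i)
    S += P[i].x + P[i].y; // interleave group of factor 3 with a gap at z
  return S;
}
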
4575 
4576 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4577  unsigned VF) {
4578  // Get and ensure we have a valid memory instruction.
4579  LoadInst *LI = dyn_cast<LoadInst>(I);
4580  StoreInst *SI = dyn_cast<StoreInst>(I);
4581  assert((LI || SI) && "Invalid memory instruction");
4582 
4583  auto *Ptr = getLoadStorePointerOperand(I);
4584 
4585  // To be widened, the pointer must first of all be consecutive.
4586  if (!Legal->isConsecutivePtr(Ptr))
4587  return false;
4588 
4589  // If the instruction is a store located in a predicated block, it will be
4590  // scalarized.
4591  if (isScalarWithPredication(I))
4592  return false;
4593 
4594  // If the instruction's allocated size doesn't equal its type size, it
4595  // requires padding and will be scalarized.
4596  auto &DL = I->getModule()->getDataLayout();
4597  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4598  if (hasIrregularType(ScalarTy, DL, VF))
4599  return false;
4600 
4601  return true;
4602 }
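
// An illustrative sketch, not part of this file: the "consecutive pointer"
// requirement checked above, shown at the source level. Only the first loop's
// address advances by one element per iteration and can become a single wide
// load/store; the others would need interleaving or gathers. Names are
// hypothetical.
static void accessPatterns(int *a, const int *b, const int *idx, int n) {
  for (int i = 0; i < n; ++i)
    a[i] = b[i];      // consecutive: widenable
  for (int i = 0; i < n; ++i)
    a[i] = b[2 * i];  // strided: not consecutive
  for (int i = 0; i < n; ++i)
    a[i] = b[idx[i]]; // indirect: needs a gather
}
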
4603 
4604 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4605  // We should not collect Uniforms more than once per VF. Right now,
4606  // this function is called from collectUniformsAndScalars(), which
4607  // already does this check. Collecting Uniforms for VF=1 does not make any
4608  // sense.
4609 
4610  assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4611  "This function should not be visited twice for the same VF");
4612 
4613  // Initialize the entry for this VF. Even if we do not find any uniform
4614  // values, we will not analyze this VF again; Uniforms.count(VF) will return 1.
4615  Uniforms[VF].clear();
4616 
4617  // We now know that the loop is vectorizable!
4618  // Collect instructions inside the loop that will remain uniform after
4619  // vectorization.
4620 
4621  // Global values, params and instructions outside of the current loop are
4622  // out of scope.
4623  auto isOutOfScope = [&](Value *V) -> bool {
4624  Instruction *I = dyn_cast<Instruction>(V);
4625  return (!I || !TheLoop->contains(I));
4626  };
4627 
4628  SetVector<Instruction *> Worklist;
4629  BasicBlock *Latch = TheLoop->getLoopLatch();
4630 
4631  // Start with the conditional branch. If the branch condition is an
4632  // instruction contained in the loop that is only used by the branch, it is
4633  // uniform.
4634  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4635  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4636  Worklist.insert(Cmp);
4637  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4638  }
4639 
4640  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4641  // are pointers that are treated like consecutive pointers during
4642  // vectorization. The pointer operands of interleaved accesses are an
4643  // example.
4644  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4645 
4646  // Holds pointer operands of instructions that are possibly non-uniform.
4647  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4648 
4649  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4650  InstWidening WideningDecision = getWideningDecision(I, VF);
4651  assert(WideningDecision != CM_Unknown &&
4652  "Widening decision should be ready at this moment");
4653 
4654  return (WideningDecision == CM_Widen ||
4655  WideningDecision == CM_Widen_Reverse ||
4656  WideningDecision == CM_Interleave);
4657  };
4658  // Iterate over the instructions in the loop, and collect all
4659  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4660  // that a consecutive-like pointer operand will be scalarized, we collect it
4661  // in PossibleNonUniformPtrs instead. We use two sets here because a single
4662  // getelementptr instruction can be used by both vectorized and scalarized
4663  // memory instructions. For example, if a loop loads and stores from the same
4664  // location, but the store is conditional, the store will be scalarized, and
4665  // the getelementptr won't remain uniform.
4666  for (auto *BB : TheLoop->blocks())
4667  for (auto &I : *BB) {
4668  // If there's no pointer operand, there's nothing to do.
4669  auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4670  if (!Ptr)
4671  continue;
4672 
4673  // True if all users of Ptr are memory accesses that have Ptr as their
4674  // pointer operand.
4675  auto UsersAreMemAccesses =
4676  llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4677  return getLoadStorePointerOperand(U) == Ptr;
4678  });
4679 
4680  // Ensure the memory instruction will not be scalarized or used by
4681  // gather/scatter, making its pointer operand non-uniform. If the pointer
4682  // operand is used by any instruction other than a memory access, we
4683  // conservatively assume the pointer operand may be non-uniform.
4684  if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4685  PossibleNonUniformPtrs.insert(Ptr);
4686 
4687  // If the memory instruction will be vectorized and its pointer operand
4688  // is consecutive-like, or interleaving - the pointer operand should
4689  // remain uniform.
4690  else
4691  ConsecutiveLikePtrs.insert(Ptr);
4692  }
4693 
4694  // Add to the Worklist all consecutive and consecutive-like pointers that
4695  // aren't also identified as possibly non-uniform.
4696  for (auto *V : ConsecutiveLikePtrs)
4697  if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4698  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4699  Worklist.insert(V);
4700  }
4701 
4702  // Expand Worklist in topological order: whenever a new instruction
4703  // is added, its users should already be inside Worklist. This ensures
4704  // that a uniform instruction will only be used by uniform instructions.
4705  unsigned idx = 0;
4706  while (idx != Worklist.size()) {
4707  Instruction *I = Worklist[idx++];
4708 
4709  for (auto OV : I->operand_values()) {
4710  // isOutOfScope operands cannot be uniform instructions.
4711  if (isOutOfScope(OV))
4712  continue;
4713  // First order recurrence Phi's should typically be considered
4714  // non-uniform.
4715  auto *OP = dyn_cast<PHINode>(OV);
4716  if (OP && Legal->isFirstOrderRecurrence(OP))
4717  continue;
4718  // If all the users of the operand are uniform, then add the
4719  // operand into the uniform worklist.
4720  auto *OI = cast<Instruction>(OV);
4721  if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4722  auto *J = cast<Instruction>(U);
4723  return Worklist.count(J) ||
4724  (OI == getLoadStorePointerOperand(J) &&
4725  isUniformDecision(J, VF));
4726  })) {
4727  Worklist.insert(OI);
4728  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4729  }
4730  }
4731  }
4732 
4733  // Returns true if Ptr is the pointer operand of a memory access instruction
4734  // I, and I is known to not require scalarization.
4735  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4736  return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4737  };
4738 
4739  // For an instruction to be added into Worklist above, all its users inside
4740  // the loop should also be in Worklist. However, this condition cannot be
4741  // true for phi nodes that form a cyclic dependence. We must process phi
4742  // nodes separately. An induction variable will remain uniform if all users
4743  // of the induction variable and induction variable update remain uniform.
4744  // The code below handles both pointer and non-pointer induction variables.
4745  for (auto &Induction : *Legal->getInductionVars()) {
4746  auto *Ind = Induction.first;
4747  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4748 
4749  // Determine if all users of the induction variable are uniform after
4750  // vectorization.
4751  auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4752  auto *I = cast<Instruction>(U);
4753  return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4754  isVectorizedMemAccessUse(I, Ind);
4755  });
4756  if (!UniformInd)
4757  continue;
4758 
4759  // Determine if all users of the induction variable update instruction are
4760  // uniform after vectorization.
4761  auto UniformIndUpdate =
4762  llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4763  auto *I = cast<Instruction>(U);
4764  return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4765  isVectorizedMemAccessUse(I, IndUpdate);
4766  });
4767  if (!UniformIndUpdate)
4768  continue;
4769 
4770  // The induction variable and its update instruction will remain uniform.
4771  Worklist.insert(Ind);
4772  Worklist.insert(IndUpdate);
4773  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4774  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4775  << "\n");
4776  }
4777 
4778  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4779 }
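
// An illustrative sketch, not part of this file: how the widening decision
// for a memory access determines whether its address stays uniform, as
// analyzed above. In the first loop the store to a[i] is consecutive, so one
// scalar copy of the address per vector iteration suffices; in the second the
// store is conditional and will be scalarized (or turned into a scatter), so
// each lane needs its own address. Names are hypothetical.
static void uniformAddress(int *a, const int *b, int n) {
  for (int i = 0; i < n; ++i)
    a[i] = b[i];   // &a[i] can remain uniform per vector iteration
}

static void laneLocalAddress(int *a, const int *b, int n) {
  for (int i = 0; i < n; ++i)
    if (b[i] > 0)
      a[i] = b[i]; // conditional store is scalarized; &a[i] is per-lane
}
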
4780 
4781 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4782  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4783 
4784  if (Legal->getRuntimePointerChecking()->Need) {
4785  reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4786  "runtime pointer checks needed. Enable vectorization of this "
4787  "loop with '#pragma clang loop vectorize(enable)' when "
4788  "compiling with -Os/-Oz",
4789  "CantVersionLoopWithOptForSize", ORE, TheLoop);
4790  return true;
4791  }
4792 
4793  if (!PSE.getUnionPredicate().getPredicates().empty()) {
4794  reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4795  "runtime SCEV checks needed. Enable vectorization of this "
4796  "loop with '#pragma clang loop vectorize(enable)' when "
4797  "compiling with -Os/-Oz",
4798  "CantVersionLoopWithOptForSize", ORE, TheLoop);
4799  return true;
4800  }
4801 
4802  // FIXME: Avoid specializing for stride==1 instead of bailing out.
4803  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4804  reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4805  "runtime stride == 1 checks needed. Enable vectorization of "
4806  "this loop with '#pragma clang loop vectorize(enable)' when "
4807  "compiling with -Os/-Oz",
4808  "CantVersionLoopWithOptForSize", ORE, TheLoop);
4809  return true;
4810  }
4811 
4812  return false;
4813 }
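
// An illustrative sketch, not part of this file: the kind of loop that trips
// the bail-outs above. Without knowing that a and b never overlap, vectorizing
// this loop requires an overlap test plus a scalar fallback loop, and that
// extra code is exactly what -Os/-Oz is trying to avoid. Names are
// hypothetical.
static void maybeAliasing(int *a, const int *b, int n) {
  for (int i = 0; i < n; ++i)
    a[i] = b[i] + 1; // vectorizable only under a runtime no-overlap check
}
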
4814 
4815 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4816  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4817  // TODO: It may be useful to do this, since the check is still likely to be
4818  // dynamically uniform if the target can skip it.
4819  reportVectorizationFailure(
4820  "Not inserting runtime ptr check for divergent target",
4821  "runtime pointer checks needed. Not enabled for divergent target",
4822  "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4823  return None;
4824  }
4825 
4826  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4827  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4828  if (TC == 1) {
4829  reportVectorizationFailure("Single iteration (non) loop",
4830  "loop trip count is one, irrelevant for vectorization",
4831  "SingleIterationLoop", ORE, TheLoop);
4832  return None;
4833  }
4834 
4835  switch (ScalarEpilogueStatus) {
4836  case CM_ScalarEpilogueAllowed:
4837  return computeFeasibleMaxVF(TC);
4838  case CM_ScalarEpilogueNotNeededUsePredicate:
4839  LLVM_DEBUG(
4840  dbgs() << "LV: vector predicate hint/switch found.\n"
4841  << "LV: Not allowing scalar epilogue, creating predicated "
4842  << "vector loop.\n");
4843  break;
4844  case CM_ScalarEpilogueNotAllowedLowTripLoop:
4845  // fallthrough as a special case of OptForSize
4846  case CM_ScalarEpilogueNotAllowedOptSize:
4847  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4848  LLVM_DEBUG(
4849  dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4850  else
4851  LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4852  << "count.\n");
4853 
4854  // Bail if runtime checks are required, which are not good when optimising
4855  // for size.
4856  if (runtimeChecksRequired())
4857  return None;
4858  break;
4859  }
4860 
4861  // Now try to fold the loop tail by masking.
4862 
4863  // Invalidate interleave groups that require an epilogue if we can't mask
4864  // the interleave-group.
4865  if (!useMaskedInterleavedAccesses(TTI))
4866  InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4867 
4868  unsigned MaxVF = computeFeasibleMaxVF(TC);
4869  if (TC > 0 && TC % MaxVF == 0) {
4870  // Accept MaxVF if we do not have a tail.
4871  LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4872  return MaxVF;
4873  }
4874 
4875  // If we don't know the precise trip count, or if the trip count that we
4876  // found modulo the vectorization factor is not zero, try to fold the tail
4877  // by masking.
4878  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4879  if (Legal->canFoldTailByMasking()) {
4880  FoldTailByMasking = true;
4881  return MaxVF;
4882  }
4883 
4884  if (TC == 0) {
4886  "Unable to calculate the loop count due to complex control flow",
4887  "unable to calculate the loop count due to complex control flow",
4888  "UnknownLoopCountComplexCFG", ORE, TheLoop);
4889  return None;
4890  }
4891 
4893  "Cannot optimize for size and vectorize at the same time.",
4894  "cannot optimize for size and vectorize at the same time. "
4895  "Enable vectorization of this loop with '#pragma clang loop "
4896  "vectorize(enable)' when compiling with -Os/-Oz",
4897  "NoTailLoopWithOptForSize", ORE, TheLoop);
4898  return None;
4899 }
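
// An illustrative sketch, not part of this file: the trip-count reasoning in
// computeMaxVF() reduced to arithmetic. With a known trip count TC and a
// candidate MaxVF, either no tail remains (TC % MaxVF == 0), or the tail must
// be folded by masking, or vectorization is rejected when optimizing for
// size. The helper and values are hypothetical.
#include <cassert>

static bool tailIsEmpty(unsigned TC, unsigned MaxVF) {
  return TC > 0 && TC % MaxVF == 0;
}

static void tripCountExamples() {
  assert(tailIsEmpty(64, 8));  // 64 iterations at VF=8: no tail remains
  assert(!tailIsEmpty(66, 8)); // 66 % 8 == 2: fold the tail or bail out
}
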
4900 
4901 unsigned
4902 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
4903  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4904  unsigned SmallestType, WidestType;
4905  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4906  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4907 
4908  // Get the maximum safe dependence distance in bits computed by LAA.
4909  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4910  // the memory accesses that is most restrictive (involved in the smallest
4911  // dependence distance).
4912  unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4913 
4914  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4915 
4916  unsigned MaxVectorSize = WidestRegister / WidestType;
4917 
4918  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4919  << " / " << WidestType << " bits.\n");
4920  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4921  << WidestRegister << " bits.\n");
4922 
4923  assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4924  " into one vector!");
4925  if (MaxVectorSize == 0) {
4926  LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4927  MaxVectorSize = 1;
4928  return MaxVectorSize;
4929  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4930  isPowerOf2_32(ConstTripCount)) {
4931  // We need to clamp the VF to be the ConstTripCount. There is no point in
4932  // choosing a higher viable VF as done in the loop below.
4933  LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4934  << ConstTripCount << "\n");
4935  MaxVectorSize = ConstTripCount;
4936  return MaxVectorSize;
4937  }
4938 
4939  unsigned MaxVF = MaxVectorSize;
4940  if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
4941  (MaximizeBandwidth && isScalarEpilogueAllowed())) {
4942  // Collect all viable vectorization factors larger than the default MaxVF
4943  // (i.e. MaxVectorSize).
4945  unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4946