LoopVectorize.cpp
1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
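// For illustration only (this sketch is not part of the original header), and
// assuming a vectorization factor (VF) of 4 with a scalar epilogue, the
// transformation is conceptually:
//
//   // Scalar source loop.
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
//   // After vectorization: the index advances by VF, and each wide iteration
//   // performs the work of VF scalar iterations (the inner loop stands in
//   // for a single SIMD operation).
//   int i = 0;
//   for (; i + 4 <= n; i += 4)
//     for (int lane = 0; lane < 4; ++lane)
//       a[i + lane] = b[i + lane] + c[i + lane];
//   for (; i < n; ++i) // Scalar epilogue handles the remaining iterations.
//     a[i] = b[i] + c[i];
//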
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 // of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 // widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 // of vectorization. It decides on the optimal vector width, which
26 // can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46 // Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52 // Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanHCFGTransforms.h"
62 #include "VPlanPredicator.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
82 #include "llvm/Analysis/CFG.h"
88 #include "llvm/Analysis/LoopInfo.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/MathExtras.h"
141 #include <algorithm>
142 #include <cassert>
143 #include <cstdint>
144 #include <cstdlib>
145 #include <functional>
146 #include <iterator>
147 #include <limits>
148 #include <memory>
149 #include <string>
150 #include <tuple>
151 #include <utility>
152 #include <vector>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 /// @{
160 /// Metadata attribute names
161 static const char *const LLVMLoopVectorizeFollowupAll =
162  "llvm.loop.vectorize.followup_all";
163 static const char *const LLVMLoopVectorizeFollowupVectorized =
164  "llvm.loop.vectorize.followup_vectorized";
165 static const char *const LLVMLoopVectorizeFollowupEpilogue =
166  "llvm.loop.vectorize.followup_epilogue";
167 /// @}
168 
169 STATISTIC(LoopsVectorized, "Number of loops vectorized");
170 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171 
172 /// Loops with a known constant trip count below this number are vectorized only
173 /// if no scalar iteration overheads are incurred.
175  "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
176  cl::desc("Loops with a constant trip count that is smaller than this "
177  "value are vectorized only if no scalar iteration overheads "
178  "are incurred."));
179 
181  "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
182  cl::desc("Maximize bandwidth when selecting vectorization factor which "
183  "will be determined by the smallest type in loop."));
184 
186  "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
187  cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
188 
189 /// An interleave-group may need masking if it resides in a block that needs
190 /// predication, or in order to mask away gaps.
192  "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
193  cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
194 
195 /// We don't interleave loops with a known constant trip count below this
196 /// number.
197 static const unsigned TinyTripCountInterleaveThreshold = 128;
198 
200  "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
201  cl::desc("A flag that overrides the target's number of scalar registers."));
202 
204  "force-target-num-vector-regs", cl::init(0), cl::Hidden,
205  cl::desc("A flag that overrides the target's number of vector registers."));
206 
208  "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
209  cl::desc("A flag that overrides the target's max interleave factor for "
210  "scalar loops."));
211 
213  "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
214  cl::desc("A flag that overrides the target's max interleave factor for "
215  "vectorized loops."));
216 
218  "force-target-instruction-cost", cl::init(0), cl::Hidden,
219  cl::desc("A flag that overrides the target's expected cost for "
220  "an instruction to a single constant value. Mostly "
221  "useful for getting consistent testing."));
222 
224  "small-loop-cost", cl::init(20), cl::Hidden,
225  cl::desc(
226  "The cost of a loop that is considered 'small' by the interleaver."));
227 
229  "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
230  cl::desc("Enable the use of the block frequency analysis to access PGO "
231  "heuristics minimizing code growth in cold regions and being more "
232  "aggressive in hot regions."));
233 
234 // Runtime interleave loops for load/store throughput.
236  "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
237  cl::desc(
238  "Enable runtime interleaving until load/store ports are saturated"));
239 
240 /// The number of stores in a loop that are allowed to need predication.
242  "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
243  cl::desc("Max number of stores to be predicated behind an if."));
244 
246  "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
247  cl::desc("Count the induction variable only once when interleaving"));
248 
250  "enable-cond-stores-vec", cl::init(true), cl::Hidden,
251  cl::desc("Enable if predication of stores during vectorization."));
252 
254  "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
255  cl::desc("The maximum interleave count to use when interleaving a scalar "
256  "reduction in a nested loop."));
257 
259  "enable-vplan-native-path", cl::init(false), cl::Hidden,
260  cl::desc("Enable VPlan-native vectorization path with "
261  "support for outer loop vectorization."));
262 
263 // FIXME: Remove this switch once we have divergence analysis. Currently we
264 // assume divergent non-backedge branches when this switch is true.
266  "enable-vplan-predication", cl::init(false), cl::Hidden,
267  cl::desc("Enable VPlan-native vectorization path predicator with "
268  "support for outer loop vectorization."));
269 
270 // This flag enables the stress testing of the VPlan H-CFG construction in the
271 // VPlan-native vectorization path. It must be used in conjunction with
272 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
273 // verification of the H-CFGs built.
275  "vplan-build-stress-test", cl::init(false), cl::Hidden,
276  cl::desc(
277  "Build VPlan for every supported loop nest in the function and bail "
278  "out right after the build (stress test the VPlan H-CFG construction "
279  "in the VPlan-native vectorization path)."));
280 
282  "interleave-loops", cl::init(true), cl::Hidden,
283  cl::desc("Enable loop interleaving in Loop vectorization passes"));
285  "vectorize-loops", cl::init(true), cl::Hidden,
286  cl::desc("Run the Loop vectorization passes"));
287 
288 /// A helper function for converting Scalar types to vector types.
289 /// If the incoming type is void, we return void. If the VF is 1, we return
290 /// the scalar type.
291 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
292  if (Scalar->isVoidTy() || VF == 1)
293  return Scalar;
294  return VectorType::get(Scalar, VF);
295 }
296 
297 /// A helper function that returns the type of loaded or stored value.
299  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
300  "Expected Load or Store instruction");
301  if (auto *LI = dyn_cast<LoadInst>(I))
302  return LI->getType();
303  return cast<StoreInst>(I)->getValueOperand()->getType();
304 }
305 
306 /// A helper function that returns true if the given type is irregular. The
307 /// type is irregular if its allocated size doesn't equal the store size of an
308 /// element of the corresponding vector type at the given vectorization factor.
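/// For example (illustrative, not from the original source): i1 has an alloc
/// size of 1 byte, while the store size of <4 x i1> is also 1 byte, so
/// 4 * 1 != 1 and i1 is considered irregular at VF = 4.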
309 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
310  // Determine if an array of VF elements of type Ty is "bitcast compatible"
311  // with a <VF x Ty> vector.
312  if (VF > 1) {
313  auto *VectorTy = VectorType::get(Ty, VF);
314  return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
315  }
316 
317  // If the vectorization factor is one, we just check if an array of type Ty
318  // requires padding between elements.
319  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
320 }
321 
322 /// A helper function that returns the reciprocal of the block probability of
323 /// predicated blocks. If we return X, we are assuming the predicated block
324 /// will execute once for every X iterations of the loop header.
325 ///
326 /// TODO: We should use actual block probability here, if available. Currently,
327 /// we always assume predicated blocks have a 50% chance of executing.
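///
/// For example (illustrative): with the current return value of 2, the cost
/// model divides the scalarization cost of an instruction residing in a
/// predicated block by 2, reflecting the assumption that the block executes
/// on roughly half of the loop's iterations.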
328 static unsigned getReciprocalPredBlockProb() { return 2; }
329 
330 /// A helper function that adds a 'fast' flag to floating-point operations.
332  if (isa<FPMathOperator>(V))
333  cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
334  return V;
335 }
336 
338  if (isa<FPMathOperator>(V))
339  cast<Instruction>(V)->setFastMathFlags(FMF);
340  return V;
341 }
342 
343 /// A helper function that returns an integer or floating-point constant with
344 /// value C.
345 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
346  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
347  : ConstantFP::get(Ty, C);
348 }
349 
350 namespace llvm {
351 
352 /// InnerLoopVectorizer vectorizes loops which contain only one basic
353 /// block to a specified vectorization factor (VF).
354 /// This class performs the widening of scalars into vectors, or multiple
355 /// scalars. This class also implements the following features:
356 /// * It inserts an epilogue loop for handling loops that don't have iteration
357 /// counts that are known to be a multiple of the vectorization factor.
358 /// * It handles the code generation for reduction variables.
359 /// * Scalarization (implementation using scalars) of un-vectorizable
360 /// instructions.
361 /// InnerLoopVectorizer does not perform any vectorization-legality
362 /// checks, and relies on the caller to check for the different legality
363 /// aspects. The InnerLoopVectorizer relies on the
364 /// LoopVectorizationLegality class to provide information about the induction
365 /// and reduction variables that were found to a given vectorization factor.
367 public:
370  const TargetLibraryInfo *TLI,
372  OptimizationRemarkEmitter *ORE, unsigned VecWidth,
373  unsigned UnrollFactor, LoopVectorizationLegality *LVL,
375  : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
376  AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
377  Builder(PSE.getSE()->getContext()),
378  VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
379  virtual ~InnerLoopVectorizer() = default;
380 
381  /// Create a new empty loop. Unlink the old loop and connect the new one.
382  /// Return the pre-header block of the new loop.
384 
385  /// Widen a single instruction within the innermost loop.
387 
388  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
389  void fixVectorizedLoop();
390 
391  // Return true if any runtime check is added.
393 
394  /// A type for vectorized values in the new loop. Each value from the
395  /// original loop, when vectorized, is represented by UF vector values in the
396  /// new unrolled loop, where UF is the unroll factor.
398 
399  /// Vectorize a single PHINode in a block. This method handles the induction
400  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
401  /// arbitrary length vectors.
402  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
403 
404  /// A helper function to scalarize a single Instruction in the innermost loop.
405  /// Generates a sequence of scalar instances for each lane between \p MinLane
406  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
407 /// inclusive.
408  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
409  bool IfPredicateInstr);
410 
411  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
412  /// is provided, the integer induction variable will first be truncated to
413  /// the corresponding type.
414  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
415 
416  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
417  /// vector or scalar value on-demand if one is not yet available. When
418  /// vectorizing a loop, we visit the definition of an instruction before its
419  /// uses. When visiting the definition, we either vectorize or scalarize the
420  /// instruction, creating an entry for it in the corresponding map. (In some
421  /// cases, such as induction variables, we will create both vector and scalar
422  /// entries.) Then, as we encounter uses of the definition, we derive values
423  /// for each scalar or vector use unless such a value is already available.
424  /// For example, if we scalarize a definition and one of its uses is vector,
425  /// we build the required vector on-demand with an insertelement sequence
426  /// when visiting the use. Otherwise, if the use is scalar, we can use the
427  /// existing scalar definition.
428  ///
429  /// Return a value in the new loop corresponding to \p V from the original
430  /// loop at unroll index \p Part. If the value has already been vectorized,
431  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
432  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
433  /// a new vector value on-demand by inserting the scalar values into a vector
434  /// with an insertelement sequence. If the value has been neither vectorized
435  /// nor scalarized, it must be loop invariant, so we simply broadcast the
436  /// value into a vector.
437  Value *getOrCreateVectorValue(Value *V, unsigned Part);
438 
439  /// Return a value in the new loop corresponding to \p V from the original
440  /// loop at unroll and vector indices \p Instance. If the value has been
441  /// vectorized but not scalarized, the necessary extractelement instruction
442  /// will be generated.
443  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
444 
445  /// Construct the vector value of a scalarized value \p V one lane at a time.
446  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
447 
448  /// Try to vectorize the interleaved access group that \p Instr belongs to,
449  /// optionally masking the vector operations if \p BlockInMask is non-null.
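/// For example (illustrative, not from the original source): a group of two
/// interleaved loads A[2*i] and A[2*i+1] with VF = 4 becomes a single wide
/// load of 8 consecutive elements followed by shuffles that de-interleave
/// the even and odd lanes; an interleaved store group is handled the other
/// way around, with interleaving shuffles followed by one wide store.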
451  VectorParts *BlockInMask = nullptr);
452 
453  /// Vectorize Load and Store instructions, optionally masking the vector
454  /// operations if \p BlockInMask is non-null.
456  VectorParts *BlockInMask = nullptr);
457 
458  /// Set the debug location in the builder using the debug location in
459  /// the instruction.
460  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
461 
462  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
463  void fixNonInductionPHIs(void);
464 
465 protected:
467 
468  /// A small list of PHINodes.
470 
471  /// A type for scalarized values in the new loop. Each value from the
472  /// original loop, when scalarized, is represented by UF x VF scalar values
473  /// in the new unrolled loop, where UF is the unroll factor and VF is the
474  /// vectorization factor.
476 
477  /// Set up the values of the IVs correctly when exiting the vector loop.
478  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
479  Value *CountRoundDown, Value *EndValue,
480  BasicBlock *MiddleBlock);
481 
482  /// Create a new induction variable inside L.
483  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
484  Value *Step, Instruction *DL);
485 
486  /// Handle all cross-iteration phis in the header.
487  void fixCrossIterationPHIs();
488 
489  /// Fix a first-order recurrence. This is the second phase of vectorizing
490  /// this phi node.
491  void fixFirstOrderRecurrence(PHINode *Phi);
492 
493  /// Fix a reduction cross-iteration phi. This is the second phase of
494  /// vectorizing this phi node.
495  void fixReduction(PHINode *Phi);
496 
497  /// The Loop exit block may have single value PHI nodes with some
498 /// incoming value. While vectorizing, we only handled real values
499 /// that were defined inside the loop, and we should have one value for
500  /// each predecessor of its parent basic block. See PR14725.
501  void fixLCSSAPHIs();
502 
503  /// Iteratively sink the scalarized operands of a predicated instruction into
504  /// the block that was created for it.
505  void sinkScalarOperands(Instruction *PredInst);
506 
507  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
508  /// represented as.
510 
511 /// Insert the new loop into the loop hierarchy and pass manager
512  /// and update the analysis passes.
513  void updateAnalysis();
514 
515  /// Create a broadcast instruction. This method generates a broadcast
516  /// instruction (shuffle) for loop invariant values and for the induction
517 /// value. If this is the induction variable, then we extend it to N, N+1, ...;
518 /// this is needed because each iteration in the loop corresponds to a SIMD
519  /// element.
520  virtual Value *getBroadcastInstrs(Value *V);
521 
522  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
523 /// to each vector element of Val. The sequence starts at StartIdx.
524  /// \p Opcode is relevant for FP induction variable.
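/// For example (illustrative): with VF = 4, StartIdx = 0 and Step = 1, a
/// broadcast of an integer induction value IV becomes the vector
/// <IV + 0, IV + 1, IV + 2, IV + 3>.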
525  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
526  Instruction::BinaryOps Opcode =
527  Instruction::BinaryOpsEnd);
528 
529  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
530  /// variable on which to base the steps, \p Step is the size of the step, and
531  /// \p EntryVal is the value from the original loop that maps to the steps.
532  /// Note that \p EntryVal doesn't have to be an induction variable - it
533  /// can also be a truncate instruction.
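/// For example (illustrative): with VF = 2 and UF = 2, the steps derived from
/// ScalarIV with step S are ScalarIV + 0*S, ScalarIV + 1*S, ScalarIV + 2*S and
/// ScalarIV + 3*S, i.e. one value for each (part, lane) pair that requires a
/// scalar copy.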
534  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
535  const InductionDescriptor &ID);
536 
537  /// Create a vector induction phi node based on an existing scalar one. \p
538  /// EntryVal is the value from the original loop that maps to the vector phi
539  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
540  /// truncate instruction, instead of widening the original IV, we widen a
541  /// version of the IV truncated to \p EntryVal's type.
543  Value *Step, Instruction *EntryVal);
544 
545  /// Returns true if an instruction \p I should be scalarized instead of
546  /// vectorized for the chosen vectorization factor.
548 
549  /// Returns true if we should generate a scalar version of \p IV.
550  bool needsScalarInduction(Instruction *IV) const;
551 
552  /// If there is a cast involved in the induction variable \p ID, which should
553  /// be ignored in the vectorized loop body, this function records the
554  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
555  /// cast. We had already proved that the casted Phi is equal to the uncasted
556  /// Phi in the vectorized loop (under a runtime guard), and therefore
557  /// there is no need to vectorize the cast - the same value can be used in the
558  /// vector loop for both the Phi and the cast.
559 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
560  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
561  ///
562  /// \p EntryVal is the value from the original loop that maps to the vector
563  /// phi node and is used to distinguish what is the IV currently being
564  /// processed - original one (if \p EntryVal is a phi corresponding to the
565  /// original IV) or the "newly-created" one based on the proof mentioned above
566  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
567  /// latter case \p EntryVal is a TruncInst and we must not record anything for
568  /// that IV, but it's error-prone to expect callers of this routine to care
569  /// about that, hence this explicit parameter.
571  const Instruction *EntryVal,
572  Value *VectorLoopValue,
573  unsigned Part,
574  unsigned Lane = UINT_MAX);
575 
576  /// Generate a shuffle sequence that will reverse the vector Vec.
577  virtual Value *reverseVector(Value *Vec);
578 
579  /// Returns (and creates if needed) the original loop trip count.
580  Value *getOrCreateTripCount(Loop *NewLoop);
581 
582  /// Returns (and creates if needed) the trip count of the widened loop.
584 
585  /// Returns a bitcasted value to the requested vector type.
586  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
588  const DataLayout &DL);
589 
590  /// Emit a bypass check to see if the vector trip count is zero, including if
591  /// it overflows.
593 
594  /// Emit a bypass check to see if all of the SCEV assumptions we've
595  /// had to make are correct.
596  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
597 
598  /// Emit bypass checks to check any memory assumptions we may have made.
599  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
600 
601  /// Compute the transformed value of Index at offset StartValue using step
602  /// StepValue.
603  /// For integer induction, returns StartValue + Index * StepValue.
604  /// For pointer induction, returns StartValue[Index * StepValue].
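/// For example (illustrative): for an integer induction with StartValue 10 and
/// StepValue 3, Index 4 is transformed to 10 + 4 * 3 = 22; for a pointer
/// induction it is transformed to the address of StartValue[4 * 3].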
605  /// FIXME: The newly created binary instructions should contain nsw/nuw
606  /// flags, which can be found from the original scalar operations.
608  const DataLayout &DL,
609  const InductionDescriptor &ID) const;
610 
611  /// Add additional metadata to \p To that was not present on \p Orig.
612  ///
613  /// Currently this is used to add the noalias annotations based on the
614  /// inserted memchecks. Use this for instructions that are *cloned* into the
615  /// vector loop.
616  void addNewMetadata(Instruction *To, const Instruction *Orig);
617 
618  /// Add metadata from one instruction to another.
619  ///
620  /// This includes both the original MDs from \p From and additional ones (\see
621  /// addNewMetadata). Use this for *newly created* instructions in the vector
622  /// loop.
624 
625  /// Similar to the previous function but it adds the metadata to a
626  /// vector of instructions.
628 
629  /// The original loop.
631 
632  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
633  /// dynamic knowledge to simplify SCEV expressions and converts them to a
634  /// more usable form.
636 
637  /// Loop Info.
639 
640  /// Dominator Tree.
642 
643  /// Alias Analysis.
645 
646  /// Target Library Info.
648 
649  /// Target Transform Info.
651 
652  /// Assumption Cache.
654 
655  /// Interface to emit optimization remarks.
657 
658  /// LoopVersioning. It's only set up (non-null) if memchecks were
659  /// used.
660  ///
661  /// This is currently only used to add no-alias metadata based on the
662 /// memchecks. The actual versioning is performed manually.
663  std::unique_ptr<LoopVersioning> LVer;
664 
665  /// The vectorization SIMD factor to use. Each vector will have this many
666  /// vector elements.
667  unsigned VF;
668 
669  /// The vectorization unroll factor to use. Each scalar is vectorized to this
670  /// many different vector instructions.
671  unsigned UF;
672 
673  /// The builder that we use
675 
676  // --- Vectorization state ---
677 
678  /// The vector-loop preheader.
680 
681  /// The scalar-loop preheader.
683 
684 /// Middle Block between the vector and the scalar loop.
686 
687  /// The ExitBlock of the scalar loop.
689 
690  /// The vector loop body.
692 
693  /// The scalar loop body.
695 
696  /// A list of all bypass blocks. The first block is the entry of the loop.
698 
699  /// The new Induction variable which was added to the new block.
700  PHINode *Induction = nullptr;
701 
702  /// The induction variable of the old basic block.
703  PHINode *OldInduction = nullptr;
704 
705  /// Maps values from the original loop to their corresponding values in the
706  /// vectorized loop. A key value can map to either vector values, scalar
707  /// values or both kinds of values, depending on whether the key was
708  /// vectorized and scalarized.
710 
711  /// Store instructions that were predicated.
713 
714  /// Trip count of the original loop.
715  Value *TripCount = nullptr;
716 
717  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
718  Value *VectorTripCount = nullptr;
719 
720  /// The legality analysis.
722 
723 /// The profitability analysis.
725 
726  // Record whether runtime checks are added.
727  bool AddedSafetyChecks = false;
728 
729  // Holds the end values for each induction variable. We save the end values
730  // so we can later fix-up the external users of the induction variables.
732 
733  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
734  // fixed up at the end of vector code generation.
736 };
737 
739 public:
742  const TargetLibraryInfo *TLI,
744  OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
747  : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
748  UnrollFactor, LVL, CM) {}
749 
750 private:
751  Value *getBroadcastInstrs(Value *V) override;
752  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
753  Instruction::BinaryOps Opcode =
754  Instruction::BinaryOpsEnd) override;
755  Value *reverseVector(Value *Vec) override;
756 };
757 
758 } // end namespace llvm
759 
760 /// Look for a meaningful debug location on the instruction or its
761 /// operands.
763  if (!I)
764  return I;
765 
766  DebugLoc Empty;
767  if (I->getDebugLoc() != Empty)
768  return I;
769 
770  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
771  if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
772  if (OpInst->getDebugLoc() != Empty)
773  return OpInst;
774  }
775 
776  return I;
777 }
778 
780  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
781  const DILocation *DIL = Inst->getDebugLoc();
782  if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
783  !isa<DbgInfoIntrinsic>(Inst)) {
784  auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
785  if (NewDIL)
786  B.SetCurrentDebugLocation(NewDIL.getValue());
787  else
788  LLVM_DEBUG(dbgs()
789  << "Failed to create new discriminator: "
790  << DIL->getFilename() << " Line: " << DIL->getLine());
791  }
792  else
793  B.SetCurrentDebugLocation(DIL);
794  } else
795  B.SetCurrentDebugLocation(DebugLoc());
796 }
797 
798 #ifndef NDEBUG
799 /// \return string containing a file name and a line # for the given loop.
800 static std::string getDebugLocString(const Loop *L) {
801  std::string Result;
802  if (L) {
803  raw_string_ostream OS(Result);
804  if (const DebugLoc LoopDbgLoc = L->getStartLoc())
805  LoopDbgLoc.print(OS);
806  else
807  // Just print the module name.
808  OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
809  OS.flush();
810  }
811  return Result;
812 }
813 #endif
814 
816  const Instruction *Orig) {
817  // If the loop was versioned with memchecks, add the corresponding no-alias
818  // metadata.
819  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
820  LVer->annotateInstWithNoAlias(To, Orig);
821 }
822 
824  Instruction *From) {
825  propagateMetadata(To, From);
826  addNewMetadata(To, From);
827 }
828 
830  Instruction *From) {
831  for (Value *V : To) {
832  if (Instruction *I = dyn_cast<Instruction>(V))
833  addMetadata(I, From);
834  }
835 }
836 
837 namespace llvm {
838 
839 /// LoopVectorizationCostModel - estimates the expected speedups due to
840 /// vectorization.
841 /// In many cases vectorization is not profitable. This can happen for
842 /// a number of reasons. In this class we mainly attempt to predict the
843 /// expected speedup/slowdowns due to the supported instruction set. We use the
844 /// TargetTransformInfo to query the different backends for the cost of
845 /// different operations.
847 public:
850  const TargetTransformInfo &TTI,
851  const TargetLibraryInfo *TLI, DemandedBits *DB,
854  const LoopVectorizeHints *Hints,
856  : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
857  AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
858 
859  /// \return An upper bound for the vectorization factor, or None if
860  /// vectorization and interleaving should be avoided up front.
861  Optional<unsigned> computeMaxVF(bool OptForSize);
862 
863  /// \return The most profitable vectorization factor and the cost of that VF.
864  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
865  /// then this vectorization factor will be selected if vectorization is
866  /// possible.
867  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
868 
869  /// Setup cost-based decisions for user vectorization factor.
870  void selectUserVectorizationFactor(unsigned UserVF) {
871  collectUniformsAndScalars(UserVF);
872  collectInstsToScalarize(UserVF);
873  }
874 
875  /// \return The size (in bits) of the smallest and widest types in the code
876  /// that needs to be vectorized. We ignore values that remain scalar such as
877  /// 64 bit loop indices.
878  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
879 
880  /// \return The desired interleave count.
881  /// If interleave count has been specified by metadata it will be returned.
882  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
883  /// are the selected vectorization factor and the cost of the selected VF.
884  unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
885  unsigned LoopCost);
886 
887  /// A memory access instruction may be vectorized in more than one way.
888  /// The form of the instruction after vectorization depends on cost.
889  /// This function makes cost-based decisions for Load/Store instructions
890  /// and collects them in a map. This decision map is used for building
891  /// the lists of loop-uniform and loop-scalar instructions.
892  /// The calculated cost is saved with the widening decision in order to
893  /// avoid redundant calculations.
894  void setCostBasedWideningDecision(unsigned VF);
895 
896  /// A struct that represents some properties of the register usage
897  /// of a loop.
898  struct RegisterUsage {
899  /// Holds the number of loop invariant values that are used in the loop.
901 
902  /// Holds the maximum number of concurrent live intervals in the loop.
903  unsigned MaxLocalUsers;
904  };
905 
906  /// \return Returns information about the register usages of the loop for the
907  /// given vectorization factors.
908  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
909 
910  /// Collect values we want to ignore in the cost model.
911  void collectValuesToIgnore();
912 
913  /// \returns The smallest bitwidth each instruction can be represented with.
914  /// The vector equivalents of these instructions should be truncated to this
915  /// type.
917  return MinBWs;
918  }
919 
920  /// \returns True if it is more profitable to scalarize instruction \p I for
921  /// vectorization factor \p VF.
922  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
923  assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
924 
925  // Cost model is not run in the VPlan-native path - return conservative
926  // result until this changes.
928  return false;
929 
930  auto Scalars = InstsToScalarize.find(VF);
931  assert(Scalars != InstsToScalarize.end() &&
932  "VF not yet analyzed for scalarization profitability");
933  return Scalars->second.find(I) != Scalars->second.end();
934  }
935 
936  /// Returns true if \p I is known to be uniform after vectorization.
937  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
938  if (VF == 1)
939  return true;
940 
941  // Cost model is not run in the VPlan-native path - return conservative
942  // result until this changes.
944  return false;
945 
946  auto UniformsPerVF = Uniforms.find(VF);
947  assert(UniformsPerVF != Uniforms.end() &&
948  "VF not yet analyzed for uniformity");
949  return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
950  }
951 
952  /// Returns true if \p I is known to be scalar after vectorization.
953  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
954  if (VF == 1)
955  return true;
956 
957  // Cost model is not run in the VPlan-native path - return conservative
958  // result until this changes.
960  return false;
961 
962  auto ScalarsPerVF = Scalars.find(VF);
963  assert(ScalarsPerVF != Scalars.end() &&
964  "Scalar values are not calculated for VF");
965  return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
966  }
967 
968  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
969  /// for vectorization factor \p VF.
970  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
971  return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
972  !isProfitableToScalarize(I, VF) &&
973  !isScalarAfterVectorization(I, VF);
974  }
975 
976  /// Decision that was taken during cost calculation for memory instruction.
977  enum InstWidening {
978  CM_Unknown,
979  CM_Widen, // For consecutive accesses with stride +1.
980  CM_Widen_Reverse, // For consecutive accesses with stride -1.
981  CM_Interleave,
982  CM_GatherScatter,
983  CM_Scalarize
984  };
985 
986  /// Save vectorization decision \p W and \p Cost taken by the cost model for
987  /// instruction \p I and vector width \p VF.
989  unsigned Cost) {
990  assert(VF >= 2 && "Expected VF >=2");
991  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
992  }
993 
994  /// Save vectorization decision \p W and \p Cost taken by the cost model for
995  /// interleaving group \p Grp and vector width \p VF.
996  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
997  InstWidening W, unsigned Cost) {
998  assert(VF >= 2 && "Expected VF >=2");
999  /// Broadcast this decision to all instructions inside the group.
1000  /// But the cost will be assigned to one instruction only.
1001  for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1002  if (auto *I = Grp->getMember(i)) {
1003  if (Grp->getInsertPos() == I)
1004  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1005  else
1006  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1007  }
1008  }
1009  }
1010 
1011  /// Return the cost model decision for the given instruction \p I and vector
1012  /// width \p VF. Return CM_Unknown if this instruction did not pass
1013  /// through the cost modeling.
1015  assert(VF >= 2 && "Expected VF >=2");
1016 
1017  // Cost model is not run in the VPlan-native path - return conservative
1018  // result until this changes.
1020  return CM_GatherScatter;
1021 
1022  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1023  auto Itr = WideningDecisions.find(InstOnVF);
1024  if (Itr == WideningDecisions.end())
1025  return CM_Unknown;
1026  return Itr->second.first;
1027  }
1028 
1029  /// Return the vectorization cost for the given instruction \p I and vector
1030  /// width \p VF.
1031  unsigned getWideningCost(Instruction *I, unsigned VF) {
1032  assert(VF >= 2 && "Expected VF >=2");
1033  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1034  assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1035  "The cost is not calculated");
1036  return WideningDecisions[InstOnVF].second;
1037  }
1038 
1039  /// Return True if instruction \p I is an optimizable truncate whose operand
1040  /// is an induction variable. Such a truncate will be removed by adding a new
1041  /// induction variable with the destination type.
1042  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1043  // If the instruction is not a truncate, return false.
1044  auto *Trunc = dyn_cast<TruncInst>(I);
1045  if (!Trunc)
1046  return false;
1047 
1048  // Get the source and destination types of the truncate.
1049  Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1050  Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1051 
1052  // If the truncate is free for the given types, return false. Replacing a
1053  // free truncate with an induction variable would add an induction variable
1054  // update instruction to each iteration of the loop. We exclude from this
1055  // check the primary induction variable since it will need an update
1056  // instruction regardless.
1057  Value *Op = Trunc->getOperand(0);
1058  if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1059  return false;
1060 
1061  // If the truncated value is not an induction variable, return false.
1062  return Legal->isInductionPhi(Op);
1063  }
1064 
1065  /// Collects the instructions to scalarize for each predicated instruction in
1066  /// the loop.
1067  void collectInstsToScalarize(unsigned VF);
1068 
1069  /// Collect Uniform and Scalar values for the given \p VF.
1070  /// The sets depend on CM decision for Load/Store instructions
1071  /// that may be vectorized as interleave, gather-scatter or scalarized.
1072  void collectUniformsAndScalars(unsigned VF) {
1073  // Do the analysis once.
1074  if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1075  return;
1076  setCostBasedWideningDecision(VF);
1077  collectLoopUniforms(VF);
1078  collectLoopScalars(VF);
1079  }
1080 
1081  /// Returns true if the target machine supports masked store operation
1082  /// for the given \p DataType and kind of access to \p Ptr.
1084  return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
1085  }
1086 
1087  /// Returns true if the target machine supports masked load operation
1088  /// for the given \p DataType and kind of access to \p Ptr.
1090  return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
1091  }
1092 
1093  /// Returns true if the target machine supports masked scatter operation
1094  /// for the given \p DataType.
1096  return TTI.isLegalMaskedScatter(DataType);
1097  }
1098 
1099  /// Returns true if the target machine supports masked gather operation
1100  /// for the given \p DataType.
1102  return TTI.isLegalMaskedGather(DataType);
1103  }
1104 
1105  /// Returns true if the target machine can represent \p V as a masked gather
1106  /// or scatter operation.
1108  bool LI = isa<LoadInst>(V);
1109  bool SI = isa<StoreInst>(V);
1110  if (!LI && !SI)
1111  return false;
1112  auto *Ty = getMemInstValueType(V);
1113  return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1114  }
1115 
1116  /// Returns true if \p I is an instruction that will be scalarized with
1117  /// predication. Such instructions include conditional stores and
1118  /// instructions that may divide by zero.
1119  /// If a non-zero VF has been calculated, we check if I will be scalarized
1120  /// with predication for that VF.
1121  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1122 
1123  // Returns true if \p I is an instruction that will be predicated either
1124  // through scalar predication or masked load/store or masked gather/scatter.
1125  // Superset of instructions that return true for isScalarWithPredication.
1127  if (!blockNeedsPredication(I->getParent()))
1128  return false;
1129  // Loads and stores that need some form of masked operation are predicated
1130  // instructions.
1131  if (isa<LoadInst>(I) || isa<StoreInst>(I))
1132  return Legal->isMaskRequired(I);
1133  return isScalarWithPredication(I);
1134  }
1135 
1136  /// Returns true if \p I is a memory instruction with consecutive memory
1137  /// access that can be widened.
1138  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1139 
1140  /// Returns true if \p I is a memory instruction in an interleaved-group
1141  /// of memory accesses that can be vectorized with wide vector loads/stores
1142  /// and shuffles.
1143  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1144 
1145  /// Check if \p Instr belongs to any interleaved access group.
1147  return InterleaveInfo.isInterleaved(Instr);
1148  }
1149 
1150  /// Get the interleaved access group that \p Instr belongs to.
1153  return InterleaveInfo.getInterleaveGroup(Instr);
1154  }
1155 
1156  /// Returns true if an interleaved group requires a scalar iteration
1157  /// to handle accesses with gaps, and there is nothing preventing us from
1158  /// creating a scalar epilogue.
1159  bool requiresScalarEpilogue() const {
1160  return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue();
1161  }
1162 
1163  /// Returns true if a scalar epilogue is allowed (it may be disallowed due to optsize).
1164  bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; }
1165 
1166  /// Returns true if all loop blocks should be masked to fold tail loop.
1167  bool foldTailByMasking() const { return FoldTailByMasking; }
1168 
1170  return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1171  }
1172 
1173  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1174  /// with factor VF. Return the cost of the instruction, including
1175  /// scalarization overhead if it's needed.
1176  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1177 
1178  /// Estimate cost of a call instruction CI if it were vectorized with factor
1179  /// VF. Return the cost of the instruction, including scalarization overhead
1180  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1181  /// scalarized -
1182  /// i.e. either a vector version isn't available, or it is too expensive.
1183  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1184 
1185 private:
1186  unsigned NumPredStores = 0;
1187 
1188  /// \return An upper bound for the vectorization factor, larger than zero.
1189  /// One is returned if vectorization should best be avoided due to cost.
1190  unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
1191 
1192  /// The vectorization cost is a combination of the cost itself and a boolean
1193  /// indicating whether any of the contributing operations will actually
1194  /// operate on vector values after type legalization in the backend.
1195  /// If this latter value is false, then all operations will be scalarized
1196  /// (i.e. no vectorization has actually taken place).
1199  using VectorizationCostTy = std::pair<unsigned, bool>;
1200 
1201  /// Returns the expected execution cost. The unit of the cost does
1202  /// not matter because we use the 'cost' units to compare different
1203  /// vector widths. The cost that is returned is *not* normalized by
1204  /// the factor width.
1205  VectorizationCostTy expectedCost(unsigned VF);
1206 
1207  /// Returns the execution time cost of an instruction for a given vector
1208  /// width. Vector width of one means scalar.
1209  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1210 
1211  /// The cost-computation logic from getInstructionCost which provides
1212  /// the vector type as an output parameter.
1213  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1214 
1215  /// Calculate vectorization cost of memory instruction \p I.
1216  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1217 
1218  /// The cost computation for scalarized memory instruction.
1219  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1220 
1221  /// The cost computation for interleaving group of memory instructions.
1222  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1223 
1224  /// The cost computation for Gather/Scatter instruction.
1225  unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1226 
1227  /// The cost computation for widening instruction \p I with consecutive
1228  /// memory access.
1229  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1230 
1231  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1232  /// Load: scalar load + broadcast.
1233  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1234  /// element)
1235  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1236 
1237  /// Estimate the overhead of scalarizing an instruction. This is a
1238  /// convenience wrapper for the type-based getScalarizationOverhead API.
1239  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1240 
1241  /// Returns whether the instruction is a load or store and will be emitted
1242  /// as a vector operation.
1243  bool isConsecutiveLoadOrStore(Instruction *I);
1244 
1245  /// Returns true if an artificially high cost for emulated masked memrefs
1246  /// should be used.
1247  bool useEmulatedMaskMemRefHack(Instruction *I);
1248 
1249  /// Create an analysis remark that explains why vectorization failed
1250  ///
1251  /// \p RemarkName is the identifier for the remark. \return the remark object
1252  /// that can be streamed to.
1253  OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
1254  return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(),
1255  RemarkName, TheLoop);
1256  }
1257 
1258  /// Map of scalar integer values to the smallest bitwidth they can be legally
1259  /// represented as. The vector equivalents of these values should be truncated
1260  /// to this type.
1262 
1263  /// A type representing the costs for instructions if they were to be
1264  /// scalarized rather than vectorized. The entries are Instruction-Cost
1265  /// pairs.
1267 
1268  /// A set containing all BasicBlocks that are known to be present after
1269  /// vectorization as predicated blocks.
1270  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1271 
1272  /// Records whether it is allowed to have the original scalar loop execute at
1273  /// least once. This may be needed as a fallback loop in case runtime
1274  /// aliasing/dependence checks fail, or to handle the tail/remainder
1275  /// iterations when the trip count is unknown or is not a multiple of the VF,
1276  /// or as a peel-loop to handle gaps in interleave-groups.
1277  /// Under optsize and when the trip count is very small we don't allow any
1278  /// iterations to execute in the scalar loop.
1279  bool IsScalarEpilogueAllowed = true;
1280 
1281  /// All blocks of loop are to be masked to fold tail of scalar iterations.
1282  bool FoldTailByMasking = false;
1283 
1284  /// A map holding scalar costs for different vectorization factors. The
1285  /// presence of a cost for an instruction in the mapping indicates that the
1286  /// instruction will be scalarized when vectorizing with the associated
1287  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1288  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1289 
1290  /// Holds the instructions known to be uniform after vectorization.
1291  /// The data is collected per VF.
1293 
1294  /// Holds the instructions known to be scalar after vectorization.
1295  /// The data is collected per VF.
1297 
1298  /// Holds the instructions (address computations) that are forced to be
1299  /// scalarized.
1301 
1302  /// Returns the expected difference in cost from scalarizing the expression
1303  /// feeding a predicated instruction \p PredInst. The instructions to
1304  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1305  /// non-negative return value implies the expression will be scalarized.
1306  /// Currently, only single-use chains are considered for scalarization.
1307  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1308  unsigned VF);
1309 
1310  /// Collect the instructions that are uniform after vectorization. An
1311  /// instruction is uniform if we represent it with a single scalar value in
1312  /// the vectorized loop corresponding to each vector iteration. Examples of
1313  /// uniform instructions include pointer operands of consecutive or
1314  /// interleaved memory accesses. Note that although uniformity implies an
1315  /// instruction will be scalar, the reverse is not true. In general, a
1316  /// scalarized instruction will be represented by VF scalar values in the
1317  /// vectorized loop, each corresponding to an iteration of the original
1318  /// scalar loop.
1319  void collectLoopUniforms(unsigned VF);
1320 
1321  /// Collect the instructions that are scalar after vectorization. An
1322  /// instruction is scalar if it is known to be uniform or will be scalarized
1323  /// during vectorization. Non-uniform scalarized instructions will be
1324  /// represented by VF values in the vectorized loop, each corresponding to an
1325  /// iteration of the original scalar loop.
1326  void collectLoopScalars(unsigned VF);
1327 
1328  /// Keeps cost model vectorization decision and cost for instructions.
1329  /// Right now it is used for memory instructions only.
1331  std::pair<InstWidening, unsigned>>;
1332 
1333  DecisionList WideningDecisions;
1334 
1335  /// Returns true if \p V is expected to be vectorized and it needs to be
1336  /// extracted.
1337  bool needsExtract(Value *V, unsigned VF) const {
1338  Instruction *I = dyn_cast<Instruction>(V);
1339  if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1340  return false;
1341 
1342  // Assume we can vectorize V (and hence we need extraction) if the
1343  // scalars are not computed yet. This can happen, because it is called
1344  // via getScalarizationOverhead from setCostBasedWideningDecision, before
1345  // the scalars are collected. That should be a safe assumption in most
1346  // cases, because we check if the operands have vectorizable types
1347  // beforehand in LoopVectorizationLegality.
1348  return Scalars.find(VF) == Scalars.end() ||
1349  !isScalarAfterVectorization(I, VF);
1350  };
1351 
1352  /// Returns a range containing only operands needing to be extracted.
1353  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1354  unsigned VF) {
1356  Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1357  }
1358 
1359 public:
1360  /// The loop that we evaluate.
1362 
1363  /// Predicated scalar evolution analysis.
1365 
1366  /// Loop Info analysis.
1368 
1369  /// Vectorization legality.
1371 
1372  /// Vector target information.
1374 
1375  /// Target Library Info.
1377 
1378  /// Demanded bits analysis.
1380 
1381  /// Assumption cache.
1383 
1384  /// Interface to emit optimization remarks.
1386 
1388 
1389  /// Loop Vectorize Hint.
1391 
1392  /// The interleave access information contains groups of interleaved accesses
1393  /// with the same stride and close to each other.
1395 
1396  /// Values to ignore in the cost model.
1398 
1399  /// Values to ignore in the cost model when VF > 1.
1401 };
1402 
1403 } // end namespace llvm
1404 
1405 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1406 // vectorization. The loop needs to be annotated with #pragma omp simd
1407 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
1408 // vector length information is not provided, vectorization is not considered
1409 // explicit. Interleave hints are not allowed either. These limitations will be
1410 // relaxed in the future.
1411 // Please note that we are currently forced to abuse the pragma 'clang
1412 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1413 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1414 // provides *explicit vectorization hints* (LV can bypass legal checks and
1415 // assume that vectorization is legal). However, both hints are implemented
1416 // using the same metadata (llvm.loop.vectorize, processed by
1417 // LoopVectorizeHints). This will be fixed in the future when the native IR
1418 // representation for pragma 'omp simd' is introduced.
1419 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1421  assert(!OuterLp->empty() && "This is not an outer loop");
1422  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1423 
1424  // Only outer loops with an explicit vectorization hint are supported.
1425  // Unannotated outer loops are ignored.
1426  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1427  return false;
1428 
1429  Function *Fn = OuterLp->getHeader()->getParent();
1430  if (!Hints.allowVectorization(Fn, OuterLp,
1431  true /*VectorizeOnlyWhenForced*/)) {
1432  LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1433  return false;
1434  }
1435 
1436  if (Hints.getInterleave() > 1) {
1437  // TODO: Interleave support is future work.
1438  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1439  "outer loops.\n");
1440  Hints.emitRemarkWithHints();
1441  return false;
1442  }
1443 
1444  return true;
1445 }
1446 
1450  // Collect inner loops and outer loops without irreducible control flow. For
1451  // now, only collect outer loops that have explicit vectorization hints. If we
1452  // are stress testing the VPlan H-CFG construction, we collect the outermost
1453  // loop of every loop nest.
1454  if (L.empty() || VPlanBuildStressTest ||
1456  LoopBlocksRPO RPOT(&L);
1457  RPOT.perform(LI);
1458  if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1459  V.push_back(&L);
1460  // TODO: Collect inner loops inside marked outer loops in case
1461  // vectorization fails for the outer loop. Do not invoke
1462  // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1463  // already known to be reducible. We can use an inherited attribute for
1464  // that.
1465  return;
1466  }
1467  }
1468  for (Loop *InnerL : L)
1469  collectSupportedLoops(*InnerL, LI, ORE, V);
1470 }
1471 
1472 namespace {
1473 
1474 /// The LoopVectorize Pass.
1475 struct LoopVectorize : public FunctionPass {
1476  /// Pass identification, replacement for typeid
1477  static char ID;
1478 
1479  LoopVectorizePass Impl;
1480 
1481  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1482  bool VectorizeOnlyWhenForced = false)
1483  : FunctionPass(ID) {
1484  Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1485  Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1486  initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1487  }
1488 
1489  bool runOnFunction(Function &F) override {
1490  if (skipFunction(F))
1491  return false;
1492 
1493  auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1494  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1495  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1496  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1497  auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1498  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1499  auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
1500  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1501  auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1502  auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1503  auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1504  auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1505  auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1506 
1507  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1508  [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1509 
1510  return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1511  GetLAA, *ORE, PSI);
1512  }
1513 
1514  void getAnalysisUsage(AnalysisUsage &AU) const override {
1515  AU.addRequired<AssumptionCacheTracker>();
1516  AU.addRequired<BlockFrequencyInfoWrapperPass>();
1517  AU.addRequired<DominatorTreeWrapperPass>();
1518  AU.addRequired<LoopInfoWrapperPass>();
1519  AU.addRequired<ScalarEvolutionWrapperPass>();
1520  AU.addRequired<TargetTransformInfoWrapperPass>();
1521  AU.addRequired<AAResultsWrapperPass>();
1522  AU.addRequired<LoopAccessLegacyAnalysis>();
1523  AU.addRequired<DemandedBitsWrapperPass>();
1524  AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1525 
1526  // We currently do not preserve loopinfo/dominator analyses with outer loop
1527  // vectorization. Until this is addressed, mark these analyses as preserved
1528  // only for non-VPlan-native path.
1529  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1530  if (!EnableVPlanNativePath) {
1531  AU.addPreserved<LoopInfoWrapperPass>();
1532  AU.addPreserved<DominatorTreeWrapperPass>();
1533  }
1534 
1535  AU.addPreserved<BasicAAWrapperPass>();
1536  AU.addPreserved<GlobalsAAWrapperPass>();
1537  AU.addRequired<ProfileSummaryInfoWrapperPass>();
1538  }
1539 };
1540 
1541 } // end anonymous namespace
1542 
1543 //===----------------------------------------------------------------------===//
1544 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1545 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1546 //===----------------------------------------------------------------------===//
1547 
1548 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1549  // We need to place the broadcast of invariant variables outside the loop,
1550  // but only if it's proven safe to do so. Else, broadcast will be inside
1551  // vector loop body.
1552  Instruction *Instr = dyn_cast<Instruction>(V);
1553  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1554  (!Instr ||
1555  DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1556  // Place the code for broadcasting invariant variables in the new preheader.
1557  IRBuilder<>::InsertPointGuard Guard(Builder);
1558  if (SafeToHoist)
1559  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1560 
1561  // Broadcast the scalar into all locations in the vector.
1562  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1563 
1564  return Shuf;
1565 }
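// For reference, with VF = 4 and an i32 scalar %x, the splat created above is
// typically emitted as the usual insertelement/shufflevector idiom:
//
//   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> undef, <4 x i32> zeroinitializer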
1566 
1567 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1568  const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1569  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1570  "Expected either an induction phi-node or a truncate of it!");
1571  Value *Start = II.getStartValue();
1572 
1573  // Construct the initial value of the vector IV in the vector loop preheader
1574  auto CurrIP = Builder.saveIP();
1575  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1576  if (isa<TruncInst>(EntryVal)) {
1577  assert(Start->getType()->isIntegerTy() &&
1578  "Truncation requires an integer type");
1579  auto *TruncType = cast<IntegerType>(EntryVal->getType());
1580  Step = Builder.CreateTrunc(Step, TruncType);
1581  Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1582  }
1583  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1584  Value *SteppedStart =
1585  getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1586 
1587  // We create vector phi nodes for both integer and floating-point induction
1588  // variables. Here, we determine the kind of arithmetic we will perform.
1589  Instruction::BinaryOps AddOp;
1590  Instruction::BinaryOps MulOp;
1591  if (Step->getType()->isIntegerTy()) {
1592  AddOp = Instruction::Add;
1593  MulOp = Instruction::Mul;
1594  } else {
1595  AddOp = II.getInductionOpcode();
1596  MulOp = Instruction::FMul;
1597  }
1598 
1599  // Multiply the vectorization factor by the step using integer or
1600  // floating-point arithmetic as appropriate.
1601  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1602  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1603 
1604  // Create a vector splat to use in the induction update.
1605  //
1606  // FIXME: If the step is non-constant, we create the vector splat with
1607  // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1608  // handle a constant vector splat.
1609  Value *SplatVF = isa<Constant>(Mul)
1610  ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1611  : Builder.CreateVectorSplat(VF, Mul);
1612  Builder.restoreIP(CurrIP);
1613 
1614  // We may need to add the step a number of times, depending on the unroll
1615  // factor. The last of those goes into the PHI.
1616  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1617  &*LoopVectorBody->getFirstInsertionPt());
1618  VecInd->setDebugLoc(EntryVal->getDebugLoc());
1619  Instruction *LastInduction = VecInd;
1620  for (unsigned Part = 0; Part < UF; ++Part) {
1621  VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1622 
1623  if (isa<TruncInst>(EntryVal))
1624  addMetadata(LastInduction, EntryVal);
1625  recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1626 
1627  LastInduction = cast<Instruction>(addFastMathFlag(
1628  Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1629  LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1630  }
1631 
1632  // Move the last step to the end of the latch block. This ensures consistent
1633  // placement of all induction updates.
1634  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1635  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1636  auto *ICmp = cast<Instruction>(Br->getCondition());
1637  LastInduction->moveBefore(ICmp);
1638  LastInduction->setName("vec.ind.next");
1639 
1640  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1641  VecInd->addIncoming(LastInduction, LoopVectorLatch);
1642 }
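// For illustration, with an i32 induction starting at 0 with step 1, VF = 4
// and UF = 2, the code above produces roughly:
//
//   %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
//                                 [ %vec.ind.next, %latch ]
//   %step.add     = add <4 x i32> %vec.ind, <i32 4, ...>    ; unroll part 1
//   %vec.ind.next = add <4 x i32> %step.add, <i32 4, ...>   ; moved to latch
//
// i.e. each unroll part advances by VF * Step, and the PHI advances by
// VF * UF * Step per vector iteration.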
1643 
1644 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1645  return Cost->isScalarAfterVectorization(I, VF) ||
1646  Cost->isProfitableToScalarize(I, VF);
1647 }
1648 
1649 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1650  if (shouldScalarizeInstruction(IV))
1651  return true;
1652  auto isScalarInst = [&](User *U) -> bool {
1653  auto *I = cast<Instruction>(U);
1654  return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1655  };
1656  return llvm::any_of(IV->users(), isScalarInst);
1657 }
1658 
1659 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1660  const InductionDescriptor &ID, const Instruction *EntryVal,
1661  Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1662  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1663  "Expected either an induction phi-node or a truncate of it!");
1664 
1665  // This induction variable is not the phi from the original loop but the
1666  // newly-created IV based on the proof that casted Phi is equal to the
1667  // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
1668  // re-uses the same InductionDescriptor that original IV uses but we don't
1669  // have to do any recording in this case - that is done when original IV is
1670  // processed.
1671  if (isa<TruncInst>(EntryVal))
1672  return;
1673 
1674  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1675  if (Casts.empty())
1676  return;
1677  // Only the first Cast instruction in the Casts vector is of interest.
1678  // The rest of the Casts (if exist) have no uses outside the
1679  // induction update chain itself.
1680  Instruction *CastInst = *Casts.begin();
1681  if (Lane < UINT_MAX)
1682  VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1683  else
1684  VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1685 }
1686 
1687 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1688  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1689  "Primary induction variable must have an integer type");
1690 
1691  auto II = Legal->getInductionVars()->find(IV);
1692  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1693 
1694  auto ID = II->second;
1695  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1696 
1697  // The scalar value to broadcast. This will be derived from the canonical
1698  // induction variable.
1699  Value *ScalarIV = nullptr;
1700 
1701  // The value from the original loop to which we are mapping the new induction
1702  // variable.
1703  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1704 
1705  // True if we have vectorized the induction variable.
1706  auto VectorizedIV = false;
1707 
1708  // Determine if we want a scalar version of the induction variable. This is
1709  // true if the induction variable itself is not widened, or if it has at
1710  // least one user in the loop that is not widened.
1711  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1712 
1713  // Generate code for the induction step. Note that induction steps are
1714  // required to be loop-invariant
1715  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1716  "Induction step should be loop invariant");
1717  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1718  Value *Step = nullptr;
1719  if (PSE.getSE()->isSCEVable(IV->getType())) {
1720  SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1721  Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1722  &*Builder.GetInsertPoint());
1723  } else {
1724  Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1725  }
1726 
1727  // Try to create a new independent vector induction variable. If we can't
1728  // create the phi node, we will splat the scalar induction variable in each
1729  // loop iteration.
1730  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1731  createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1732  VectorizedIV = true;
1733  }
1734 
1735  // If we haven't yet vectorized the induction variable, or if we will create
1736  // a scalar one, we need to define the scalar induction variable and step
1737  // values. If we were given a truncation type, truncate the canonical
1738  // induction variable and step. Otherwise, derive these values from the
1739  // induction descriptor.
1740  if (!VectorizedIV || NeedsScalarIV) {
1741  ScalarIV = Induction;
1742  if (IV != OldInduction) {
1743  ScalarIV = IV->getType()->isIntegerTy()
1744  ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1745  : Builder.CreateCast(Instruction::SIToFP, Induction,
1746  IV->getType());
1747  ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1748  ScalarIV->setName("offset.idx");
1749  }
1750  if (Trunc) {
1751  auto *TruncType = cast<IntegerType>(Trunc->getType());
1752  assert(Step->getType()->isIntegerTy() &&
1753  "Truncation requires an integer step");
1754  ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1755  Step = Builder.CreateTrunc(Step, TruncType);
1756  }
1757  }
1758 
1759  // If we haven't yet vectorized the induction variable, splat the scalar
1760  // induction variable, and build the necessary step vectors.
1761  // TODO: Don't do it unless the vectorized IV is really required.
1762  if (!VectorizedIV) {
1763  Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1764  for (unsigned Part = 0; Part < UF; ++Part) {
1765  Value *EntryPart =
1766  getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1767  VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1768  if (Trunc)
1769  addMetadata(EntryPart, Trunc);
1770  recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1771  }
1772  }
1773 
1774  // If an induction variable is only used for counting loop iterations or
1775  // calculating addresses, it doesn't need to be widened. Create scalar steps
1776  // that can be used by instructions we will later scalarize. Note that the
1777  // addition of the scalar steps will not increase the number of instructions
1778  // in the loop in the common case prior to InstCombine. We will be trading
1779  // one vector extract for each scalar step.
1780  if (NeedsScalarIV)
1781  buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1782 }
1783 
1784 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1785  Instruction::BinaryOps BinOp) {
1786  // Create and check the types.
1787  assert(Val->getType()->isVectorTy() && "Must be a vector");
1788  int VLen = Val->getType()->getVectorNumElements();
1789 
1790  Type *STy = Val->getType()->getScalarType();
1791  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1792  "Induction Step must be an integer or FP");
1793  assert(Step->getType() == STy && "Step has wrong type");
1794 
1795  SmallVector<Constant *, 8> Indices;
1796 
1797  if (STy->isIntegerTy()) {
1798  // Create a vector of consecutive numbers from zero to VF.
1799  for (int i = 0; i < VLen; ++i)
1800  Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1801 
1802  // Add the consecutive indices to the vector value.
1803  Constant *Cv = ConstantVector::get(Indices);
1804  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1805  Step = Builder.CreateVectorSplat(VLen, Step);
1806  assert(Step->getType() == Val->getType() && "Invalid step vec");
1807  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1808  // which can be found from the original scalar operations.
1809  Step = Builder.CreateMul(Cv, Step);
1810  return Builder.CreateAdd(Val, Step, "induction");
1811  }
1812 
1813  // Floating point induction.
1814  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1815  "Binary Opcode should be specified for FP induction");
1816  // Create a vector of consecutive numbers from zero to VF.
1817  for (int i = 0; i < VLen; ++i)
1818  Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1819 
1820  // Add the consecutive indices to the vector value.
1821  Constant *Cv = ConstantVector::get(Indices);
1822 
1823  Step = Builder.CreateVectorSplat(VLen, Step);
1824 
1825  // Floating point operations had to be 'fast' to enable the induction.
1826  FastMathFlags Flags;
1827  Flags.setFast();
1828 
1829  Value *MulOp = Builder.CreateFMul(Cv, Step);
1830  if (isa<Instruction>(MulOp))
1831  // Have to check, MulOp may be a constant
1832  cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1833 
1834  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1835  if (isa<Instruction>(BOp))
1836  cast<Instruction>(BOp)->setFastMathFlags(Flags);
1837  return BOp;
1838 }
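// For illustration, for an integer induction with VF = 4, a call
// getStepVector(splat(%x), /*StartIdx=*/4, /*Step=*/1) yields
// <%x + 4, %x + 5, %x + 6, %x + 7>; the callers above pass StartIdx = VF * Part
// so that each unroll part gets its own range of lane values.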
1839 
1840 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1841  Instruction *EntryVal,
1842  const InductionDescriptor &ID) {
1843  // We shouldn't have to build scalar steps if we aren't vectorizing.
1844  assert(VF > 1 && "VF should be greater than one");
1845 
1846  // Get the value type and ensure it and the step have the same integer type.
1847  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1848  assert(ScalarIVTy == Step->getType() &&
1849  "Val and Step should have the same type");
1850 
1851  // We build scalar steps for both integer and floating-point induction
1852  // variables. Here, we determine the kind of arithmetic we will perform.
1853  Instruction::BinaryOps AddOp;
1854  Instruction::BinaryOps MulOp;
1855  if (ScalarIVTy->isIntegerTy()) {
1856  AddOp = Instruction::Add;
1857  MulOp = Instruction::Mul;
1858  } else {
1859  AddOp = ID.getInductionOpcode();
1860  MulOp = Instruction::FMul;
1861  }
1862 
1863  // Determine the number of scalars we need to generate for each unroll
1864  // iteration. If EntryVal is uniform, we only need to generate the first
1865  // lane. Otherwise, we generate all VF values.
1866  unsigned Lanes =
1867  Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1868  : VF;
1869  // Compute the scalar steps and save the results in VectorLoopValueMap.
1870  for (unsigned Part = 0; Part < UF; ++Part) {
1871  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1872  auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1873  auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1874  auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1875  VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1876  recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1877  }
1878  }
1879 }
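// For illustration, with VF = 4, UF = 2 and step %s, the scalar steps recorded
// above are ScalarIV + {0,1,2,3} * %s for part 0 and ScalarIV + {4,5,6,7} * %s
// for part 1; if EntryVal is uniform after vectorization, only lane 0 of each
// part (ScalarIV + 0 * %s and ScalarIV + 4 * %s) is generated.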
1880 
1881 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1882  assert(V != Induction && "The new induction variable should not be used.");
1883  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1884  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1885 
1886  // If we have a stride that is replaced by one, do it here. Defer this for
1887  // the VPlan-native path until we start running Legal checks in that path.
1888  if (!EnableVPlanNativePath && Legal->hasStride(V))
1889  V = ConstantInt::get(V->getType(), 1);
1890 
1891  // If we have a vector mapped to this value, return it.
1892  if (VectorLoopValueMap.hasVectorValue(V, Part))
1893  return VectorLoopValueMap.getVectorValue(V, Part);
1894 
1895  // If the value has not been vectorized, check if it has been scalarized
1896  // instead. If it has been scalarized, and we actually need the value in
1897  // vector form, we will construct the vector values on demand.
1898  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
1899  Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
1900 
1901  // If we've scalarized a value, that value should be an instruction.
1902  auto *I = cast<Instruction>(V);
1903 
1904  // If we aren't vectorizing, we can just copy the scalar map values over to
1905  // the vector map.
1906  if (VF == 1) {
1907  VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
1908  return ScalarValue;
1909  }
1910 
1911  // Get the last scalar instruction we generated for V and Part. If the value
1912  // is known to be uniform after vectorization, this corresponds to lane zero
1913  // of the Part unroll iteration. Otherwise, the last instruction is the one
1914  // we created for the last vector lane of the Part unroll iteration.
1915  unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
1916  auto *LastInst = cast<Instruction>(
1917  VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
1918 
1919  // Set the insert point after the last scalarized instruction. This ensures
1920  // the insertelement sequence will directly follow the scalar definitions.
1921  auto OldIP = Builder.saveIP();
1922  auto NewIP = std::next(BasicBlock::iterator(LastInst));
1923  Builder.SetInsertPoint(&*NewIP);
1924 
1925  // However, if we are vectorizing, we need to construct the vector values.
1926  // If the value is known to be uniform after vectorization, we can just
1927  // broadcast the scalar value corresponding to lane zero for each unroll
1928  // iteration. Otherwise, we construct the vector values using insertelement
1929  // instructions. Since the resulting vectors are stored in
1930  // VectorLoopValueMap, we will only generate the insertelements once.
1931  Value *VectorValue = nullptr;
1932  if (Cost->isUniformAfterVectorization(I, VF)) {
1933  VectorValue = getBroadcastInstrs(ScalarValue);
1934  VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
1935  } else {
1936  // Initialize packing with insertelements to start from undef.
1937  Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
1938  VectorLoopValueMap.setVectorValue(V, Part, Undef);
1939  for (unsigned Lane = 0; Lane < VF; ++Lane)
1940  packScalarIntoVectorValue(V, {Part, Lane});
1941  VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
1942  }
1943  Builder.restoreIP(OldIP);
1944  return VectorValue;
1945  }
1946 
1947  // If this scalar is unknown, assume that it is a constant or that it is
1948  // loop invariant. Broadcast V and save the value for future uses.
1949  Value *B = getBroadcastInstrs(V);
1950  VectorLoopValueMap.setVectorValue(V, Part, B);
1951  return B;
1952 }
1953 
1954 Value *
1955 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
1956  const VPIteration &Instance) {
1957  // If the value is not an instruction contained in the loop, it should
1958  // already be scalar.
1959  if (OrigLoop->isLoopInvariant(V))
1960  return V;
1961 
1962  assert(Instance.Lane > 0
1963  ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
1964  : true && "Uniform values only have lane zero");
1965 
1966  // If the value from the original loop has not been vectorized, it is
1967  // represented by UF x VF scalar values in the new loop. Return the requested
1968  // scalar value.
1969  if (VectorLoopValueMap.hasScalarValue(V, Instance))
1970  return VectorLoopValueMap.getScalarValue(V, Instance);
1971 
1972  // If the value has not been scalarized, get its entry in VectorLoopValueMap
1973  // for the given unroll part. If this entry is not a vector type (i.e., the
1974  // vectorization factor is one), there is no need to generate an
1975  // extractelement instruction.
1976  auto *U = getOrCreateVectorValue(V, Instance.Part);
1977  if (!U->getType()->isVectorTy()) {
1978  assert(VF == 1 && "Value not scalarized has non-vector type");
1979  return U;
1980  }
1981 
1982  // Otherwise, the value from the original loop has been vectorized and is
1983  // represented by UF vector values. Extract and return the requested scalar
1984  // value from the appropriate vector lane.
1985  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
1986 }
1987 
1988 void InnerLoopVectorizer::packScalarIntoVectorValue(
1989  Value *V, const VPIteration &Instance) {
1990  assert(V != Induction && "The new induction variable should not be used.");
1991  assert(!V->getType()->isVectorTy() && "Can't pack a vector");
1992  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1993 
1994  Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
1995  Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
1996  VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
1997  Builder.getInt32(Instance.Lane));
1998  VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
1999 }
2000 
2001 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2002  assert(Vec->getType()->isVectorTy() && "Invalid type");
2003  SmallVector<Constant *, 8> ShuffleMask;
2004  for (unsigned i = 0; i < VF; ++i)
2005  ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2006 
2007  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2008  ConstantVector::get(ShuffleMask),
2009  "reverse");
2010 }
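// For reference, with VF = 4 the reversal above is a single shuffle:
//
//   %reverse = shufflevector <4 x i32> %vec, <4 x i32> undef,
//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>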
2011 
2012 // Return whether we allow using masked interleave-groups (for dealing with
2013 // strided loads/stores that reside in predicated blocks, or for dealing
2014 // with gaps).
2015 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2016  // If an override option has been passed in for interleaved accesses, use it.
2017  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2018  return EnableMaskedInterleavedMemAccesses;
2019 
2020  return TTI.enableMaskedInterleavedAccessVectorization();
2021 }
2022 
2023 // Try to vectorize the interleave group that \p Instr belongs to.
2024 //
2025 // E.g. Translate following interleaved load group (factor = 3):
2026 // for (i = 0; i < N; i+=3) {
2027 // R = Pic[i]; // Member of index 0
2028 // G = Pic[i+1]; // Member of index 1
2029 // B = Pic[i+2]; // Member of index 2
2030 // ... // do something to R, G, B
2031 // }
2032 // To:
2033 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2034 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
2035 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
2036 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
2037 //
2038 // Or translate following interleaved store group (factor = 3):
2039 // for (i = 0; i < N; i+=3) {
2040 // ... do something to R, G, B
2041 // Pic[i] = R; // Member of index 0
2042 // Pic[i+1] = G; // Member of index 1
2043 // Pic[i+2] = B; // Member of index 2
2044 // }
2045 // To:
2046 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2047 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2048 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2049 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2050 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2051 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2052  VectorParts *BlockInMask) {
2053  const InterleaveGroup<Instruction> *Group =
2054  Cost->getInterleavedAccessGroup(Instr);
2055  assert(Group && "Fail to get an interleaved access group.");
2056 
2057  // Skip if current instruction is not the insert position.
2058  if (Instr != Group->getInsertPos())
2059  return;
2060 
2061  const DataLayout &DL = Instr->getModule()->getDataLayout();
2062  Value *Ptr = getLoadStorePointerOperand(Instr);
2063 
2064  // Prepare for the vector type of the interleaved load/store.
2065  Type *ScalarTy = getMemInstValueType(Instr);
2066  unsigned InterleaveFactor = Group->getFactor();
2067  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2068  Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2069 
2070  // Prepare for the new pointers.
2071  setDebugLocFromInst(Builder, Ptr);
2072  SmallVector<Value *, 2> NewPtrs;
2073  unsigned Index = Group->getIndex(Instr);
2074 
2075  VectorParts Mask;
2076  bool IsMaskForCondRequired = BlockInMask;
2077  if (IsMaskForCondRequired) {
2078  Mask = *BlockInMask;
2079  // TODO: extend the masked interleaved-group support to reversed access.
2080  assert(!Group->isReverse() && "Reversed masked interleave-group "
2081  "not supported.");
2082  }
2083 
2084  // If the group is reverse, adjust the index to refer to the last vector lane
2085  // instead of the first. We adjust the index from the first vector lane,
2086  // rather than directly getting the pointer for lane VF - 1, because the
2087  // pointer operand of the interleaved access is supposed to be uniform. For
2088  // uniform instructions, we're only required to generate a value for the
2089  // first vector lane in each unroll iteration.
2090  if (Group->isReverse())
2091  Index += (VF - 1) * Group->getFactor();
2092 
2093  bool InBounds = false;
2094  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2095  InBounds = gep->isInBounds();
2096 
2097  for (unsigned Part = 0; Part < UF; Part++) {
2098  Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2099 
2100  // Notice current instruction could be any index. Need to adjust the address
2101  // to the member of index 0.
2102  //
2103  // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2104  // b = A[i]; // Member of index 0
2105  // Current pointer is pointed to A[i+1], adjust it to A[i].
2106  //
2107  // E.g. A[i+1] = a; // Member of index 1
2108  // A[i] = b; // Member of index 0
2109  // A[i+2] = c; // Member of index 2 (Current instruction)
2110  // Current pointer is pointed to A[i+2], adjust it to A[i].
2111  NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2112  if (InBounds)
2113  cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2114 
2115  // Cast to the vector pointer type.
2116  NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2117  }
2118 
2119  setDebugLocFromInst(Builder, Instr);
2120  Value *UndefVec = UndefValue::get(VecTy);
2121 
2122  Value *MaskForGaps = nullptr;
2123  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2124  MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2125  assert(MaskForGaps && "Mask for Gaps is required but it is null");
2126  }
2127 
2128  // Vectorize the interleaved load group.
2129  if (isa<LoadInst>(Instr)) {
2130  // For each unroll part, create a wide load for the group.
2131  SmallVector<Value *, 2> NewLoads;
2132  for (unsigned Part = 0; Part < UF; Part++) {
2133  Instruction *NewLoad;
2134  if (IsMaskForCondRequired || MaskForGaps) {
2136  "masked interleaved groups are not allowed.");
2137  Value *GroupMask = MaskForGaps;
2138  if (IsMaskForCondRequired) {
2139  auto *Undefs = UndefValue::get(Mask[Part]->getType());
2140  auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2141  Value *ShuffledMask = Builder.CreateShuffleVector(
2142  Mask[Part], Undefs, RepMask, "interleaved.mask");
2143  GroupMask = MaskForGaps
2144  ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2145  MaskForGaps)
2146  : ShuffledMask;
2147  }
2148  NewLoad =
2149  Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2150  GroupMask, UndefVec, "wide.masked.vec");
2151  }
2152  else
2153  NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2154  Group->getAlignment(), "wide.vec");
2155  Group->addMetadata(NewLoad);
2156  NewLoads.push_back(NewLoad);
2157  }
2158 
2159  // For each member in the group, shuffle out the appropriate data from the
2160  // wide loads.
2161  for (unsigned I = 0; I < InterleaveFactor; ++I) {
2162  Instruction *Member = Group->getMember(I);
2163 
2164  // Skip the gaps in the group.
2165  if (!Member)
2166  continue;
2167 
2168  Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2169  for (unsigned Part = 0; Part < UF; Part++) {
2170  Value *StridedVec = Builder.CreateShuffleVector(
2171  NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2172 
2173  // If this member has different type, cast the result type.
2174  if (Member->getType() != ScalarTy) {
2175  VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2176  StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2177  }
2178 
2179  if (Group->isReverse())
2180  StridedVec = reverseVector(StridedVec);
2181 
2182  VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2183  }
2184  }
2185  return;
2186  }
2187 
2188  // The sub vector type for current instruction.
2189  VectorType *SubVT = VectorType::get(ScalarTy, VF);
2190 
2191  // Vectorize the interleaved store group.
2192  for (unsigned Part = 0; Part < UF; Part++) {
2193  // Collect the stored vector from each member.
2194  SmallVector<Value *, 4> StoredVecs;
2195  for (unsigned i = 0; i < InterleaveFactor; i++) {
2196  // Interleaved store group doesn't allow a gap, so each index has a member
2197  Instruction *Member = Group->getMember(i);
2198  assert(Member && "Fail to get a member from an interleaved store group");
2199 
2200  Value *StoredVec = getOrCreateVectorValue(
2201  cast<StoreInst>(Member)->getValueOperand(), Part);
2202  if (Group->isReverse())
2203  StoredVec = reverseVector(StoredVec);
2204 
2205  // If this member has different type, cast it to a unified type.
2206 
2207  if (StoredVec->getType() != SubVT)
2208  StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2209 
2210  StoredVecs.push_back(StoredVec);
2211  }
2212 
2213  // Concatenate all vectors into a wide vector.
2214  Value *WideVec = concatenateVectors(Builder, StoredVecs);
2215 
2216  // Interleave the elements in the wide vector.
2217  Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2218  Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2219  "interleaved.vec");
2220 
2221  Instruction *NewStoreInstr;
2222  if (IsMaskForCondRequired) {
2223  auto *Undefs = UndefValue::get(Mask[Part]->getType());
2224  auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2225  Value *ShuffledMask = Builder.CreateShuffleVector(
2226  Mask[Part], Undefs, RepMask, "interleaved.mask");
2227  NewStoreInstr = Builder.CreateMaskedStore(
2228  IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2229  }
2230  else
2231  NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2232  Group->getAlignment());
2233 
2234  Group->addMetadata(NewStoreInstr);
2235  }
2236 }
2237 
2238 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2239  VectorParts *BlockInMask) {
2240  // Attempt to issue a wide load.
2241  LoadInst *LI = dyn_cast<LoadInst>(Instr);
2242  StoreInst *SI = dyn_cast<StoreInst>(Instr);
2243 
2244  assert((LI || SI) && "Invalid Load/Store instruction");
2245 
2246  LoopVectorizationCostModel::InstWidening Decision =
2247  Cost->getWideningDecision(Instr, VF);
2248  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2249  "CM decision should be taken at this point");
2250  if (Decision == LoopVectorizationCostModel::CM_Interleave)
2251  return vectorizeInterleaveGroup(Instr);
2252 
2253  Type *ScalarDataTy = getMemInstValueType(Instr);
2254  Type *DataTy = VectorType::get(ScalarDataTy, VF);
2255  Value *Ptr = getLoadStorePointerOperand(Instr);
2256  unsigned Alignment = getLoadStoreAlignment(Instr);
2257  // An alignment of 0 means target abi alignment. We need to use the scalar's
2258  // target abi alignment in such a case.
2259  const DataLayout &DL = Instr->getModule()->getDataLayout();
2260  if (!Alignment)
2261  Alignment = DL.getABITypeAlignment(ScalarDataTy);
2262  unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2263 
2264  // Determine if the pointer operand of the access is either consecutive or
2265  // reverse consecutive.
2266  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2267  bool ConsecutiveStride =
2268  Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2269  bool CreateGatherScatter =
2270  (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2271 
2272  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2273  // gather/scatter. Otherwise Decision should have been to Scalarize.
2274  assert((ConsecutiveStride || CreateGatherScatter) &&
2275  "The instruction should be scalarized");
2276 
2277  // Handle consecutive loads/stores.
2278  if (ConsecutiveStride)
2279  Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2280 
2281  VectorParts Mask;
2282  bool isMaskRequired = BlockInMask;
2283  if (isMaskRequired)
2284  Mask = *BlockInMask;
2285 
2286  bool InBounds = false;
2287  if (auto *gep = dyn_cast<GetElementPtrInst>(
2288  getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2289  InBounds = gep->isInBounds();
2290 
2291  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2292  // Calculate the pointer for the specific unroll-part.
2293  GetElementPtrInst *PartPtr = nullptr;
2294 
2295  if (Reverse) {
2296  // If the address is consecutive but reversed, then the
2297  // wide store needs to start at the last vector element.
2298  PartPtr = cast<GetElementPtrInst>(
2299  Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2300  PartPtr->setIsInBounds(InBounds);
2301  PartPtr = cast<GetElementPtrInst>(
2302  Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2303  PartPtr->setIsInBounds(InBounds);
2304  if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2305  Mask[Part] = reverseVector(Mask[Part]);
2306  } else {
2307  PartPtr = cast<GetElementPtrInst>(
2308  Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2309  PartPtr->setIsInBounds(InBounds);
2310  }
2311 
2312  return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2313  };
2314 
2315  // Handle Stores:
2316  if (SI) {
2317  setDebugLocFromInst(Builder, SI);
2318 
2319  for (unsigned Part = 0; Part < UF; ++Part) {
2320  Instruction *NewSI = nullptr;
2321  Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2322  if (CreateGatherScatter) {
2323  Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2324  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2325  NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2326  MaskPart);
2327  } else {
2328  if (Reverse) {
2329  // If we store to reverse consecutive memory locations, then we need
2330  // to reverse the order of elements in the stored value.
2331  StoredVal = reverseVector(StoredVal);
2332  // We don't want to update the value in the map as it might be used in
2333  // another expression. So don't call resetVectorValue(StoredVal).
2334  }
2335  auto *VecPtr = CreateVecPtr(Part, Ptr);
2336  if (isMaskRequired)
2337  NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2338  Mask[Part]);
2339  else
2340  NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2341  }
2342  addMetadata(NewSI, SI);
2343  }
2344  return;
2345  }
2346 
2347  // Handle loads.
2348  assert(LI && "Must have a load instruction");
2349  setDebugLocFromInst(Builder, LI);
2350  for (unsigned Part = 0; Part < UF; ++Part) {
2351  Value *NewLI;
2352  if (CreateGatherScatter) {
2353  Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2354  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2355  NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2356  nullptr, "wide.masked.gather");
2357  addMetadata(NewLI, LI);
2358  } else {
2359  auto *VecPtr = CreateVecPtr(Part, Ptr);
2360  if (isMaskRequired)
2361  NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2362  UndefValue::get(DataTy),
2363  "wide.masked.load");
2364  else
2365  NewLI =
2366  Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2367 
2368  // Add metadata to the load, but setVectorValue to the reverse shuffle.
2369  addMetadata(NewLI, LI);
2370  if (Reverse)
2371  NewLI = reverseVector(NewLI);
2372  }
2373  VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2374  }
2375 }
2376 
2377 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2378  const VPIteration &Instance,
2379  bool IfPredicateInstr) {
2380  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2381 
2382  setDebugLocFromInst(Builder, Instr);
2383 
2384  // Does this instruction return a value ?
2385  bool IsVoidRetTy = Instr->getType()->isVoidTy();
2386 
2387  Instruction *Cloned = Instr->clone();
2388  if (!IsVoidRetTy)
2389  Cloned->setName(Instr->getName() + ".cloned");
2390 
2391  // Replace the operands of the cloned instructions with their scalar
2392  // equivalents in the new loop.
2393  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2394  auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2395  Cloned->setOperand(op, NewOp);
2396  }
2397  addNewMetadata(Cloned, Instr);
2398 
2399  // Place the cloned scalar in the new loop.
2400  Builder.Insert(Cloned);
2401 
2402  // Add the cloned scalar to the scalar map entry.
2403  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2404 
2405  // If we just cloned a new assumption, add it the assumption cache.
2406  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2407  if (II->getIntrinsicID() == Intrinsic::assume)
2408  AC->registerAssumption(II);
2409 
2410  // End if-block.
2411  if (IfPredicateInstr)
2412  PredicatedInstructions.push_back(Cloned);
2413 }
2414 
2415 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2416  Value *End, Value *Step,
2417  Instruction *DL) {
2418  BasicBlock *Header = L->getHeader();
2419  BasicBlock *Latch = L->getLoopLatch();
2420  // As we're just creating this loop, it's possible no latch exists
2421  // yet. If so, use the header as this will be a single block loop.
2422  if (!Latch)
2423  Latch = Header;
2424 
2425  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2426  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2427  setDebugLocFromInst(Builder, OldInst);
2428  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2429 
2430  Builder.SetInsertPoint(Latch->getTerminator());
2431  setDebugLocFromInst(Builder, OldInst);
2432 
2433  // Create i+1 and fill the PHINode.
2434  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2435  Induction->addIncoming(Start, L->getLoopPreheader());
2436  Induction->addIncoming(Next, Latch);
2437  // Create the compare.
2438  Value *ICmp = Builder.CreateICmpEQ(Next, End);
2439  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2440 
2441  // Now we have two terminators. Remove the old one from the block.
2442  Latch->getTerminator()->eraseFromParent();
2443 
2444  return Induction;
2445 }
2446 
2448  if (TripCount)
2449  return TripCount;
2450 
2451  assert(L && "Create Trip Count for null loop.");
2452  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2453  // Find the loop boundaries.
2454  ScalarEvolution *SE = PSE.getSE();
2455  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2456  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2457  "Invalid loop count");
2458 
2459  Type *IdxTy = Legal->getWidestInductionType();
2460  assert(IdxTy && "No type for induction");
2461 
2462  // The exit count might have the type of i64 while the phi is i32. This can
2463  // happen if we have an induction variable that is sign extended before the
2464  // compare. The only way that we get a backedge taken count is that the
2465  // induction variable was signed and as such will not overflow. In such a case
2466  // truncation is legal.
2467  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2468  IdxTy->getPrimitiveSizeInBits())
2469  BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2470  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2471 
2472  // Get the total trip count from the count by adding 1.
2473  const SCEV *ExitCount = SE->getAddExpr(
2474  BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2475 
2476  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2477 
2478  // Expand the trip count and place the new instructions in the preheader.
2479  // Notice that the pre-header does not change, only the loop body.
2480  SCEVExpander Exp(*SE, DL, "induction");
2481 
2482  // Count holds the overall loop count (N).
2483  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2484  L->getLoopPreheader()->getTerminator());
2485 
2486  if (TripCount->getType()->isPointerTy())
2487  TripCount =
2488  CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2489  L->getLoopPreheader()->getTerminator());
2490 
2491  return TripCount;
2492 }
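// For illustration, for a loop "for (i = 0; i < n; ++i)" with n > 0, the
// backedge-taken count is n - 1, so the trip count expanded in the preheader
// is (n - 1) + 1 == n; the truncate/zero-extend above only reconciles the
// SCEV's type with the widest induction type.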
2493 
2495  if (VectorTripCount)
2496  return VectorTripCount;
2497 
2498  Value *TC = getOrCreateTripCount(L);
2499  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2500 
2501  Type *Ty = TC->getType();
2502  Constant *Step = ConstantInt::get(Ty, VF * UF);
2503 
2504  // If the tail is to be folded by masking, round the number of iterations N
2505  // up to a multiple of Step instead of rounding down. This is done by first
2506  // adding Step-1 and then rounding down. Note that it's ok if this addition
2507  // overflows: the vector induction variable will eventually wrap to zero given
2508  // that it starts at zero and its Step is a power of two; the loop will then
2509  // exit, with the last early-exit vector comparison also producing all-true.
2510  if (Cost->foldTailByMasking()) {
2511  assert(isPowerOf2_32(VF * UF) &&
2512  "VF*UF must be a power of 2 when folding tail by masking");
2513  TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2514  }
2515 
2516  // Now we need to generate the expression for the part of the loop that the
2517  // vectorized body will execute. This is equal to N - (N % Step) if scalar
2518  // iterations are not required for correctness, or N - Step, otherwise. Step
2519  // is equal to the vectorization factor (number of SIMD elements) times the
2520  // unroll factor (number of SIMD instructions).
2521  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2522 
2523  // If there is a non-reversed interleaved group that may speculatively access
2524  // memory out-of-bounds, we need to ensure that there will be at least one
2525  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2526  // the trip count, we set the remainder to be equal to the step. If the step
2527  // does not evenly divide the trip count, no adjustment is necessary since
2528  // there will already be scalar iterations. Note that the minimum iterations
2529  // check ensures that N >= Step.
2530  if (VF > 1 && Cost->requiresScalarEpilogue()) {
2531  auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2532  R = Builder.CreateSelect(IsZero, Step, R);
2533  }
2534 
2535  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2536 
2537  return VectorTripCount;
2538 }
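// For illustration, with a trip count of 13, VF = 4 and UF = 1 (Step = 4):
//   n.mod.vf = 13 urem 4 = 1 and n.vec = 13 - 1 = 12,
// so the vector loop runs three iterations and the scalar epilogue handles
// the last one. If a scalar epilogue is required and the step divided the
// count evenly, the remainder would be bumped from 0 to 4 by the select
// above. With tail folding, the count is first rounded up to 16 and
// n.vec = 16.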
2539 
2541  const DataLayout &DL) {
2542  // Verify that V is a vector type with same number of elements as DstVTy.
2543  unsigned VF = DstVTy->getNumElements();
2544  VectorType *SrcVecTy = cast<VectorType>(V->getType());
2545  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2546  Type *SrcElemTy = SrcVecTy->getElementType();
2547  Type *DstElemTy = DstVTy->getElementType();
2548  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2549  "Vector elements must have same size");
2550 
2551  // Do a direct cast if element types are castable.
2552  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2553  return Builder.CreateBitOrPointerCast(V, DstVTy);
2554  }
2555  // V cannot be directly casted to desired vector type.
2556  // May happen when V is a floating point vector but DstVTy is a vector of
2557  // pointers or vice-versa. Handle this using a two-step bitcast using an
2558  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2559  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2560  "Only one type should be a pointer type");
2561  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2562  "Only one type should be a floating point type");
2563  Type *IntTy =
2564  IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2565  VectorType *VecIntTy = VectorType::get(IntTy, VF);
2566  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2567  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2568 }
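// For illustration, on a target with 64-bit pointers, casting <2 x double> to
// <2 x i64*> takes the two-step path above: a bitcast from <2 x double> to
// <2 x i64>, followed by an inttoptr from <2 x i64> to <2 x i64*>.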
2569 
2570 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2571  BasicBlock *Bypass) {
2572  Value *Count = getOrCreateTripCount(L);
2573  BasicBlock *BB = L->getLoopPreheader();
2574  IRBuilder<> Builder(BB->getTerminator());
2575 
2576  // Generate code to check if the loop's trip count is less than VF * UF, or
2577  // equal to it in case a scalar epilogue is required; this implies that the
2578  // vector trip count is zero. This check also covers the case where adding one
2579  // to the backedge-taken count overflowed leading to an incorrect trip count
2580  // of zero. In this case we will also jump to the scalar loop.
2581  auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2582  : ICmpInst::ICMP_ULT;
2583 
2584  // If tail is to be folded, vector loop takes care of all iterations.
2585  Value *CheckMinIters = Builder.getFalse();
2586  if (!Cost->foldTailByMasking())
2587  CheckMinIters = Builder.CreateICmp(
2588  P, Count, ConstantInt::get(Count->getType(), VF * UF),
2589  "min.iters.check");
2590 
2591  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2592  // Update dominator tree immediately if the generated block is a
2593  // LoopBypassBlock because SCEV expansions to generate loop bypass
2594  // checks may query it before the current function is finished.
2595  DT->addNewBlock(NewBB, BB);
2596  if (L->getParentLoop())
2597  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2598  ReplaceInstWithInst(BB->getTerminator(),
2599  BranchInst::Create(Bypass, NewBB, CheckMinIters));
2600  LoopBypassBlocks.push_back(BB);
2601 }
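// For illustration, with VF = 4 and UF = 2 (and no tail folding), the guard
// emitted above is roughly:
//
//   %min.iters.check = icmp ult i64 %count, 8   ; ule if a scalar epilogue
//                                               ; is required
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph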
2602 
2604  BasicBlock *BB = L->getLoopPreheader();
2605 
2606  // Generate the code to check the SCEV assumptions that we made.
2607  // We want the new basic block to start at the first instruction in a
2608  // sequence of instructions that form a check.
2609  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2610  "scev.check");
2611  Value *SCEVCheck =
2612  Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2613 
2614  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2615  if (C->isZero())
2616  return;
2617 
2619  "Cannot SCEV check stride or overflow when folding tail");
2620  // Create a new block containing the stride check.
2621  BB->setName("vector.scevcheck");
2622  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2623  // Update dominator tree immediately if the generated block is a
2624  // LoopBypassBlock because SCEV expansions to generate loop bypass
2625  // checks may query it before the current function is finished.
2626  DT->addNewBlock(NewBB, BB);
2627  if (L->getParentLoop())
2628  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2629  ReplaceInstWithInst(BB->getTerminator(),
2630  BranchInst::Create(Bypass, NewBB, SCEVCheck));
2631  LoopBypassBlocks.push_back(BB);
2632  AddedSafetyChecks = true;
2633 }
2634 
2635 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2636  // VPlan-native path does not do any analysis for runtime checks currently.
2637  if (EnableVPlanNativePath)
2638  return;
2639 
2640  BasicBlock *BB = L->getLoopPreheader();
2641 
2642  // Generate the code that checks in runtime if arrays overlap. We put the
2643  // checks into a separate block to make the more common case of few elements
2644  // faster.
2645  Instruction *FirstCheckInst;
2646  Instruction *MemRuntimeCheck;
2647  std::tie(FirstCheckInst, MemRuntimeCheck) =
2648  Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2649  if (!MemRuntimeCheck)
2650  return;
2651 
2652  assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail");
2653  // Create a new block containing the memory check.
2654  BB->setName("vector.memcheck");
2655  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2656  // Update dominator tree immediately if the generated block is a
2657  // LoopBypassBlock because SCEV expansions to generate loop bypass
2658  // checks may query it before the current function is finished.
2659  DT->addNewBlock(NewBB, BB);
2660  if (L->getParentLoop())
2661  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2662  ReplaceInstWithInst(BB->getTerminator(),
2663  BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2664  LoopBypassBlocks.push_back(BB);
2665  AddedSafetyChecks = true;
2666 
2667  // We currently don't use LoopVersioning for the actual loop cloning but we
2668  // still use it to add the noalias metadata.
2669  LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2670  PSE.getSE());
2671  LVer->prepareNoAliasMetadata();
2672 }
2673 
2674 Value *InnerLoopVectorizer::emitTransformedIndex(
2675  IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2676  const InductionDescriptor &ID) const {
2677 
2678  SCEVExpander Exp(*SE, DL, "induction");
2679  auto Step = ID.getStep();
2680  auto StartValue = ID.getStartValue();
2681  assert(Index->getType() == Step->getType() &&
2682  "Index type does not match StepValue type");
2683 
2684  // Note: the IR at this point is broken. We cannot use SE to create any new
2685  // SCEV and then expand it, hoping that SCEV's simplification will give us
2686  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2687  // lead to various SCEV crashes. So all we can do is to use builder and rely
2688  // on InstCombine for future simplifications. Here we handle some trivial
2689  // cases only.
2690  auto CreateAdd = [&B](Value *X, Value *Y) {
2691  assert(X->getType() == Y->getType() && "Types don't match!");
2692  if (auto *CX = dyn_cast<ConstantInt>(X))
2693  if (CX->isZero())
2694  return Y;
2695  if (auto *CY = dyn_cast<ConstantInt>(Y))
2696  if (CY->isZero())
2697  return X;
2698  return B.CreateAdd(X, Y);
2699  };
2700 
2701  auto CreateMul = [&B](Value *X, Value *Y) {
2702  assert(X->getType() == Y->getType() && "Types don't match!");
2703  if (auto *CX = dyn_cast<ConstantInt>(X))
2704  if (CX->isOne())
2705  return Y;
2706  if (auto *CY = dyn_cast<ConstantInt>(Y))
2707  if (CY->isOne())
2708  return X;
2709  return B.CreateMul(X, Y);
2710  };
2711 
2712  switch (ID.getKind()) {
2713  case InductionDescriptor::IK_IntInduction: {
2714  assert(Index->getType() == StartValue->getType() &&
2715  "Index type does not match StartValue type");
2716  if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2717  return B.CreateSub(StartValue, Index);
2718  auto *Offset = CreateMul(
2719  Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2720  return CreateAdd(StartValue, Offset);
2721  }
2722  case InductionDescriptor::IK_PtrInduction: {
2723  assert(isa<SCEVConstant>(Step) &&
2724  "Expected constant step for pointer induction");
2725  return B.CreateGEP(
2726  StartValue->getType()->getPointerElementType(), StartValue,
2727  CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2728  &*B.GetInsertPoint())));
2729  }
2730  case InductionDescriptor::IK_FpInduction: {
2731  assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2732  auto InductionBinOp = ID.getInductionBinOp();
2733  assert(InductionBinOp &&
2734  (InductionBinOp->getOpcode() == Instruction::FAdd ||
2735  InductionBinOp->getOpcode() == Instruction::FSub) &&
2736  "Original bin op should be defined for FP induction");
2737 
2738  Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2739 
2740  // Floating point operations had to be 'fast' to enable the induction.
2741  FastMathFlags Flags;
2742  Flags.setFast();
2743 
2744  Value *MulExp = B.CreateFMul(StepValue, Index);
2745  if (isa<Instruction>(MulExp))
2746  // We have to check, the MulExp may be a constant.
2747  cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2748 
2749  Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2750  "induction");
2751  if (isa<Instruction>(BOp))
2752  cast<Instruction>(BOp)->setFastMathFlags(Flags);
2753 
2754  return BOp;
2755  }
2756  case InductionDescriptor::IK_NoInduction:
2757  return nullptr;
2758  }
2759  llvm_unreachable("invalid enum");
2760 }
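// For illustration, for an integer induction descriptor with start value 5 and
// step 3, an index %idx is transformed into 5 + 3 * %idx (subject to the
// trivial add/mul folding above); a pointer induction applies the same offset
// via a GEP, and an FP induction uses fmul plus fadd (or fsub).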
2761 
2763  /*
2764  In this function we generate a new loop. The new loop will contain
2765  the vectorized instructions while the old loop will continue to run the
2766  scalar remainder.
2767 
2768  [ ] <-- loop iteration number check.
2769  / |
2770  / v
2771  | [ ] <-- vector loop bypass (may consist of multiple blocks).
2772  | / |
2773  | / v
2774  || [ ] <-- vector pre header.
2775  |/ |
2776  | v
2777  | [ ] \
2778  | [ ]_| <-- vector loop.
2779  | |
2780  | v
2781  | -[ ] <--- middle-block.
2782  | / |
2783  | / v
2784  -|- >[ ] <--- new preheader.
2785  | |
2786  | v
2787  | [ ] \
2788  | [ ]_| <-- old scalar loop to handle remainder.
2789  \ |
2790  \ v
2791  >[ ] <-- exit block.
2792  ...
2793  */
2794 
2795  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2796  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2797  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2798  MDNode *OrigLoopID = OrigLoop->getLoopID();
2799  assert(VectorPH && "Invalid loop structure");
2800  assert(ExitBlock && "Must have an exit block");
2801 
2802  // Some loops have a single integer induction variable, while other loops
2803  // don't. One example is c++ iterators that often have multiple pointer
2804  // induction variables. In the code below we also support a case where we
2805  // don't have a single induction variable.
2806  //
2807  // We try to obtain an induction variable from the original loop as hard
2808  // as possible. However if we don't find one that:
2809  // - is an integer
2810  // - counts from zero, stepping by one
2811  // - is the size of the widest induction variable type
2812  // then we create a new one.
2813  OldInduction = Legal->getPrimaryInduction();
2814  Type *IdxTy = Legal->getWidestInductionType();
2815 
2816  // Split the single block loop into the two loop structure described above.
2817  BasicBlock *VecBody =
2818  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2819  BasicBlock *MiddleBlock =
2820  VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2821  BasicBlock *ScalarPH =
2822  MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2823 
2824  // Create and register the new vector loop.
2825  Loop *Lp = LI->AllocateLoop();
2826  Loop *ParentLoop = OrigLoop->getParentLoop();
2827 
2828  // Insert the new loop into the loop nest and register the new basic blocks
2829  // before calling any utilities such as SCEV that require valid LoopInfo.
2830  if (ParentLoop) {
2831  ParentLoop->addChildLoop(Lp);
2832  ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2833  ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2834  } else {
2835  LI->addTopLevelLoop(Lp);
2836  }
2837  Lp->addBasicBlockToLoop(VecBody, *LI);
2838 
2839  // Find the loop boundaries.
2840  Value *Count = getOrCreateTripCount(Lp);
2841 
2842  Value *StartIdx = ConstantInt::get(IdxTy, 0);
2843 
2844  // Now, compare the new count to zero. If it is zero skip the vector loop and
2845  // jump to the scalar loop. This check also covers the case where the
2846  // backedge-taken count is uint##_max: adding one to it will overflow leading
2847  // to an incorrect trip count of zero. In this (rare) case we will also jump
2848  // to the scalar loop.
2849  emitMinimumIterationCountCheck(Lp, ScalarPH);
2850 
2851  // Generate the code to check any assumptions that we've made for SCEV
2852  // expressions.
2853  emitSCEVChecks(Lp, ScalarPH);
2854 
2855  // Generate the code that checks in runtime if arrays overlap. We put the
2856  // checks into a separate block to make the more common case of few elements
2857  // faster.
2858  emitMemRuntimeChecks(Lp, ScalarPH);
2859 
2860  // Generate the induction variable.
2861  // The loop step is equal to the vectorization factor (num of SIMD elements)
2862  // times the unroll factor (num of SIMD instructions).
2863  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2864  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2865  Induction =
2866  createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2867  getDebugLocFromInstOrOperands(OldInduction));
2868 
2869  // We are going to resume the execution of the scalar loop.
2870  // Go over all of the induction variables that we found and fix the
2871  // PHIs that are left in the scalar version of the loop.
2872  // The starting values of PHI nodes depend on the counter of the last
2873  // iteration in the vectorized loop.
2874  // If we come from a bypass edge then we need to start from the original
2875  // start value.
2876 
2877  // This variable saves the new starting index for the scalar loop. It is used
2878  // to test if there are any tail iterations left once the vector loop has
2879  // completed.
2880  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
2881  for (auto &InductionEntry : *List) {
2882  PHINode *OrigPhi = InductionEntry.first;
2883  InductionDescriptor II = InductionEntry.second;
2884 
2885  // Create phi nodes to merge from the backedge-taken check block.
2886  PHINode *BCResumeVal = PHINode::Create(
2887  OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
2888  // Copy original phi DL over to the new one.
2889  BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2890  Value *&EndValue = IVEndValues[OrigPhi];
2891  if (OrigPhi == OldInduction) {
2892  // We know what the end value is.
2893  EndValue = CountRoundDown;
2894  } else {
2896  Type *StepType = II.getStep()->getType();
2897  Instruction::CastOps CastOp =
2898  CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
2899  Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
2900  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2901  EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
2902  EndValue->setName("ind.end");
2903  }
2904 
2905  // The new PHI merges the original incoming value, in case of a bypass,
2906  // or the value at the end of the vectorized loop.
2907  BCResumeVal->addIncoming(EndValue, MiddleBlock);
2908 
2909  // Fix the scalar body counter (PHI node).
2910  // The old induction's phi node in the scalar body needs the truncated
2911  // value.
2912  for (BasicBlock *BB : LoopBypassBlocks)
2913  BCResumeVal->addIncoming(II.getStartValue(), BB);
2914  OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
2915  }
2916 
2917  // We need the OrigLoop (scalar loop part) latch terminator to help
2918  // produce correct debug info for the middle block BB instructions.
2919  // The legality check stage guarantees that the loop will have a single
2920  // latch.
2921  assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
2922  "Scalar loop latch terminator isn't a branch");
2923  BranchInst *ScalarLatchBr =
2924  cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
2925 
2926  // Add a check in the middle block to see if we have completed
2927  // all of the iterations in the first vector loop.
2928  // If (N - N%VF) == N, then we *don't* need to run the remainder.
2929  // If tail is to be folded, we know we don't need to run the remainder.
2930  Value *CmpN = Builder.getTrue();
2931  if (!Cost->foldTailByMasking()) {
2932  CmpN =
2933  CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
2934  CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
2935 
2936  // Here we use the same DebugLoc as the scalar loop latch branch instead
2937  // of the corresponding compare because they may have ended up with
2938  // different line numbers and we want to avoid awkward line stepping while
2939  // debugging. E.g., if the compare has a line number inside the loop.
2940  cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
2941  }
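  // Worked example with hypothetical values: for N = 10, VF = 4, UF = 2 the
  // vector trip count is CountRoundDown = 10 - (10 % 8) = 8, so CmpN is false
  // and the scalar loop runs the remaining 2 iterations. For N = 16,
  // CountRoundDown == N and the middle block branches directly to the exit.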
2942 
2943  BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
2944  BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
2945  ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
2946 
2947  // Get ready to start creating new instructions into the vectorized body.
2949 
2950  // Save the state.
2951  LoopVectorPreHeader = VectorPH;
2952  LoopScalarPreHeader = ScalarPH;
2953  LoopMiddleBlock = MiddleBlock;
2954  LoopExitBlock = ExitBlock;
2955  LoopVectorBody = VecBody;
2956  LoopScalarBody = OldBasicBlock;
2957 
2958  Optional<MDNode *> VectorizedLoopID =
2960  LLVMLoopVectorizeFollowupVectorized});
2961  if (VectorizedLoopID.hasValue()) {
2962  Lp->setLoopID(VectorizedLoopID.getValue());
2963 
2964  // Do not setAlreadyVectorized if loop attributes have been defined
2965  // explicitly.
2966  return LoopVectorPreHeader;
2967  }
2968 
2969  // Keep all loop hints from the original loop on the vector loop (we'll
2970  // replace the vectorizer-specific hints below).
2971  if (MDNode *LID = OrigLoop->getLoopID())
2972  Lp->setLoopID(LID);
2973 
2974  LoopVectorizeHints Hints(Lp, true, *ORE);
2975  Hints.setAlreadyVectorized();
2976 
2977  return LoopVectorPreHeader;
2978 }
2979 
2980 // Fix up external users of the induction variable. At this point, we are
2981 // in LCSSA form, with all external PHIs that use the IV having one input value,
2982 // coming from the remainder loop. We need those PHIs to also have a correct
2983 // value for the IV when arriving directly from the middle block.
2984 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
2985  const InductionDescriptor &II,
2986  Value *CountRoundDown, Value *EndValue,
2987  BasicBlock *MiddleBlock) {
2988  // There are two kinds of external IV usages - those that use the value
2989  // computed in the last iteration (the PHI) and those that use the penultimate
2990  // value (the value that feeds into the phi from the loop latch).
2991  // We allow both, but they, obviously, have different values.
2992 
2993  assert(OrigLoop->getExitBlock() && "Expected a single exit block");
2994 
2995  DenseMap<Value *, Value *> MissingVals;
2996 
2997  // An external user of the last iteration's value should see the value that
2998  // the remainder loop uses to initialize its own IV.
2999  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3000  for (User *U : PostInc->users()) {
3001  Instruction *UI = cast<Instruction>(U);
3002  if (!OrigLoop->contains(UI)) {
3003  assert(isa<PHINode>(UI) && "Expected LCSSA form");
3004  MissingVals[UI] = EndValue;
3005  }
3006  }
3007 
3008  // An external user of the penultimate value needs to see EndValue - Step.
3009  // The simplest way to get this is to recompute it from the constituent SCEVs,
3010  // that is Start + (Step * (CRD - 1)).
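  // Worked example with hypothetical values: for an induction with Start = 0,
  // Step = 2 and vector trip count CRD = 8, a user of the penultimate value
  // must see Start + Step * (CRD - 1) = 0 + 2 * 7 = 14, whereas a user of the
  // post-increment value sees EndValue = 16.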
3011  for (User *U : OrigPhi->users()) {
3012  auto *UI = cast<Instruction>(U);
3013  if (!OrigLoop->contains(UI)) {
3014  const DataLayout &DL =
3015  OrigLoop->getHeader()->getModule()->getDataLayout();
3016  assert(isa<PHINode>(UI) && "Expected LCSSA form");
3017 
3018  IRBuilder<> B(MiddleBlock->getTerminator());
3019  Value *CountMinusOne = B.CreateSub(
3020  CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3021  Value *CMO =
3022  !II.getStep()->getType()->isIntegerTy()
3023  ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3024  II.getStep()->getType())
3025  : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3026  CMO->setName("cast.cmo");
3027  Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3028  Escape->setName("ind.escape");
3029  MissingVals[UI] = Escape;
3030  }
3031  }
3032 
3033  for (auto &I : MissingVals) {
3034  PHINode *PHI = cast<PHINode>(I.first);
3035  // One corner case we have to handle is two IVs "chasing" each other,
3036  // that is %IV2 = phi [...], [ %IV1, %latch ]
3037  // In this case, if IV1 has an external use, we need to avoid adding both
3038  // "last value of IV1" and "penultimate value of IV2". So, verify that we
3039  // don't already have an incoming value for the middle block.
3040  if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3041  PHI->addIncoming(I.second, MiddleBlock);
3042  }
3043 }
3044 
3045 namespace {
3046 
3047 struct CSEDenseMapInfo {
3048  static bool canHandle(const Instruction *I) {
3049  return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3050  isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3051  }
3052 
3053  static inline Instruction *getEmptyKey() {
3054  return DenseMapInfo<Instruction *>::getEmptyKey();
3055  }
3056 
3057  static inline Instruction *getTombstoneKey() {
3058  return DenseMapInfo<Instruction *>::getTombstoneKey();
3059  }
3060 
3061  static unsigned getHashValue(const Instruction *I) {
3062  assert(canHandle(I) && "Unknown instruction!");
3063  return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3064  I->value_op_end()));
3065  }
3066 
3067  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3068  if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3069  LHS == getTombstoneKey() || RHS == getTombstoneKey())
3070  return LHS == RHS;
3071  return LHS->isIdenticalTo(RHS);
3072  }
3073 };
3074 
3075 } // end anonymous namespace
3076 
3077 /// Perform CSE of induction variable instructions.
3078 static void cse(BasicBlock *BB) {
3079  // Perform simple cse.
3080  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3081  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3082  Instruction *In = &*I++;
3083 
3084  if (!CSEDenseMapInfo::canHandle(In))
3085  continue;
3086 
3087  // Check if we can replace this instruction with any of the
3088  // visited instructions.
3089  if (Instruction *V = CSEMap.lookup(In)) {
3090  In->replaceAllUsesWith(V);
3091  In->eraseFromParent();
3092  continue;
3093  }
3094 
3095  CSEMap[In] = In;
3096  }
3097 }
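// Illustrative effect of cse() above, in shorthand IR (not taken from a real
// test case):
//
//   %e0 = extractelement <4 x i32> %v, i32 0
//   %e1 = extractelement <4 x i32> %v, i32 0   ; identical to %e0
//
// The second extract is replaced by %e0 and erased, so later uses of %e1 now
// use %e0 instead.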
3098 
3099 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3100  unsigned VF,
3101  bool &NeedToScalarize) {
3102  Function *F = CI->getCalledFunction();
3103  StringRef FnName = CI->getCalledFunction()->getName();
3104  Type *ScalarRetTy = CI->getType();
3105  SmallVector<Type *, 4> Tys, ScalarTys;
3106  for (auto &ArgOp : CI->arg_operands())
3107  ScalarTys.push_back(ArgOp->getType());
3108 
3109  // Estimate cost of scalarized vector call. The source operands are assumed
3110  // to be vectors, so we need to extract individual elements from there,
3111  // execute VF scalar calls, and then gather the result into the vector return
3112  // value.
3113  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3114  if (VF == 1)
3115  return ScalarCallCost;
3116 
3117  // Compute corresponding vector type for return value and arguments.
3118  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3119  for (Type *ScalarTy : ScalarTys)
3120  Tys.push_back(ToVectorTy(ScalarTy, VF));
3121 
3122  // Compute costs of unpacking argument values for the scalar calls and
3123  // packing the return values to a vector.
3124  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3125 
3126  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3127 
3128  // If we can't emit a vector call for this function, then the currently found
3129  // cost is the cost we need to return.
3130  NeedToScalarize = true;
3131  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3132  return Cost;
3133 
3134  // If the corresponding vector cost is cheaper, return its cost.
3135  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3136  if (VectorCallCost < Cost) {
3137  NeedToScalarize = false;
3138  return VectorCallCost;
3139  }
3140  return Cost;
3141 }
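// Illustrative cost comparison for the function above, using made-up TTI
// numbers: with VF = 4, a scalar call cost of 10, a scalarization overhead of
// 12 and a vector library call cost of 18, the scalarized estimate is
// 10 * 4 + 12 = 52 > 18, so the vector call cost is returned and
// NeedToScalarize is cleared.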
3142 
3143 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3144  unsigned VF) {
3145  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3146  assert(ID && "Expected intrinsic call!");
3147 
3148  FastMathFlags FMF;
3149  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3150  FMF = FPMO->getFastMathFlags();
3151 
3152  SmallVector<Value *, 4> Operands(CI->arg_operands());
3153  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3154 }
3155 
3156 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3157  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3158  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3159  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3160 }
3161 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3162  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3163  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3164  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3165 }
3166 
3168  // For every instruction `I` in MinBWs, truncate the operands, create a
3169  // truncated version of `I` and reextend its result. InstCombine runs
3170  // later and will remove any ext/trunc pairs.
3171  SmallPtrSet<Value *, 4> Erased;
3172  for (const auto &KV : Cost->getMinimalBitwidths()) {
3173  // If the value wasn't vectorized, we must maintain the original scalar
3174  // type. The absence of the value from VectorLoopValueMap indicates that it
3175  // wasn't vectorized.
3176  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3177  continue;
3178  for (unsigned Part = 0; Part < UF; ++Part) {
3179  Value *I = getOrCreateVectorValue(KV.first, Part);
3180  if (Erased.find(I) != Erased.end() || I->use_empty() ||
3181  !isa<Instruction>(I))
3182  continue;
3183  Type *OriginalTy = I->getType();
3184  Type *ScalarTruncatedTy =
3185  IntegerType::get(OriginalTy->getContext(), KV.second);
3186  Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3187  OriginalTy->getVectorNumElements());
3188  if (TruncatedTy == OriginalTy)
3189  continue;
3190 
3191  IRBuilder<> B(cast<Instruction>(I));
3192  auto ShrinkOperand = [&](Value *V) -> Value * {
3193  if (auto *ZI = dyn_cast<ZExtInst>(V))
3194  if (ZI->getSrcTy() == TruncatedTy)
3195  return ZI->getOperand(0);
3196  return B.CreateZExtOrTrunc(V, TruncatedTy);
3197  };
3198 
3199  // The actual instruction modification depends on the instruction type,
3200  // unfortunately.
3201  Value *NewI = nullptr;
3202  if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3203  NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3204  ShrinkOperand(BO->getOperand(1)));
3205 
3206  // Any wrapping introduced by shrinking this operation shouldn't be
3207  // considered undefined behavior. So, we can't unconditionally copy
3208  // arithmetic wrapping flags to NewI.
3209  cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3210  } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3211  NewI =
3212  B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3213  ShrinkOperand(CI->getOperand(1)));
3214  } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3215  NewI = B.CreateSelect(SI->getCondition(),
3216  ShrinkOperand(SI->getTrueValue()),
3217  ShrinkOperand(SI->getFalseValue()));
3218  } else if (auto *CI = dyn_cast<CastInst>(I)) {
3219  switch (CI->getOpcode()) {
3220  default:
3221  llvm_unreachable("Unhandled cast!");
3222  case Instruction::Trunc:
3223  NewI = ShrinkOperand(CI->getOperand(0));
3224  break;
3225  case Instruction::SExt:
3226  NewI = B.CreateSExtOrTrunc(
3227  CI->getOperand(0),
3228  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3229  break;
3230  case Instruction::ZExt:
3231  NewI = B.CreateZExtOrTrunc(
3232  CI->getOperand(0),
3233  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3234  break;
3235  }
3236  } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3237  auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3238  auto *O0 = B.CreateZExtOrTrunc(
3239  SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3240  auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3241  auto *O1 = B.CreateZExtOrTrunc(
3242  SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3243 
3244  NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3245  } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3246  // Don't do anything with the operands, just extend the result.
3247  continue;
3248  } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3249  auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3250  auto *O0 = B.CreateZExtOrTrunc(
3251  IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3252  auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3253  NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3254  } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3255  auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3256  auto *O0 = B.CreateZExtOrTrunc(
3257  EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3258  NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3259  } else {
3260  // If we don't know what to do, be conservative and don't do anything.
3261  continue;
3262  }
3263 
3264  // Lastly, extend the result.
3265  NewI->takeName(cast<Instruction>(I));
3266  Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3267  I->replaceAllUsesWith(Res);
3268  cast<Instruction>(I)->eraseFromParent();
3269  Erased.insert(I);
3270  VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3271  }
3272  }
3273 
3274  // We'll have created a bunch of ZExts that are now parentless. Clean up.
3275  for (const auto &KV : Cost->getMinimalBitwidths()) {
3276  // If the value wasn't vectorized, we must maintain the original scalar
3277  // type. The absence of the value from VectorLoopValueMap indicates that it
3278  // wasn't vectorized.
3279  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3280  continue;
3281  for (unsigned Part = 0; Part < UF; ++Part) {
3282  Value *I = getOrCreateVectorValue(KV.first, Part);
3283  ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3284  if (Inst && Inst->use_empty()) {
3285  Value *NewI = Inst->getOperand(0);
3286  Inst->eraseFromParent();
3287  VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3288  }
3289  }
3290  }
3291 }
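// Illustrative effect of the shrinking above, in shorthand IR with a
// hypothetical minimal bitwidth of 8 for an i32 add and VF = 4:
//
//   %a = add <4 x i32> %x, %y
//
// becomes
//
//   %xt = trunc <4 x i32> %x to <4 x i8>
//   %yt = trunc <4 x i32> %y to <4 x i8>
//   %at = add <4 x i8> %xt, %yt
//   %a  = zext <4 x i8> %at to <4 x i32>
//
// leaving InstCombine to remove the redundant ext/trunc pairs later.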
3292 
3294  // Insert truncates and extends for any truncated instructions as hints to
3295  // InstCombine.
3296  if (VF > 1)
3298 
3299  // Fix widened non-induction PHIs by setting up the PHI operands.
3300  if (OrigPHIsToFix.size()) {
3302  "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3304  }
3305 
3306  // At this point every instruction in the original loop is widened to a
3307  // vector form. Now we need to fix the recurrences in the loop. These PHI
3308  // nodes are currently empty because we did not want to introduce cycles.
3309  // This is the second stage of vectorizing recurrences.
3311 
3312  // Update the dominator tree.
3313  //
3314  // FIXME: After creating the structure of the new loop, the dominator tree is
3315  // no longer up-to-date, and it remains that way until we update it
3316  // here. An out-of-date dominator tree is problematic for SCEV,
3317  // because SCEVExpander uses it to guide code generation. The
3318  // vectorizer uses SCEVExpander in several places. Instead, we should
3319  // keep the dominator tree up-to-date as we go.
3320  updateAnalysis();
3321 
3322  // Fix-up external users of the induction variables.
3323  for (auto &Entry : *Legal->getInductionVars())
3324  fixupIVUsers(Entry.first, Entry.second,
3327 
3328  fixLCSSAPHIs();
3330  sinkScalarOperands(&*PI);
3331 
3332  // Remove redundant induction instructions.
3334 }
3335 
3337  // In order to support recurrences we need to be able to vectorize Phi nodes.
3338  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3339  // stage #2: We now need to fix the recurrences by adding incoming edges to
3340  // the currently empty PHI nodes. At this point every instruction in the
3341  // original loop is widened to a vector form so we can use them to construct
3342  // the incoming edges.
3343  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3344  // Handle first-order recurrences and reductions that need to be fixed.
3345  if (Legal->isFirstOrderRecurrence(&Phi))
3347  else if (Legal->isReductionVariable(&Phi))
3348  fixReduction(&Phi);
3349  }
3350 }
3351 
3353  // This is the second phase of vectorizing first-order recurrences. An
3354  // overview of the transformation is described below. Suppose we have the
3355  // following loop.
3356  //
3357  // for (int i = 0; i < n; ++i)
3358  // b[i] = a[i] - a[i - 1];
3359  //
3360  // There is a first-order recurrence on "a". For this loop, the shorthand
3361  // scalar IR looks like:
3362  //
3363  // scalar.ph:
3364  // s_init = a[-1]
3365  // br scalar.body
3366  //
3367  // scalar.body:
3368  // i = phi [0, scalar.ph], [i+1, scalar.body]
3369  // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3370  // s2 = a[i]
3371  // b[i] = s2 - s1
3372  // br cond, scalar.body, ...
3373  //
3374  // In this example, s1 is a recurrence because its value depends on the
3375  // previous iteration. In the first phase of vectorization, we created a
3376  // temporary value for s1. We now complete the vectorization and produce the
3377  // shorthand vector IR shown below (for VF = 4, UF = 1).
3378  //
3379  // vector.ph:
3380  // v_init = vector(..., ..., ..., a[-1])
3381  // br vector.body
3382  //
3383  // vector.body
3384  // i = phi [0, vector.ph], [i+4, vector.body]
3385  // v1 = phi [v_init, vector.ph], [v2, vector.body]
3386  // v2 = a[i, i+1, i+2, i+3];
3387  // v3 = vector(v1(3), v2(0, 1, 2))
3388  // b[i, i+1, i+2, i+3] = v2 - v3
3389  // br cond, vector.body, middle.block
3390  //
3391  // middle.block:
3392  // x = v2(3)
3393  // br scalar.ph
3394  //
3395  // scalar.ph:
3396  // s_init = phi [x, middle.block], [a[-1], otherwise]
3397  // br scalar.body
3398  //
3399  // After execution completes the vector loop, we extract the next value of
3400  // the recurrence (x) to use as the initial value in the scalar loop.
3401 
3402  // Get the original loop preheader and single loop latch.
3403  auto *Preheader = OrigLoop->getLoopPreheader();
3404  auto *Latch = OrigLoop->getLoopLatch();
3405 
3406  // Get the initial and previous values of the scalar recurrence.
3407  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3408  auto *Previous = Phi->getIncomingValueForBlock(Latch);
3409 
3410  // Create a vector from the initial value.
3411  auto *VectorInit = ScalarInit;
3412  if (VF > 1) {
3414  VectorInit = Builder.CreateInsertElement(
3415  UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3416  Builder.getInt32(VF - 1), "vector.recur.init");
3417  }
3418 
3419  // We constructed a temporary phi node in the first phase of vectorization.
3420  // This phi node will eventually be deleted.
3422  cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3423 
3424  // Create a phi node for the new recurrence. The current value will either be
3425  // the initial value inserted into a vector or loop-varying vector value.
3426  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3427  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3428 
3429  // Get the vectorized previous value of the last part UF - 1. It appears last
3430  // among all unrolled iterations, due to the order of their construction.
3431  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3432 
3433  // Set the insertion point after the previous value if it is an instruction.
3434  // Note that the previous value may have been constant-folded so it is not
3435  // guaranteed to be an instruction in the vector loop. Also, if the previous
3436  // value is a phi node, we should insert after all the phi nodes to avoid
3437  // breaking basic block verification.
3438  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3439  isa<PHINode>(PreviousLastPart))
3441  else
3443  &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3444 
3445  // We will construct a vector for the recurrence by combining the values for
3446  // the current and previous iterations. This is the required shuffle mask.
3447  SmallVector<Constant *, 8> ShuffleMask(VF);
3448  ShuffleMask[0] = Builder.getInt32(VF - 1);
3449  for (unsigned I = 1; I < VF; ++I)
3450  ShuffleMask[I] = Builder.getInt32(I + VF - 1);
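  // For example, with VF = 4 the mask is <3, 4, 5, 6>: lane 0 takes the last
  // element of the incoming vector (v1 in the sketch above) and lanes 1-3 take
  // the first three elements of the current vector (v2).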
3451 
3452  // The vector from which to take the initial value for the current iteration
3453  // (actual or unrolled). Initially, this is the vector phi node.
3454  Value *Incoming = VecPhi;
3455 
3456  // Shuffle the current and previous vector and update the vector parts.
3457  for (unsigned Part = 0; Part < UF; ++Part) {
3458  Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3459  Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3460  auto *Shuffle =
3461  VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3462  ConstantVector::get(ShuffleMask))
3463  : Incoming;
3464  PhiPart->replaceAllUsesWith(Shuffle);
3465  cast<Instruction>(PhiPart)->eraseFromParent();
3466  VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3467  Incoming = PreviousPart;
3468  }
3469 
3470  // Fix the latch value of the new recurrence in the vector loop.
3471  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3472 
3473  // Extract the last vector element in the middle block. This will be the
3474  // initial value for the recurrence when jumping to the scalar loop.
3475  auto *ExtractForScalar = Incoming;
3476  if (VF > 1) {
3478  ExtractForScalar = Builder.CreateExtractElement(
3479  ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3480  }
3481  // Extract the second last element in the middle block if the
3482  // Phi is used outside the loop. We need to extract the phi itself
3483  // and not the last element (the phi update in the current iteration). This
3484  // will be the value when jumping to the exit block from the LoopMiddleBlock,
3485  // when the scalar loop is not run at all.
3486  Value *ExtractForPhiUsedOutsideLoop = nullptr;
3487  if (VF > 1)
3488  ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3489  Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3490  // When the loop is unrolled without vectorizing, initialize
3491  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value of
3492  // `Incoming`. This is analogous to the vectorized case above: extracting the
3493  // second last element when VF > 1.
3494  else if (UF > 1)
3495  ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3496 
3497  // Fix the initial value of the original recurrence in the scalar loop.
3499  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3500  for (auto *BB : predecessors(LoopScalarPreHeader)) {
3501  auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3502  Start->addIncoming(Incoming, BB);
3503  }
3504 
3506  Phi->setName("scalar.recur");
3507 
3508  // Finally, fix users of the recurrence outside the loop. The users will need
3509  // either the last value of the scalar recurrence or the last value of the
3510  // vector recurrence we extracted in the middle block. Since the loop is in
3511  // LCSSA form, we just need to find all the phi nodes for the original scalar
3512  // recurrence in the exit block, and then add an edge for the middle block.
3513  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3514  if (LCSSAPhi.getIncomingValue(0) == Phi) {
3515  LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3516  }
3517  }
3518 }
3519 
3521  Constant *Zero = Builder.getInt32(0);
3522 
3523  // Get its reduction variable descriptor.
3524  assert(Legal->isReductionVariable(Phi) &&
3525  "Unable to find the reduction variable");
3526  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3527 
3529  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3530  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3531  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3532  RdxDesc.getMinMaxRecurrenceKind();
3533  setDebugLocFromInst(Builder, ReductionStartValue);
3534 
3535  // We need to generate a reduction vector from the incoming scalar.
3536  // To do so, we need to generate the 'identity' vector and override
3537  // one of the elements with the incoming scalar reduction. We need
3538  // to do it in the vector-loop preheader.
3540 
3541  // This is the vector-clone of the value that leaves the loop.
3542  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3543 
3544  // Find the reduction identity variable: zero for addition, or, and xor;
3545  // one for multiplication; -1 for and.
3546  Value *Identity;
3547  Value *VectorStart;
3550  // MinMax reductions have the start value as their identity.
3551  if (VF == 1) {
3552  VectorStart = Identity = ReductionStartValue;
3553  } else {
3554  VectorStart = Identity =
3555  Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3556  }
3557  } else {
3558  // Handle other reduction kinds:
3560  RK, VecTy->getScalarType());
3561  if (VF == 1) {
3562  Identity = Iden;
3563  // This vector is the Identity vector where the first element is the
3564  // incoming scalar reduction.
3565  VectorStart = ReductionStartValue;
3566  } else {
3567  Identity = ConstantVector::getSplat(VF, Iden);
3568 
3569  // This vector is the Identity vector where the first element is the
3570  // incoming scalar reduction.
3571  VectorStart =
3572  Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3573  }
3574  }
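  // Illustrative example with hypothetical values: for an integer add
  // reduction with start value 5 and VF = 4, Identity is <0, 0, 0, 0> and
  // VectorStart is <5, 0, 0, 0>, so the start value enters the reduction
  // exactly once, in lane 0 of the first unroll part.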
3575 
3576  // Fix the vector-loop phi.
3577 
3578  // Reductions do not have to start at zero. They can start with
3579  // any loop invariant values.
3580  BasicBlock *Latch = OrigLoop->getLoopLatch();
3581  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3582  for (unsigned Part = 0; Part < UF; ++Part) {
3583  Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3584  Value *Val = getOrCreateVectorValue(LoopVal, Part);
3585  // Make sure to add the reduction start value only to the
3586  // first unroll part.
3587  Value *StartVal = (Part == 0) ? VectorStart : Identity;
3588  cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3589  cast<PHINode>(VecRdxPhi)
3590  ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3591  }
3592 
3593  // Before each round, move the insertion point right between
3594  // the PHIs and the values we are going to write.
3595  // This allows us to write both PHINodes and the extractelement
3596  // instructions.
3598 
3599  setDebugLocFromInst(Builder, LoopExitInst);
3600 
3601  // If the vector reduction can be performed in a smaller type, we truncate
3602  // then extend the loop exit value to enable InstCombine to evaluate the
3603  // entire expression in the smaller type.
3604  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3605  Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3608  VectorParts RdxParts(UF);
3609  for (unsigned Part = 0; Part < UF; ++Part) {
3610  RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3611  Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3612  Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3613  : Builder.CreateZExt(Trunc, VecTy);
3614  for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3615  UI != RdxParts[Part]->user_end();)
3616  if (*UI != Trunc) {
3617  (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3618  RdxParts[Part] = Extnd;
3619  } else {
3620  ++UI;
3621  }
3622  }
3624  for (unsigned Part = 0; Part < UF; ++Part) {
3625  RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3626  VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3627  }
3628  }
3629 
3630  // Reduce all of the unrolled parts into a single vector.
3631  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3633 
3634  // The middle block terminator has already been assigned a DebugLoc here (the
3635  // OrigLoop's single latch terminator). We want the whole middle block to
3636  // appear to execute on this line because: (a) it is all compiler generated,
3637  // (b) these instructions are always executed after evaluating the latch
3638  // conditional branch, and (c) other passes may add new predecessors which
3639  // terminate on this line. This is the easiest way to ensure we don't
3640  // accidentally cause an extra step back into the loop while debugging.
3642  for (unsigned Part = 1; Part < UF; ++Part) {
3643  Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3644  if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3645  // Floating point operations had to be 'fast' to enable the reduction.
3646  ReducedPartRdx = addFastMathFlag(
3648  ReducedPartRdx, "bin.rdx"),
3649  RdxDesc.getFastMathFlags());
3650  else
3651  ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3652  RdxPart);
3653  }
3654 
3655  if (VF > 1) {
3656  bool NoNaN = Legal->hasFunNoNaNAttr();
3657  ReducedPartRdx =
3658  createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3659  // If the reduction can be performed in a smaller type, we need to extend
3660  // the reduction to the wider type before we branch to the original loop.
3661  if (Phi->getType() != RdxDesc.getRecurrenceType())
3662  ReducedPartRdx =
3663  RdxDesc.isSigned()
3664  ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3665  : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3666  }
3667 
3668  // Create a phi node that merges control-flow from the backedge-taken check
3669  // block and the middle block.
3670  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3672  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3673  BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3674  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3675 
3676  // Now, we need to fix the users of the reduction variable
3677  // inside and outside of the scalar remainder loop.
3678  // We know that the loop is in LCSSA form. We need to update the
3679  // PHI nodes in the exit blocks.
3680  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3681  // All PHINodes need to have a single entry edge, or two if
3682  // we already fixed them.
3683  assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3684 
3685  // We found a reduction value exit-PHI. Update it with the
3686  // incoming bypass edge.
3687  if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3688  LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3689  } // end of the LCSSA phi scan.
3690 
3691  // Fix the scalar loop reduction variable with the incoming reduction sum
3692  // from the vector body and from the backedge value.
3693  int IncomingEdgeBlockIdx =
3695  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3696  // Pick the other block.
3697  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3698  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3699  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3700 }
3701 
3703  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3704  if (LCSSAPhi.getNumIncomingValues() == 1) {
3705  auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3706  // Non-instruction incoming values will have only one value.
3707  unsigned LastLane = 0;
3708  if (isa<Instruction>(IncomingValue))
3709  LastLane = Cost->isUniformAfterVectorization(
3710  cast<Instruction>(IncomingValue), VF)
3711  ? 0
3712  : VF - 1;
3713  // Can be a loop invariant incoming value or the last scalar value to be
3714  // extracted from the vectorized loop.
3716  Value *lastIncomingValue =
3717  getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3718  LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3719  }
3720  }
3721 }
3722 
3724  // The basic block and loop containing the predicated instruction.
3725  auto *PredBB = PredInst->getParent();
3726  auto *VectorLoop = LI->getLoopFor(PredBB);
3727 
3728  // Initialize a worklist with the operands of the predicated instruction.
3729  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3730 
3731  // Holds instructions that we need to analyze again. An instruction may be
3732  // reanalyzed if we don't yet know if we can sink it or not.
3733  SmallVector<Instruction *, 8> InstsToReanalyze;
3734 
3735  // Returns true if a given use occurs in the predicated block. Phi nodes use
3736  // their operands in their corresponding predecessor blocks.
3737  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3738  auto *I = cast<Instruction>(U.getUser());
3739  BasicBlock *BB = I->getParent();
3740  if (auto *Phi = dyn_cast<PHINode>(I))
3741  BB = Phi->getIncomingBlock(
3742  PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3743  return BB == PredBB;
3744  };
3745 
3746  // Iteratively sink the scalarized operands of the predicated instruction
3747  // into the block we created for it. When an instruction is sunk, its
3748  // operands are then added to the worklist. The algorithm ends when one pass
3749  // through the worklist doesn't sink a single instruction.
3750  bool Changed;
3751  do {
3752  // Add the instructions that need to be reanalyzed to the worklist, and
3753  // reset the changed indicator.
3754  Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3755  InstsToReanalyze.clear();
3756  Changed = false;
3757 
3758  while (!Worklist.empty()) {
3759  auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3760 
3761  // We can't sink an instruction if it is a phi node, is already in the
3762  // predicated block, is not in the loop, or may have side effects.
3763  if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3764  !VectorLoop->contains(I) || I->mayHaveSideEffects())
3765  continue;
3766 
3767  // It's legal to sink the instruction if all its uses occur in the
3768  // predicated block. Otherwise, there's nothing to do yet, and we may
3769  // need to reanalyze the instruction.
3770  if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3771  InstsToReanalyze.push_back(I);
3772  continue;
3773  }
3774 
3775  // Move the instruction to the beginning of the predicated block, and add
3776  // its operands to the worklist.
3777  I->moveBefore(&*PredBB->getFirstInsertionPt());
3778  Worklist.insert(I->op_begin(), I->op_end());
3779 
3780  // The sinking may have enabled other instructions to be sunk, so we will
3781  // need to iterate.
3782  Changed = true;
3783  }
3784  } while (Changed);
3785 }
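// Illustrative example of the sinking above, in shorthand IR (hypothetical):
// for a scalarized predicated store whose address is only used by that store,
//
//   %ofs  = add i64 %i, 4
//   %addr = getelementptr i32, i32* %p, i64 %ofs
//   ...
//   pred.store.if:
//     store i32 %x, i32* %addr
//
// %addr is sunk into pred.store.if on the first pass, which then makes %ofs
// sinkable on the next pass.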
3786 
3788  for (PHINode *OrigPhi : OrigPHIsToFix) {
3789  PHINode *NewPhi =
3790  cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3791  unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3792 
3793  SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3794  predecessors(OrigPhi->getParent()));
3795  SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3796  predecessors(NewPhi->getParent()));
3797  assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3798  "Scalar and Vector BB should have the same number of predecessors");
3799 
3800  // The insertion point in Builder may be invalidated by the time we get
3801  // here. Force the Builder insertion point to something valid so that we do
3802  // not run into issues during insertion point restore in
3803  // getOrCreateVectorValue calls below.
3804  Builder.SetInsertPoint(NewPhi);
3805 
3806  // The predecessor order is preserved and we can rely on mapping between
3807  // scalar and vector block predecessors.
3808  for (unsigned i = 0; i < NumIncomingValues; ++i) {
3809  BasicBlock *NewPredBB = VectorBBPredecessors[i];
3810 
3811  // When looking up the new scalar/vector values to fix up, use incoming
3812  // values from original phi.
3813  Value *ScIncV =
3814  OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3815 
3816  // Scalar incoming value may need a broadcast
3817  Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3818  NewPhi->addIncoming(NewIncV, NewPredBB);
3819  }
3820  }
3821 }
3822 
3824  unsigned VF) {
3825  PHINode *P = cast<PHINode>(PN);
3826  if (EnableVPlanNativePath) {
3827  // Currently we enter here in the VPlan-native path for non-induction
3828  // PHIs where all control flow is uniform. We simply widen these PHIs.
3829  // Create a vector phi with no operands - the vector phi operands will be
3830  // set at the end of vector code generation.
3831  Type *VecTy =
3832  (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3833  Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3834  VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3835  OrigPHIsToFix.push_back(P);
3836 
3837  return;
3838  }
3839 
3840  assert(PN->getParent() == OrigLoop->getHeader() &&
3841  "Non-header phis should have been handled elsewhere");
3842 
3843  // In order to support recurrences we need to be able to vectorize Phi nodes.
3844  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3845  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3846  // this value when we vectorize all of the instructions that use the PHI.
3848  for (unsigned Part = 0; Part < UF; ++Part) {
3849  // This is phase one of vectorizing PHIs.
3850  Type *VecTy =
3851  (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3852  Value *EntryPart = PHINode::Create(
3853  VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3854  VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3855  }
3856  return;
3857  }
3858 
3860 
3861  // This PHINode must be an induction variable.
3862  // Make sure that we know about it.
3863  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
3864 
3866  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3867 
3868  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
3869  // which can be found from the original scalar operations.
3870  switch (II.getKind()) {
3872  llvm_unreachable("Unknown induction");
3875  llvm_unreachable("Integer/fp induction is handled elsewhere.");
3877  // Handle the pointer induction variable case.
3878  assert(P->getType()->isPointerTy() && "Unexpected type.");
3879  // This is the normalized GEP that starts counting at zero.
3880  Value *PtrInd = Induction;
3881  PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
3882  // Determine the number of scalars we need to generate for each unroll
3883  // iteration. If the instruction is uniform, we only need to generate the
3884  // first lane. Otherwise, we generate all VF values.
3885  unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
3886  // These are the scalar results. Notice that we don't generate vector GEPs
3887  // because scalar GEPs result in better code.
3888  for (unsigned Part = 0; Part < UF; ++Part) {
3889  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
3890  Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
3891  Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
3892  Value *SclrGep =
3893  emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
3894  SclrGep->setName("next.gep");
3895  VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
3896  }
3897  }
3898  return;
3899  }
3900  }
3901 }
3902 
3903 /// A helper function for checking whether an integer division-related
3904 /// instruction may divide by zero (in which case it must be predicated if
3905 /// executed conditionally in the scalar code).
3906 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
3907 /// Non-zero divisors that are not compile-time constants will not be
3908 /// converted into multiplication, so we will still end up scalarizing
3909 /// the division, but can do so w/o predication.
3911  assert((I.getOpcode() == Instruction::UDiv ||
3912  I.getOpcode() == Instruction::SDiv ||
3913  I.getOpcode() == Instruction::URem ||
3914  I.getOpcode() == Instruction::SRem) &&
3915  "Unexpected instruction");
3916  Value *Divisor = I.getOperand(1);
3917  auto *CInt = dyn_cast<ConstantInt>(Divisor);
3918  return !CInt || CInt->isZero();
3919 }
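// For example (illustrative): in
//
//   if (c[i]) s += a[i] / b[i];   // divisor unknown    -> needs predication
//   if (c[i]) s += a[i] / 8;      // non-zero constant  -> no predication
//
// the first division may trap when c[i] is false and b[i] happens to be zero,
// so it must stay predicated, while the second can safely execute on all
// lanes.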
3920 
3922  switch (I.getOpcode()) {
3923  case Instruction::Br:
3924  case Instruction::PHI:
3925  llvm_unreachable("This instruction is handled by a different recipe.");
3926  case Instruction::GetElementPtr: {
3927  // Construct a vector GEP by widening the operands of the scalar GEP as
3928  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
3929  // results in a vector of pointers when at least one operand of the GEP
3930  // is vector-typed. Thus, to keep the representation compact, we only use
3931  // vector-typed operands for loop-varying values.
3932  auto *GEP = cast<GetElementPtrInst>(&I);
3933 
3934  if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
3935  // If we are vectorizing, but the GEP has only loop-invariant operands,
3936  // the GEP we build (by only using vector-typed operands for
3937  // loop-varying values) would be a scalar pointer. Thus, to ensure we
3938  // produce a vector of pointers, we need to either arbitrarily pick an
3939  // operand to broadcast, or broadcast a clone of the original GEP.
3940  // Here, we broadcast a clone of the original.
3941  //
3942  // TODO: If at some point we decide to scalarize instructions having
3943  // loop-invariant operands, this special case will no longer be
3944  // required. We would add the scalarization decision to
3945  // collectLoopScalars() and teach getVectorValue() to broadcast
3946  // the lane-zero scalar value.
3947  auto *Clone = Builder.Insert(GEP->clone());
3948  for (unsigned Part = 0; Part < UF; ++Part) {
3949  Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
3950  VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
3951  addMetadata(EntryPart, GEP);
3952  }
3953  } else {
3954  // If the GEP has at least one loop-varying operand, we are sure to
3955  // produce a vector of pointers. But if we are only unrolling, we want
3956  // to produce a scalar GEP for each unroll part. Thus, the GEP we
3957  // produce with the code below will be scalar (if VF == 1) or vector
3958  // (otherwise). Note that for the unroll-only case, we still maintain
3959  // values in the vector mapping with initVector, as we do for other
3960  // instructions.
3961  for (unsigned Part = 0; Part < UF; ++Part) {
3962  // The pointer operand of the new GEP. If it's loop-invariant, we
3963  // won't broadcast it.
3964  auto *Ptr =
3965  OrigLoop->isLoopInvariant(GEP->getPointerOperand())
3966  ? GEP->getPointerOperand()
3967  : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
3968 
3969  // Collect all the indices for the new GEP. If any index is
3970  // loop-invariant, we won't broadcast it.
3971  SmallVector<Value *, 4> Indices;
3972  for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
3973  if (OrigLoop->isLoopInvariant(U.get()))
3974  Indices.push_back(U.get());
3975  else
3976  Indices.push_back(getOrCreateVectorValue(U.get(), Part));
3977  }
3978 
3979  // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
3980  // but it should be a vector, otherwise.
3981  auto *NewGEP =
3982  GEP->isInBounds()
3983  ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
3984  Indices)
3985  : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
3986  assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
3987  "NewGEP is not a pointer vector");
3988  VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
3989  addMetadata(NewGEP, GEP);
3990  }
3991  }
3992 
3993  break;
3994  }
3995  case Instruction::UDiv:
3996  case Instruction::SDiv:
3997  case Instruction::SRem:
3998  case Instruction::URem:
3999  case Instruction::Add:
4000  case Instruction::FAdd:
4001  case Instruction::Sub:
4002  case Instruction::FSub:
4003  case Instruction::FNeg:
4004  case Instruction::Mul:
4005  case Instruction::FMul:
4006  case Instruction::FDiv:
4007  case Instruction::FRem:
4008  case Instruction::Shl:
4009  case Instruction::LShr:
4010  case Instruction::AShr:
4011  case Instruction::And:
4012  case Instruction::Or:
4013  case Instruction::Xor: {
4014  // Just widen unops and binops.
4016 
4017  for (unsigned Part = 0; Part < UF; ++Part) {
4019  for (Value *Op : I.operands())
4020  Ops.push_back(getOrCreateVectorValue(Op, Part));
4021 
4022  Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4023 
4024  if (auto *VecOp = dyn_cast<Instruction>(V))
4025  VecOp->copyIRFlags(&I);
4026 
4027  // Use this vector value for all users of the original instruction.
4028  VectorLoopValueMap.setVectorValue(&I, Part, V);
4029  addMetadata(V, &I);
4030  }
4031 
4032  break;
4033  }
4034  case Instruction::Select: {
4035  // Widen selects.
4036  // If the selector is loop invariant we can create a select
4037  // instruction with a scalar condition. Otherwise, use vector-select.
4038  auto *SE = PSE.getSE();
4039  bool InvariantCond =
4042 
4043  // The condition can be loop invariant but still defined inside the
4044  // loop. This means that we can't just use the original 'cond' value.
4045  // We have to take the 'vectorized' value and pick the first lane.
4046  // Instcombine will make this a no-op.
4047 
4048  auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4049 
4050  for (unsigned Part = 0; Part < UF; ++Part) {
4051  Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4052  Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4053  Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4054  Value *Sel =
4055  Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4056  VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4057  addMetadata(Sel, &I);
4058  }
4059 
4060  break;
4061  }
4062 
4063  case Instruction::ICmp:
4064  case Instruction::FCmp: {
4065  // Widen compares. Generate vector compares.
4066  bool FCmp = (I.getOpcode() == Instruction::FCmp);
4067  auto *Cmp = dyn_cast<CmpInst>(&I);
4069  for (unsigned Part = 0; Part < UF; ++Part) {
4070  Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4071  Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4072  Value *C = nullptr;
4073  if (FCmp) {
4074  // Propagate fast math flags.
4076  Builder.setFastMathFlags(Cmp->getFastMathFlags());
4077  C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4078  } else {
4079  C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4080  }
4081  VectorLoopValueMap.setVectorValue(&I, Part, C);
4082  addMetadata(C, &I);
4083  }
4084 
4085  break;
4086  }
4087 
4088  case Instruction::ZExt:
4089  case Instruction::SExt:
4090  case Instruction::FPToUI:
4091  case Instruction::FPToSI:
4092  case Instruction::FPExt:
4093  case Instruction::PtrToInt:
4094  case Instruction::IntToPtr:
4095  case Instruction::SIToFP:
4096  case Instruction::UIToFP:
4097  case Instruction::Trunc:
4098  case Instruction::FPTrunc:
4099  case Instruction::BitCast: {
4100  auto *CI = dyn_cast<CastInst>(&I);
4102 
4103  /// Vectorize casts.
4104  Type *DestTy =
4105  (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4106 
4107  for (unsigned Part = 0; Part < UF; ++Part) {
4108  Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4109  Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4110  VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4111  addMetadata(Cast, &I);
4112  }
4113  break;
4114  }
4115 
4116  case Instruction::Call: {
4117  // Ignore dbg intrinsics.
4118  if (isa<DbgInfoIntrinsic>(I))
4119  break;
4121 
4122  Module *M = I.getParent()->getParent()->getParent();
4123  auto *CI = cast<CallInst>(&I);
4124 
4125  StringRef FnName = CI->getCalledFunction()->getName();
4126  Function *F = CI->getCalledFunction();
4127  Type *RetTy = ToVectorTy(CI->getType(), VF);
4129  for (Value *ArgOperand : CI->arg_operands())
4130  Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4131 
4133 
4134  // The flag shows whether we use an Intrinsic or a usual Call for the
4135  // vectorized version of the instruction.
4136  // Is it beneficial to perform the intrinsic call compared to the lib call?
4137  bool NeedToScalarize;
4138  unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4139  bool UseVectorIntrinsic =
4140  ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4141  assert((UseVectorIntrinsic || !NeedToScalarize) &&
4142  "Instruction should be scalarized elsewhere.");
4143 
4144  for (unsigned Part = 0; Part < UF; ++Part) {
4146  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4147  Value *Arg = CI->getArgOperand(i);
4148  // Some intrinsics have a scalar argument - don't replace it with a
4149  // vector.
4150  if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4151  Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4152  Args.push_back(Arg);
4153  }
4154 
4155  Function *VectorF;
4156  if (UseVectorIntrinsic) {
4157  // Use vector version of the intrinsic.
4158  Type *TysForDecl[] = {CI->getType()};
4159  if (VF > 1)
4160  TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4161  VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4162  } else {
4163  // Use vector version of the library call.
4164  StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4165  assert(!VFnName.empty() && "Vector function name is empty.");
4166  VectorF = M->getFunction(VFnName);
4167  if (!VectorF) {
4168  // Generate a declaration
4169  FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4170  VectorF =
4171  Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4172  VectorF->copyAttributesFrom(F);
4173  }
4174  }
4175  assert(VectorF && "Can't create vector function.");
4176 
4178  CI->getOperandBundlesAsDefs(OpBundles);
4179  CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4180 
4181  if (isa<FPMathOperator>(V))
4182  V->copyFastMathFlags(CI);
4183 
4184  VectorLoopValueMap.setVectorValue(&I, Part, V);
4185  addMetadata(V, &I);
4186  }
4187 
4188  break;
4189  }
4190 
4191  default:
4192  // This instruction is not vectorized by simple widening.
4193  LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4194  llvm_unreachable("Unhandled instruction!");
4195  } // end of switch.
4196 }
4197 
4199  // Forget the original basic block.
4201 
4202  // DT is not kept up-to-date for outer loop vectorization
4204  return;
4205 
4206  // Update the dominator tree information.
4208  "Entry does not dominate exit.");
4209 
4216 }
4217 
4218 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4219  // We should not collect Scalars more than once per VF. Right now, this
4220  // function is called from collectUniformsAndScalars(), which already does
4221  // this check. Collecting Scalars for VF=1 does not make any sense.
4222  assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4223  "This function should not be visited twice for the same VF");
4224 
4226 
4227  // These sets are used to seed the analysis with pointers used by memory
4228  // accesses that will remain scalar.
4230  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4231 
4232  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4233  // The pointer operands of loads and stores will be scalar as long as the
4234  // memory access is not a gather or scatter operation. The value operand of a
4235  // store will remain scalar if the store is scalarized.
4236  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4237  InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4238  assert(WideningDecision != CM_Unknown &&
4239  "Widening decision should be ready at this moment");
4240  if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4241  if (Ptr == Store->getValueOperand())
4242  return WideningDecision == CM_Scalarize;
4243  assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4244  "Ptr is neither a value or pointer operand");
4245  return WideningDecision != CM_GatherScatter;
4246  };
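  // For example (illustrative): the GEP feeding a consecutive load of 'a[i]'
  // remains a scalar use (one address per unrolled part is enough), whereas
  // the address of a load widened to a gather must become a vector of
  // pointers, so its GEP is not a scalar use.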
4247 
4248  // A helper that returns true if the given value is a bitcast or
4249  // getelementptr instruction contained in the loop.
4250  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4251  return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4252  isa<GetElementPtrInst>(V)) &&
4253  !TheLoop->isLoopInvariant(V);
4254  };
4255 
4256  // A helper that evaluates a memory access's use of a pointer. If the use
4257  // will be a scalar use, and the pointer is only used by memory accesses, we
4258  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4259  // PossibleNonScalarPtrs.
4260  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4261  // We only care about bitcast and getelementptr instructions contained in
4262  // the loop.
4263  if (!isLoopVaryingBitCastOrGEP(Ptr))
4264  return;
4265 
4266  // If the pointer has already been identified as scalar (e.g., if it was
4267  // also identified as uniform), there's nothing to do.
4268  auto *I = cast<Instruction>(Ptr);
4269  if (Worklist.count(I))
4270  return;
4271 
4272  // If the use of the pointer will be a scalar use, and all users of the
4273  // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4274  // place the pointer in PossibleNonScalarPtrs.
4275  if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4276  return isa<LoadInst>(U) || isa<StoreInst>(U);
4277  }))
4278  ScalarPtrs.insert(I);
4279  else
4280  PossibleNonScalarPtrs.insert(I);
4281  };
4282 
4283  // We seed the scalars analysis with three classes of instructions: (1)
4284  // instructions marked uniform-after-vectorization, (2) bitcast and
4285  // getelementptr instructions used by memory accesses requiring a scalar use,
4286  // and (3) pointer induction variables and their update instructions (we
4287  // currently only scalarize these).
4288  //
4289  // (1) Add to the worklist all instructions that have been identified as
4290  // uniform-after-vectorization.
4291  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4292 
4293  // (2) Add to the worklist all bitcast and getelementptr instructions used by
4294  // memory accesses requiring a scalar use. The pointer operands of loads and
4295  // stores will be scalar as long as the memory access is not a gather or
4296  // scatter operation. The value operand of a store will remain scalar if the
4297  // store is scalarized.
4298  for (auto *BB : TheLoop->blocks())
4299  for (auto &I : *BB) {
4300  if (auto *Load = dyn_cast<LoadInst>(&I)) {
4301  evaluatePtrUse(Load, Load->getPointerOperand());
4302  } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4303  evaluatePtrUse(Store, Store->getPointerOperand());
4304  evaluatePtrUse(Store, Store->getValueOperand());
4305  }
4306  }
4307  for (auto *I : ScalarPtrs)
4308  if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4309  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4310  Worklist.insert(I);
4311  }
4312 
4313  // (3) Add to the worklist all pointer induction variables and their update
4314  // instructions.
4315  //
4316  // TODO: Once we are able to vectorize pointer induction variables we should
4317  // no longer insert them into the worklist here.
4318  auto *Latch = TheLoop->getLoopLatch();
4319  for (auto &Induction : *Legal->getInductionVars()) {
4320  auto *Ind = Induction.first;
4321  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4322  if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4323  continue;
4324  Worklist.insert(Ind);
4325  Worklist.insert(IndUpdate);
4326  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4327  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4328  << "\n");
4329  }
4330 
4331  // Insert the forced scalars.
4332  // FIXME: Currently widenPHIInstruction() often creates a dead vector
4333  // induction variable when the PHI user is scalarized.
4334  auto ForcedScalar = ForcedScalars.find(VF);
4335  if (ForcedScalar != ForcedScalars.end())
4336  for (auto *I : ForcedScalar->second)
4337  Worklist.insert(I);
4338 
4339  // Expand the worklist by looking through any bitcasts and getelementptr
4340  // instructions we've already identified as scalar. This is similar to the
4341  // expansion step in collectLoopUniforms(); however, here we're only
4342  // expanding to include additional bitcasts and getelementptr instructions.
4343  unsigned Idx = 0;
4344  while (Idx != Worklist.size()) {
4345  Instruction *Dst = Worklist[Idx++];
4346  if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4347  continue;
4348  auto *Src = cast<Instruction>(Dst->getOperand(0));
4349  if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4350  auto *J = cast<Instruction>(U);
4351  return !TheLoop->contains(J) || Worklist.count(J) ||
4352  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4353  isScalarUse(J, Src));
4354  })) {
4355  Worklist.insert(Src);
4356  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4357  }
4358  }
4359 
4360  // An induction variable will remain scalar if all users of the induction
4361  // variable and induction variable update remain scalar.
4362  for (auto &Induction : *Legal->getInductionVars()) {
4363  auto *Ind = Induction.first;
4364  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4365 
4366  // We already considered pointer induction variables, so there's no reason
4367  // to look at their users again.
4368  //
4369  // TODO: Once we are able to vectorize pointer induction variables we
4370  // should no longer skip over them here.
4371  if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4372  continue;
4373 
4374  // Determine if all users of the induction variable are scalar after
4375  // vectorization.
4376  auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4377  auto *I = cast<Instruction>(U);
4378  return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4379  });
4380  if (!ScalarInd)
4381  continue;
4382 
4383  // Determine if all users of the induction variable update instruction are
4384  // scalar after vectorization.
4385  auto ScalarIndUpdate =
4386  llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4387  auto *I = cast<Instruction>(U);
4388  return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4389  });
4390  if (!ScalarIndUpdate)
4391  continue;
4392 
4393  // The induction variable and its update instruction will remain scalar.
4394  Worklist.insert(Ind);
4395  Worklist.insert(IndUpdate);
4396  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4397  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4398  << "\n");
4399  }
4400 
4401  Scalars[VF].insert(Worklist.begin(), Worklist.end());
4402 }
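// Illustrative example (editorial addition, hypothetical and not part of the
// original source): in a loop computing "a[i] = b[idx[i]]", the consecutive
// store to a[i] only needs the lane-0 address, so the getelementptr producing
// &a[i] is a scalar use and, if its only users are memory accesses, it ends up
// in Scalars[VF]; the load of b[idx[i]] becomes a gather, so the getelementptr
// producing its address is placed in PossibleNonScalarPtrs and stays vector.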
4403 
4404 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4405  if (!blockNeedsPredication(I->getParent()))
4406  return false;
4407  switch(I->getOpcode()) {
4408  default:
4409  break;
4410  case Instruction::Load:
4411  case Instruction::Store: {
4412  if (!Legal->isMaskRequired(I))
4413  return false;
4414  auto *Ptr = getLoadStorePointerOperand(I);
4415  auto *Ty = getMemInstValueType(I);
4416  // We have already decided how to vectorize this instruction, get that
4417  // result.
4418  if (VF > 1) {
4419  InstWidening WideningDecision = getWideningDecision(I, VF);
4420  assert(WideningDecision != CM_Unknown &&
4421  "Widening decision should be ready at this moment");
4422  return WideningDecision == CM_Scalarize;
4423  }
4424  return isa<LoadInst>(I) ?
4425  !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
4426  : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4427  }
4428  case Instruction::UDiv:
4429  case Instruction::SDiv:
4430  case Instruction::SRem:
4431  case Instruction::URem:
4432  return mayDivideByZero(*I);
4433  }
4434  return false;
4435 }
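// Illustrative example (editorial addition, hypothetical and not part of the
// original source): a store guarded by "if (c[i]) a[i] = x;" requires a mask.
// For VF > 1 the function simply reports whether the cost model already chose
// CM_Scalarize for the access; for the scalar case it returns true when the
// target has neither a legal masked store nor a legal scatter for the stored
// type, in which case the access is emitted as a predicated scalar sequence.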
4436 
4437 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4438  unsigned VF) {
4439  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4440  assert(getWideningDecision(I, VF) == CM_Unknown &&
4441  "Decision should not be set yet.");
4442  auto *Group = getInterleavedAccessGroup(I);
4443  assert(Group && "Must have a group.");
4444 
4445  // If the instruction's allocated size doesn't equal its type size, it
4446  // requires padding and will be scalarized.
4447  auto &DL = I->getModule()->getDataLayout();
4448  auto *ScalarTy = getMemInstValueType(I);
4449  if (hasIrregularType(ScalarTy, DL, VF))
4450  return false;
4451 
4452  // Check if masking is required.
4453  // A Group may need masking for one of two reasons: it resides in a block that
4454  // needs predication, or it was decided to use masking to deal with gaps.
4455  bool PredicatedAccessRequiresMasking =
4456  Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4457  bool AccessWithGapsRequiresMasking =
4458  Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
4459  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4460  return true;
4461 
4462  // If masked interleaving is required, we expect that the user/target had
4463  // enabled it, because otherwise it either wouldn't have been created or
4464  // it should have been invalidated by the CostModel.
4465  assert(useMaskedInterleavedAccesses(TTI) &&
4466  "Masked interleave-groups for predicated accesses are not enabled.");
4467 
4468  auto *Ty = getMemInstValueType(I);
4469  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
4470  : TTI.isLegalMaskedStore(Ty);
4471 }
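// Illustrative note (editorial addition, not part of the original source): an
// interleave group with a gap at its end (e.g. loads of a[3*i] and a[3*i+1]
// but never a[3*i+2]) normally relies on the scalar epilogue to avoid reading
// past the end of the underlying array; when that epilogue is disallowed, such
// a group can only be widened if the target supports the corresponding masked
// load or store, which is exactly what the checks above verify.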
4472 
4473 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4474  unsigned VF) {
4475  // Get and ensure we have a valid memory instruction.
4476  LoadInst *LI = dyn_cast<LoadInst>(I);
4477  StoreInst *SI = dyn_cast<StoreInst>(I);
4478  assert((LI || SI) && "Invalid memory instruction");
4479 
4480  auto *Ptr = getLoadStorePointerOperand(I);
4481 
4482  // In order to be widened, the pointer should be consecutive, first of all.
4483  if (!Legal->isConsecutivePtr(Ptr))
4484  return false;
4485 
4486  // If the instruction is a store located in a predicated block, it will be
4487  // scalarized.
4488  if (isScalarWithPredication(I))
4489  return false;
4490 
4491  // If the instruction's allocated size doesn't equal its type size, it
4492  // requires padding and will be scalarized.
4493  auto &DL = I->getModule()->getDataLayout();
4494  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4495  if (hasIrregularType(ScalarTy, DL, VF))
4496  return false;
4497 
4498  return true;
4499 }
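// Illustrative note (editorial addition, not part of the original source): a
// unit-stride, unconditional access to a regularly sized element type (e.g. a
// load of i32 from a[i]) passes all three checks and can be widened into a
// single wide load or store, while a type such as i24, whose 4-byte in-memory
// allocation under common data layouts does not match its packed size inside a
// vector, fails hasIrregularType and is scalarized instead.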
4500 
4501 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4502  // We should not collect Uniforms more than once per VF. Right now,
4503  // this function is called from collectUniformsAndScalars(), which
4504  // already does this check. Collecting Uniforms for VF=1 does not make any
4505  // sense.
4506 
4507  assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4508  "This function should not be visited twice for the same VF");
4509 
4510  // Visit the list of Uniforms. Even if we do not find any uniform value, we
4511  // will not analyze this VF again: Uniforms.count(VF) will return 1.
4512  Uniforms[VF].clear();
4513 
4514  // We now know that the loop is vectorizable!
4515  // Collect instructions inside the loop that will remain uniform after
4516  // vectorization.
4517 
4518  // Global values, params and instructions outside of current loop are out of
4519  // scope.
4520  auto isOutOfScope = [&](Value *V) -> bool {
4521  Instruction *I = dyn_cast<Instruction>(V);
4522  return (!I || !TheLoop->contains(I));
4523  };
4524 
4525  SetVector<Instruction *> Worklist;
4526  BasicBlock *Latch = TheLoop->getLoopLatch();
4527 
4528  // Start with the conditional branch. If the branch condition is an
4529  // instruction contained in the loop that is only used by the branch, it is
4530  // uniform.
4531  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4532  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4533  Worklist.insert(Cmp);
4534  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4535  }
4536 
4537  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4538  // are pointers that are treated like consecutive pointers during
4539  // vectorization. The pointer operands of interleaved accesses are an
4540  // example.
4541  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4542 
4543  // Holds pointer operands of instructions that are possibly non-uniform.
4544  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4545 
4546  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4547  InstWidening WideningDecision = getWideningDecision(I, VF);
4548  assert(WideningDecision != CM_Unknown &&
4549  "Widening decision should be ready at this moment");
4550 
4551  return (WideningDecision == CM_Widen ||
4552  WideningDecision == CM_Widen_Reverse ||
4553  WideningDecision == CM_Interleave);
4554  };
4555  // Iterate over the instructions in the loop, and collect all
4556  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4557  // that a consecutive-like pointer operand will be scalarized, we collect it
4558  // in PossibleNonUniformPtrs instead. We use two sets here because a single
4559  // getelementptr instruction can be used by both vectorized and scalarized
4560  // memory instructions. For example, if a loop loads and stores from the same
4561  // location, but the store is conditional, the store will be scalarized, and
4562  // the getelementptr won't remain uniform.
4563  for (auto *BB : TheLoop->blocks())
4564  for (auto &I : *BB) {
4565  // If there's no pointer operand, there's nothing to do.
4566  auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4567  if (!Ptr)
4568  continue;
4569 
4570  // True if all users of Ptr are memory accesses that have Ptr as their
4571  // pointer operand.
4572  auto UsersAreMemAccesses =
4573  llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4574  return getLoadStorePointerOperand(U) == Ptr;
4575  });
4576 
4577  // Ensure the memory instruction will not be scalarized or used by
4578  // gather/scatter, making its pointer operand non-uniform. If the pointer
4579  // operand is used by any instruction other than a memory access, we
4580  // conservatively assume the pointer operand may be non-uniform.
4581  if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4582  PossibleNonUniformPtrs.insert(Ptr);
4583 
4584  // If the memory instruction will be vectorized and its pointer operand
4585  // is consecutive-like, or the access is part of an interleave group,
4586  // the pointer operand should remain uniform.
4587  else
4588  ConsecutiveLikePtrs.insert(Ptr);
4589  }
4590 
4591  // Add to the Worklist all consecutive and consecutive-like pointers that
4592  // aren't also identified as possibly non-uniform.
4593  for (auto *V : ConsecutiveLikePtrs)
4594  if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4595  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4596  Worklist.insert(V);
4597  }
4598 
4599  // Expand Worklist in topological order: whenever a new instruction
4600  // is added, its users should already be inside the Worklist. This ensures
4601  // that a uniform instruction will only be used by uniform instructions.
4602  unsigned idx = 0;
4603  while (idx != Worklist.size()) {
4604  Instruction *I = Worklist[idx++];
4605 
4606  for (auto OV : I->operand_values()) {
4607  // isOutOfScope operands cannot be uniform instructions.
4608  if (isOutOfScope(OV))
4609  continue;
4610  // First order recurrence Phi's should typically be considered
4611  // non-uniform.
4612  auto *OP = dyn_cast<PHINode>(OV);
4613  if (OP && Legal->isFirstOrderRecurrence(OP))
4614  continue;
4615  // If all the users of the operand are uniform, then add the
4616  // operand into the uniform worklist.
4617  auto *OI = cast<Instruction>(OV);
4618  if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4619  auto *J = cast<Instruction>(U);
4620  return Worklist.count(J) ||
4621  (OI == getLoadStorePointerOperand(J) &&
4622  isUniformDecision(J, VF));
4623  })) {
4624  Worklist.insert(OI);
4625  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4626  }
4627  }
4628  }
4629 
4630  // Returns true if Ptr is the pointer operand of a memory access instruction
4631  // I, and I is known to not require scalarization.
4632  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4633  return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4634  };
4635 
4636  // For an instruction to be added into Worklist above, all its users inside
4637  // the loop should also be in Worklist. However, this condition cannot be
4638  // true for phi nodes that form a cyclic dependence. We must process phi
4639  // nodes separately. An induction variable will remain uniform if all users
4640  // of the induction variable and induction variable update remain uniform.
4641  // The code below handles both pointer and non-pointer induction variables.
4642  for (auto &Induction : *Legal->getInductionVars()) {
4643  auto *Ind = Induction.first;
4644  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4645 
4646  // Determine if all users of the induction variable are uniform after
4647  // vectorization.
4648  auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4649  auto *I = cast<Instruction>(U);
4650  return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4651  isVectorizedMemAccessUse(I, Ind);
4652  });
4653  if (!UniformInd)
4654  continue;
4655 
4656  // Determine if all users of the induction variable update instruction are
4657  // uniform after vectorization.
4658  auto UniformIndUpdate =
4659  llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4660  auto *I = cast<Instruction>(U);
4661  return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4662  isVectorizedMemAccessUse(I, IndUpdate);
4663  });
4664  if (!UniformIndUpdate)
4665  continue;
4666 
4667  // The induction variable and its update instruction will remain uniform.
4668  Worklist.insert(Ind);
4669  Worklist.insert(IndUpdate);
4670  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4671  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4672  << "\n");
4673  }
4674 
4675  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4676 }
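// Illustrative example (editorial addition, hypothetical and not part of the
// original source): in a canonical counted loop whose latch ends in
//   %iv.next = add i64 %iv, 1
//   %cmp = icmp eq i64 %iv.next, %n
//   br i1 %cmp, label %exit, label %body
// the compare feeding the latch branch is seeded as uniform (it has a single
// use), and if every remaining in-loop user of %iv and %iv.next is already in
// the worklist or is the address of a widened consecutive access, the
// induction and its update are added to Uniforms[VF] as well.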
4677 
4678 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
4679  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4680  // TODO: It may be useful to do this since it's still likely to be dynamically
4681  // uniform if the target can skip.
4682  LLVM_DEBUG(
4683  dbgs() << "LV: Not inserting runtime ptr check for divergent target");
4684 
4685  ORE->emit(
4686  createMissedAnalysis("CantVersionLoopWithDivergentTarget")
4687  << "runtime pointer checks needed. Not enabled for divergent target");
4688 
4689  return None;
4690  }
4691 
4692  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4693  if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
4694  return computeFeasibleMaxVF(OptForSize, TC);
4695 
4696  if (Legal->getRuntimePointerChecking()->Need) {
4697  ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4698  << "runtime pointer checks needed. Enable vectorization of this "
4699  "loop with '#pragma clang loop vectorize(enable)' when "
4700  "compiling with -Os/-Oz");
4701  LLVM_DEBUG(
4702  dbgs()
4703  << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
4704  return None;
4705  }
4706 
4707  if (!PSE.getUnionPredicate().getPredicates().empty()) {
4708  ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4709  << "runtime SCEV checks needed. Enable vectorization of this "
4710  "loop with '#pragma clang loop vectorize(enable)' when "
4711  "compiling with -Os/-Oz");
4712  LLVM_DEBUG(
4713  dbgs()
4714  << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n");
4715  return None;
4716  }
4717 
4718  // FIXME: Avoid specializing for stride==1 instead of bailing out.
4719  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4720  ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4721  << "runtime stride == 1 checks needed. Enable vectorization of "
4722  "this loop with '#pragma clang loop vectorize(enable)' when "
4723  "compiling with -Os/-Oz");
4724  LLVM_DEBUG(
4725  dbgs()
4726  << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n");
4727  return None;
4728  }
4729 
4730  // If we optimize the program for size, avoid creating the tail loop.
4731  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4732 
4733  if (TC == 1) {
4734  ORE->emit(createMissedAnalysis("SingleIterationLoop")
4735  << "loop trip count is one, irrelevant for vectorization");
4736  LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n");
4737  return None;
4738  }
4739 
4740  // Record that scalar epilogue is not allowed.
4741  LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4742 
4743  IsScalarEpilogueAllowed = !OptForSize;
4744 
4745  // We don't create an epilogue when optimizing for size.
4746  // Invalidate interleave groups that require an epilogue if we can't mask
4747  // the interleave-group.
4748  if (!IsScalarEpilogueAllowed)
4749  InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4750 
4751  unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
4752 
4753  if (TC > 0 && TC % MaxVF == 0) {
4754  LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4755  return MaxVF;
4756  }
4757 
4758  // If we don't know the precise trip count, or if the trip count that we
4759  // found modulo the vectorization factor is not zero, try to fold the tail
4760  // by masking.
4761  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4762  if (Legal->canFoldTailByMasking()) {
4763  FoldTailByMasking = true;
4764  return MaxVF;
4765  }
4766 
4767  if (TC == 0) {
4768  ORE->emit(
4769  createMissedAnalysis("UnknownLoopCountComplexCFG")
4770  << "unable to calculate the loop count due to complex control flow");
4771  return None;
4772  }
4773 
4774  ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
4775  << "cannot optimize for size and vectorize at the same time. "
4776  "Enable vectorization of this loop with '#pragma clang loop "
4777  "vectorize(enable)' when compiling with -Os/-Oz");
4778  return None;
4779 }
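// Illustrative walk-through (editorial addition, not part of the original
// source): under -Os with a known trip count of 17 and a computed MaxVF of 4,
// 17 % 4 != 0 leaves a tail; since no scalar epilogue is allowed, the loop is
// vectorized only if the tail can be folded by masking, in which case
// FoldTailByMasking is set and 4 is returned; otherwise None is returned with
// a "NoTailLoopWithOptForSize" remark.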
4780 
4781 unsigned
4782 LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
4783  unsigned ConstTripCount) {
4784  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4785  unsigned SmallestType, WidestType;
4786  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4787  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4788 
4789  // Get the maximum safe dependence distance in bits computed by LAA.
4790  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4791  // the memory access that is most restrictive (involved in the smallest
4792  // dependence distance).
4793  unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4794 
4795  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4796 
4797  unsigned MaxVectorSize = WidestRegister / WidestType;
4798 
4799  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4800  << " / " << WidestType << " bits.\n");
4801  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4802  << WidestRegister << " bits.\n");
4803 
4804  assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4805  " into one vector!");
4806  if (MaxVectorSize == 0) {
4807  LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4808  MaxVectorSize = 1;
4809  return MaxVectorSize;
4810  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4811  isPowerOf2_32(ConstTripCount)) {
4812  // We need to clamp the VF to be the ConstTripCount. There is no point in
4813  // choosing a higher viable VF as done in the loop below.
4814  LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4815  << ConstTripCount << "\n");
4816  MaxVectorSize = ConstTripCount;
4817  return MaxVectorSize;
4818  }
4819 
4820  unsigned MaxVF = MaxVectorSize;
4821  if (TTI.shouldMaximizeVectorBandwidth(OptForSize) ||
4822  (MaximizeBandwidth && !OptForSize)) {
4823  // Collect all viable vectorization factors larger than the default MaxVF
4824  // (i.e. MaxVectorSize).
4825  SmallVector<unsigned, 8> VFs;
4826  unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4827  for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4828  VFs.push_back(VS);
4829 
4830  // For each VF calculate its register usage.
4831  auto RUs = calculateRegisterUsage(VFs);
4832 
4833  // Select the largest VF which doesn't require more registers than existing
4834  // ones.
4835  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
4836  for (int i = RUs.size() - 1; i >= 0; --i) {
4837  if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
4838  MaxVF = VFs[i];
4839  break;
4840  }
4841  }
4842  if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
4843  if (MaxVF < MinVF) {
4844  LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4845  << ") with target's minimum: " << MinVF << '\n');
4846  MaxVF = MinVF;
4847  }
4848  }
4849  }
4850  return MaxVF;
4851 }
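// Illustrative arithmetic (editorial addition, not part of the original
// source): with 256-bit vector registers, a widest element type of 32 bits and
// no limiting memory dependence, MaxVectorSize = 256 / 32 = 8. A known trip
// count of 4 (a power of two smaller than 8) clamps the result to 4 and
// returns early; otherwise, when the target asks to maximize bandwidth, the
// candidate VFs 16, 32, ... up to WidestRegister / SmallestType are also
// evaluated against register pressure before settling on MaxVF.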
4852 
4853 VectorizationFactor
4854 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
4855  float Cost = expectedCost(1).first;
4856  const float ScalarCost = Cost;
4857  unsigned Width = 1;
4858  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
4859 
4860  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
4861  if (ForceVectorization && MaxVF > 1) {
4862  // Ignore scalar width, because the user explicitly wants vectorization.
4863  // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4864  // evaluation.
4865  Cost = std::numeric_limits<float>::max();
4866  }
4867 
4868  for (unsigned i = 2; i <= MaxVF; i *= 2) {
4869  // Notice that the vector loop needs to be executed fewer times, so
4870  // we need to divide the cost of the vector loop by the width of
4871  // the vector elements.
4872  VectorizationCostTy C = expectedCost(i);
4873  float VectorCost = C.first / (float)i;
4874  LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
4875  << " costs: " << (int)VectorCost << ".\n");
4876  if (!C.second && !ForceVectorization) {
4877  LLVM_DEBUG(
4878  dbgs() << "LV: Not considering vector loop of width " << i
4879  << " because it will not generate any vector instructions.\n");
4880  continue;
4881  }
4882  if (VectorCost < Cost) {
4883  Cost = VectorCost;
4884  Width = i;
4885  }
4886  }
4887 
4888  if (!EnableCondStoresVectorization && NumPredStores) {
4889  ORE->emit(createMissedAnalysis("ConditionalStore")
4890  << "store that is conditionally executed prevents vectorization");
4891  LLVM_DEBUG(
4892  dbgs() << "LV: No vectorization. There are conditional stores.\n");
4893  Width = 1;
4894  Cost = ScalarCost;
4895  }
4896 
4897  LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
4898  << "LV: Vectorization seems to be not beneficial, "
4899  << "but was forced by a user.\n");
4900  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
4901  VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
4902  return Factor;
4903 }
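// Illustrative arithmetic (editorial addition, not part of the original
// source): if the scalar iteration costs 8 and the VF=4 body costs 20, the
// normalized vector cost is 20 / 4 = 5 per original iteration, so VF=4 beats
// the scalar loop and wider factors are compared the same way; when
// vectorization is forced, the scalar baseline is disabled by initializing
// Cost to the maximum float value before the loop over candidate widths.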
4904 
4905 std::pair<unsigned, unsigned>
4906 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4907  unsigned MinWidth = -1U;
4908  unsigned MaxWidth = 8;
4909  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
4910 
4911  // For each block.
4912  for (BasicBlock *BB : TheLoop->blocks()) {
4913  // For each instruction in the loop.
4914  for (Instruction &I : BB->instructionsWithoutDebug()) {
4915  Type *T = I.getType();
4916 
4917  // Skip ignored values.
4918  if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
4919  continue;
4920 
4921  // Only examine Loads, Stores and PHINodes.
4922  if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4923  continue;
4924 
4925  // Examine PHI nodes that are reduction variables. Update the type to
4926  // account for the recurrence type.
4927  if (auto *PN = dyn_cast<PHINode>(&I)) {
4928  if (!Legal->isReductionVariable(PN))
4929  continue;
4930  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
4931  T = RdxDesc.getRecurrenceType();
4932  }
4933 
4934  // Examine the stored values.
4935  if (auto *ST = dyn_cast<StoreInst>(&I))
4936  T = ST->getValueOperand()->getType();
4937 
4938  // Ignore loaded pointer types and stored pointer types that are not
4939  // vectorizable.
4940  //
4941  // FIXME: The check here attempts to predict whether a load or store will
4942  // be vectorized. We only know this for certain after a VF has
4943  // been selected. Here, we assume that if an access can be
4944  // vectorized, it will be. We should also look at extending this
4945  // optimization to non-pointer types.
4946  //
4947  if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
4948