LLVM  8.0.0svn
LoopVectorize.cpp
1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
11 // and generates target-independent LLVM-IR.
12 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
13 // of instructions in order to estimate the profitability of vectorization.
14 //
15 // The loop vectorizer combines consecutive loop iterations into a single
16 // 'wide' iteration. After this transformation the index is incremented
17 // by the SIMD vector width, and not by one.
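//
// For example, with a vectorization factor of 4, a scalar loop such as
//   for (i = 0; i < n; ++i) A[i] = B[i] + K;
// is rewritten so that each vector iteration loads B[i..i+3], adds K to all
// four elements with a single vector add, stores A[i..i+3], and advances the
// index by 4 (the arrays and constant here are purely illustrative).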
18 //
19 // This pass has four parts:
20 // 1. The main loop pass that drives the different parts.
21 // 2. LoopVectorizationLegality - A unit that checks for the legality
22 // of the vectorization.
23 // 3. InnerLoopVectorizer - A unit that performs the actual
24 // widening of instructions.
25 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
26 // of vectorization. It decides on the optimal vector width, which
27 // can be one, if vectorization is not profitable.
28 //
29 // There is a development effort going on to migrate the loop vectorizer to the
30 // VPlan infrastructure and to introduce outer loop vectorization support (see
31 // docs/Proposal/VectorizationPlan.rst and
32 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
33 // purpose, we temporarily introduced the VPlan-native vectorization path: an
34 // alternative vectorization path that is natively implemented on top of the
35 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
36 //
37 //===----------------------------------------------------------------------===//
38 //
39 // The reduction-variable vectorization is based on the paper:
40 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
41 //
42 // Variable uniformity checks are inspired by:
43 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
44 //
45 // The interleaved access vectorization is based on the paper:
46 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
47 // Data for SIMD
48 //
49 // Other ideas/concepts are from:
50 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
51 //
52 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
53 // Vectorizing Compilers.
54 //
55 //===----------------------------------------------------------------------===//
56 
59 #include "VPRecipeBuilder.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanHCFGTransforms.h"
62 #include "llvm/ADT/APInt.h"
63 #include "llvm/ADT/ArrayRef.h"
64 #include "llvm/ADT/DenseMap.h"
65 #include "llvm/ADT/DenseMapInfo.h"
66 #include "llvm/ADT/Hashing.h"
67 #include "llvm/ADT/MapVector.h"
68 #include "llvm/ADT/None.h"
69 #include "llvm/ADT/Optional.h"
70 #include "llvm/ADT/STLExtras.h"
71 #include "llvm/ADT/SetVector.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallVector.h"
74 #include "llvm/ADT/Statistic.h"
75 #include "llvm/ADT/StringRef.h"
76 #include "llvm/ADT/Twine.h"
81 #include "llvm/Analysis/CFG.h"
87 #include "llvm/Analysis/LoopInfo.h"
96 #include "llvm/IR/Attributes.h"
97 #include "llvm/IR/BasicBlock.h"
98 #include "llvm/IR/CFG.h"
99 #include "llvm/IR/Constant.h"
100 #include "llvm/IR/Constants.h"
101 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugLoc.h"
104 #include "llvm/IR/DerivedTypes.h"
105 #include "llvm/IR/DiagnosticInfo.h"
106 #include "llvm/IR/Dominators.h"
107 #include "llvm/IR/Function.h"
108 #include "llvm/IR/IRBuilder.h"
109 #include "llvm/IR/InstrTypes.h"
110 #include "llvm/IR/Instruction.h"
111 #include "llvm/IR/Instructions.h"
112 #include "llvm/IR/IntrinsicInst.h"
113 #include "llvm/IR/Intrinsics.h"
114 #include "llvm/IR/LLVMContext.h"
115 #include "llvm/IR/Metadata.h"
116 #include "llvm/IR/Module.h"
117 #include "llvm/IR/Operator.h"
118 #include "llvm/IR/Type.h"
119 #include "llvm/IR/Use.h"
120 #include "llvm/IR/User.h"
121 #include "llvm/IR/Value.h"
122 #include "llvm/IR/ValueHandle.h"
123 #include "llvm/IR/Verifier.h"
124 #include "llvm/Pass.h"
125 #include "llvm/Support/Casting.h"
127 #include "llvm/Support/Compiler.h"
128 #include "llvm/Support/Debug.h"
130 #include "llvm/Support/MathExtras.h"
137 #include <algorithm>
138 #include <cassert>
139 #include <cstdint>
140 #include <cstdlib>
141 #include <functional>
142 #include <iterator>
143 #include <limits>
144 #include <memory>
145 #include <string>
146 #include <tuple>
147 #include <utility>
148 #include <vector>
149 
150 using namespace llvm;
151 
152 #define LV_NAME "loop-vectorize"
153 #define DEBUG_TYPE LV_NAME
154 
155 STATISTIC(LoopsVectorized, "Number of loops vectorized");
156 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
157 
158 /// Loops with a known constant trip count below this number are vectorized only
159 /// if no scalar iteration overheads are incurred.
160 static cl::opt<unsigned> TinyTripCountVectorThreshold(
161  "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
162  cl::desc("Loops with a constant trip count that is smaller than this "
163  "value are vectorized only if no scalar iteration overheads "
164  "are incurred."));
165 
166 static cl::opt<bool> MaximizeBandwidth(
167  "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
168  cl::desc("Maximize bandwidth when selecting vectorization factor which "
169  "will be determined by the smallest type in loop."));
170 
171 static cl::opt<bool> EnableInterleavedMemAccesses(
172  "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
173  cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
174 
175 /// We don't interleave loops with a known constant trip count below this
176 /// number.
177 static const unsigned TinyTripCountInterleaveThreshold = 128;
178 
179 static cl::opt<unsigned> ForceTargetNumScalarRegs(
180  "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
181  cl::desc("A flag that overrides the target's number of scalar registers."));
182 
183 static cl::opt<unsigned> ForceTargetNumVectorRegs(
184  "force-target-num-vector-regs", cl::init(0), cl::Hidden,
185  cl::desc("A flag that overrides the target's number of vector registers."));
186 
187 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
188  "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
189  cl::desc("A flag that overrides the target's max interleave factor for "
190  "scalar loops."));
191 
192 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
193  "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
194  cl::desc("A flag that overrides the target's max interleave factor for "
195  "vectorized loops."));
196 
197 static cl::opt<unsigned> ForceTargetInstructionCost(
198  "force-target-instruction-cost", cl::init(0), cl::Hidden,
199  cl::desc("A flag that overrides the target's expected cost for "
200  "an instruction to a single constant value. Mostly "
201  "useful for getting consistent testing."));
202 
203 static cl::opt<unsigned> SmallLoopCost(
204  "small-loop-cost", cl::init(20), cl::Hidden,
205  cl::desc(
206  "The cost of a loop that is considered 'small' by the interleaver."));
207 
208 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
209  "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
210  cl::desc("Enable the use of the block frequency analysis to access PGO "
211  "heuristics minimizing code growth in cold regions and being more "
212  "aggressive in hot regions."));
213 
214 // Runtime interleave loops for load/store throughput.
215 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
216  "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
217  cl::desc(
218  "Enable runtime interleaving until load/store ports are saturated"));
219 
220 /// The number of stores in a loop that are allowed to need predication.
221 static cl::opt<unsigned> NumberOfStoresToPredicate(
222  "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
223  cl::desc("Max number of stores to be predicated behind an if."));
224 
225 static cl::opt<bool> EnableIndVarRegisterHeur(
226  "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
227  cl::desc("Count the induction variable only once when interleaving"));
228 
229 static cl::opt<bool> EnableCondStoresVectorization(
230  "enable-cond-stores-vec", cl::init(true), cl::Hidden,
231  cl::desc("Enable if predication of stores during vectorization."));
232 
233 static cl::opt<unsigned> MaxNestedScalarReductionIC(
234  "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
235  cl::desc("The maximum interleave count to use when interleaving a scalar "
236  "reduction in a nested loop."));
237 
238 cl::opt<bool> EnableVPlanNativePath(
239  "enable-vplan-native-path", cl::init(false), cl::Hidden,
240  cl::desc("Enable VPlan-native vectorization path with "
241  "support for outer loop vectorization."));
242 
243 // This flag enables the stress testing of the VPlan H-CFG construction in the
244 // VPlan-native vectorization path. It must be used in conjunction with
245 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
246 // verification of the H-CFGs built.
247 static cl::opt<bool> VPlanBuildStressTest(
248  "vplan-build-stress-test", cl::init(false), cl::Hidden,
249  cl::desc(
250  "Build VPlan for every supported loop nest in the function and bail "
251  "out right after the build (stress test the VPlan H-CFG construction "
252  "in the VPlan-native vectorization path)."));
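//
// As a rough illustration (exact pass and flag spellings may vary between
// LLVM versions), this stress-test mode can be exercised with something like:
//   opt -loop-vectorize -enable-vplan-native-path -vplan-build-stress-test \
//       -debug-only=loop-vectorize -S input.ll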
253 
254 /// A helper function for converting Scalar types to vector types.
255 /// If the incoming type is void, we return void. If the VF is 1, we return
256 /// the scalar type.
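/// For example, ToVectorTy(i32, 4) returns <4 x i32>, while ToVectorTy(i32, 1)
/// and ToVectorTy(void, 4) return the scalar type unchanged.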
257 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
258  if (Scalar->isVoidTy() || VF == 1)
259  return Scalar;
260  return VectorType::get(Scalar, VF);
261 }
262 
263 /// A helper function that returns the type of loaded or stored value.
264 static Type *getMemInstValueType(Value *I) {
265  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
266  "Expected Load or Store instruction");
267  if (auto *LI = dyn_cast<LoadInst>(I))
268  return LI->getType();
269  return cast<StoreInst>(I)->getValueOperand()->getType();
270 }
271 
272 /// A helper function that returns true if the given type is irregular. The
273 /// type is irregular if its allocated size doesn't equal the store size of an
274 /// element of the corresponding vector type at the given vectorization factor.
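/// For example, on a target whose data layout stores x86_fp80 in 80 bits but
/// allocates it in 96 or 128 bits, arrays of that type contain padding between
/// elements, so the type is irregular and cannot safely be accessed with a
/// plain wide vector load or store.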
275 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
276  // Determine if an array of VF elements of type Ty is "bitcast compatible"
277  // with a <VF x Ty> vector.
278  if (VF > 1) {
279  auto *VectorTy = VectorType::get(Ty, VF);
280  return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
281  }
282 
283  // If the vectorization factor is one, we just check if an array of type Ty
284  // requires padding between elements.
285  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
286 }
287 
288 /// A helper function that returns the reciprocal of the block probability of
289 /// predicated blocks. If we return X, we are assuming the predicated block
290 /// will execute once for every X iterations of the loop header.
291 ///
292 /// TODO: We should use actual block probability here, if available. Currently,
293 /// we always assume predicated blocks have a 50% chance of executing.
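/// For example, the current return value of 2 means the cost of a predicated
/// block is halved, since the block is assumed to execute on roughly every
/// other iteration of the loop header.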
294 static unsigned getReciprocalPredBlockProb() { return 2; }
295 
296 /// A helper function that adds a 'fast' flag to floating-point operations.
297 static Value *addFastMathFlag(Value *V) {
298  if (isa<FPMathOperator>(V)) {
299  FastMathFlags Flags;
300  Flags.setFast();
301  cast<Instruction>(V)->setFastMathFlags(Flags);
302  }
303  return V;
304 }
305 
306 /// A helper function that returns an integer or floating-point constant with
307 /// value C.
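/// For example, getSignedIntOrFpConstant(i32, -1) produces the i32 constant
/// -1, and getSignedIntOrFpConstant(double, 4) produces the constant 4.0.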
308 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
309  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
310  : ConstantFP::get(Ty, C);
311 }
312 
313 namespace llvm {
314 
315 /// InnerLoopVectorizer vectorizes loops which contain only one basic
316 /// block to a specified vectorization factor (VF).
317 /// This class performs the widening of scalars into vectors, or multiple
318 /// scalars. This class also implements the following features:
319 /// * It inserts an epilogue loop for handling loops that don't have iteration
320 /// counts that are known to be a multiple of the vectorization factor.
321 /// * It handles the code generation for reduction variables.
322 /// * Scalarization (implementation using scalars) of un-vectorizable
323 /// instructions.
324 /// InnerLoopVectorizer does not perform any vectorization-legality
325 /// checks, and relies on the caller to check for the different legality
326 /// aspects. The InnerLoopVectorizer relies on the
327 /// LoopVectorizationLegality class to provide information about the induction
328 /// and reduction variables that were found to a given vectorization factor.
329 class InnerLoopVectorizer {
330 public:
333  const TargetLibraryInfo *TLI,
335  OptimizationRemarkEmitter *ORE, unsigned VecWidth,
336  unsigned UnrollFactor, LoopVectorizationLegality *LVL,
338  : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
339  AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
340  Builder(PSE.getSE()->getContext()),
341  VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
342  virtual ~InnerLoopVectorizer() = default;
343 
344  /// Create a new empty loop. Unlink the old loop and connect the new one.
345  /// Return the pre-header block of the new loop.
347 
348  /// Widen a single instruction within the innermost loop.
350 
351  /// Fix the vectorized code, taking care of header PHIs, live-outs, and more.
352  void fixVectorizedLoop();
353 
354  // Return true if any runtime check is added.
356 
357  /// A type for vectorized values in the new loop. Each value from the
358  /// original loop, when vectorized, is represented by UF vector values in the
359  /// new unrolled loop, where UF is the unroll factor.
361 
362  /// Vectorize a single PHINode in a block. This method handles the induction
363  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
364  /// arbitrary length vectors.
365  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
366 
367  /// A helper function to scalarize a single Instruction in the innermost loop.
368  /// Generates a sequence of scalar instances for each lane between \p MinLane
369  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
370  /// inclusive.
371  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
372  bool IfPredicateInstr);
373 
374  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
375  /// is provided, the integer induction variable will first be truncated to
376  /// the corresponding type.
377  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
378 
379  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
380  /// vector or scalar value on-demand if one is not yet available. When
381  /// vectorizing a loop, we visit the definition of an instruction before its
382  /// uses. When visiting the definition, we either vectorize or scalarize the
383  /// instruction, creating an entry for it in the corresponding map. (In some
384  /// cases, such as induction variables, we will create both vector and scalar
385  /// entries.) Then, as we encounter uses of the definition, we derive values
386  /// for each scalar or vector use unless such a value is already available.
387  /// For example, if we scalarize a definition and one of its uses is vector,
388  /// we build the required vector on-demand with an insertelement sequence
389  /// when visiting the use. Otherwise, if the use is scalar, we can use the
390  /// existing scalar definition.
391  ///
392  /// Return a value in the new loop corresponding to \p V from the original
393  /// loop at unroll index \p Part. If the value has already been vectorized,
394  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
395  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
396  /// a new vector value on-demand by inserting the scalar values into a vector
397  /// with an insertelement sequence. If the value has been neither vectorized
398  /// nor scalarized, it must be loop invariant, so we simply broadcast the
399  /// value into a vector.
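  /// For example, if a definition was scalarized for VF = 4 and one of its
  /// uses needs a vector, the packing is emitted as an insertelement chain
  /// (value names below are purely illustrative):
  ///   %v.0 = insertelement <4 x i32> undef, i32 %s0, i32 0
  ///   %v.1 = insertelement <4 x i32> %v.0, i32 %s1, i32 1
  ///   ... and so on for the remaining lanes.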
400  Value *getOrCreateVectorValue(Value *V, unsigned Part);
401 
402  /// Return a value in the new loop corresponding to \p V from the original
403  /// loop at unroll and vector indices \p Instance. If the value has been
404  /// vectorized but not scalarized, the necessary extractelement instruction
405  /// will be generated.
406  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
407 
408  /// Construct the vector value of a scalarized value \p V one lane at a time.
409  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
410 
411  /// Try to vectorize the interleaved access group that \p Instr belongs to.
413 
414  /// Vectorize Load and Store instructions, optionally masking the vector
415  /// operations if \p BlockInMask is non-null.
417  VectorParts *BlockInMask = nullptr);
418 
419  /// Set the debug location in the builder using the debug location in
420  /// the instruction.
421  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
422 
423  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
424  void fixNonInductionPHIs(void);
425 
426 protected:
428 
429  /// A small list of PHINodes.
431 
432  /// A type for scalarized values in the new loop. Each value from the
433  /// original loop, when scalarized, is represented by UF x VF scalar values
434  /// in the new unrolled loop, where UF is the unroll factor and VF is the
435  /// vectorization factor.
437 
438  /// Set up the values of the IVs correctly when exiting the vector loop.
439  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
440  Value *CountRoundDown, Value *EndValue,
441  BasicBlock *MiddleBlock);
442 
443  /// Create a new induction variable inside L.
444  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
445  Value *Step, Instruction *DL);
446 
447  /// Handle all cross-iteration phis in the header.
448  void fixCrossIterationPHIs();
449 
450  /// Fix a first-order recurrence. This is the second phase of vectorizing
451  /// this phi node.
452  void fixFirstOrderRecurrence(PHINode *Phi);
453 
454  /// Fix a reduction cross-iteration phi. This is the second phase of
455  /// vectorizing this phi node.
456  void fixReduction(PHINode *Phi);
457 
458  /// The Loop exit block may have single value PHI nodes with some
459  /// incoming value. While vectorizing we only handled real values
460  /// that were defined inside the loop and we should have one value for
461  /// each predecessor of its parent basic block. See PR14725.
462  void fixLCSSAPHIs();
463 
464  /// Iteratively sink the scalarized operands of a predicated instruction into
465  /// the block that was created for it.
466  void sinkScalarOperands(Instruction *PredInst);
467 
468  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
469  /// represented as.
471 
472  /// Insert the new loop to the loop hierarchy and pass manager
473  /// and update the analysis passes.
474  void updateAnalysis();
475 
476  /// Create a broadcast instruction. This method generates a broadcast
477  /// instruction (shuffle) for loop invariant values and for the induction
478  /// value. If this is the induction variable then we extend it to N, N+1, ...
479  /// this is needed because each iteration in the loop corresponds to a SIMD
480  /// element.
481  virtual Value *getBroadcastInstrs(Value *V);
482 
483  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
484  /// to each vector element of Val. The sequence starts at StartIndex.
485  /// \p Opcode is relevant for FP induction variable.
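  /// For example, with VF = 4, StartIdx = 0 and an integer step %s, a splat of
  /// %iv becomes roughly <%iv, %iv + %s, %iv + 2 * %s, %iv + 3 * %s>.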
486  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
487  Instruction::BinaryOps Opcode =
488  Instruction::BinaryOpsEnd);
489 
490  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
491  /// variable on which to base the steps, \p Step is the size of the step, and
492  /// \p EntryVal is the value from the original loop that maps to the steps.
493  /// Note that \p EntryVal doesn't have to be an induction variable - it
494  /// can also be a truncate instruction.
495  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
496  const InductionDescriptor &ID);
497 
498  /// Create a vector induction phi node based on an existing scalar one. \p
499  /// EntryVal is the value from the original loop that maps to the vector phi
500  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
501  /// truncate instruction, instead of widening the original IV, we widen a
502  /// version of the IV truncated to \p EntryVal's type.
503  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
504  Value *Step, Instruction *EntryVal);
505 
506  /// Returns true if an instruction \p I should be scalarized instead of
507  /// vectorized for the chosen vectorization factor.
509 
510  /// Returns true if we should generate a scalar version of \p IV.
511  bool needsScalarInduction(Instruction *IV) const;
512 
513  /// If there is a cast involved in the induction variable \p ID, which should
514  /// be ignored in the vectorized loop body, this function records the
515  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
516  /// cast. We had already proved that the casted Phi is equal to the uncasted
517  /// Phi in the vectorized loop (under a runtime guard), and therefore
518  /// there is no need to vectorize the cast - the same value can be used in the
519  /// vector loop for both the Phi and the cast.
520  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
521  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
522  ///
523  /// \p EntryVal is the value from the original loop that maps to the vector
524  /// phi node and is used to distinguish what is the IV currently being
525  /// processed - original one (if \p EntryVal is a phi corresponding to the
526  /// original IV) or the "newly-created" one based on the proof mentioned above
527  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
528  /// latter case \p EntryVal is a TruncInst and we must not record anything for
529  /// that IV, but it's error-prone to expect callers of this routine to care
530  /// about that, hence this explicit parameter.
532  const Instruction *EntryVal,
533  Value *VectorLoopValue,
534  unsigned Part,
535  unsigned Lane = UINT_MAX);
536 
537  /// Generate a shuffle sequence that will reverse the vector Vec.
538  virtual Value *reverseVector(Value *Vec);
539 
540  /// Returns (and creates if needed) the original loop trip count.
541  Value *getOrCreateTripCount(Loop *NewLoop);
542 
543  /// Returns (and creates if needed) the trip count of the widened loop.
545 
546  /// Returns a bitcasted value to the requested vector type.
547  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
549  const DataLayout &DL);
550 
551  /// Emit a bypass check to see if the vector trip count is zero, including if
552  /// it overflows.
554 
555  /// Emit a bypass check to see if all of the SCEV assumptions we've
556  /// had to make are correct.
557  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
558 
559  /// Emit bypass checks to check any memory assumptions we may have made.
560  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
561 
562  /// Compute the transformed value of Index at offset StartValue using step
563  /// StepValue.
564  /// For integer induction, returns StartValue + Index * StepValue.
565  /// For pointer induction, returns StartValue[Index * StepValue].
566  /// FIXME: The newly created binary instructions should contain nsw/nuw
567  /// flags, which can be found from the original scalar operations.
569  const DataLayout &DL,
570  const InductionDescriptor &ID) const;
571 
572  /// Add additional metadata to \p To that was not present on \p Orig.
573  ///
574  /// Currently this is used to add the noalias annotations based on the
575  /// inserted memchecks. Use this for instructions that are *cloned* into the
576  /// vector loop.
577  void addNewMetadata(Instruction *To, const Instruction *Orig);
578 
579  /// Add metadata from one instruction to another.
580  ///
581  /// This includes both the original MDs from \p From and additional ones (\see
582  /// addNewMetadata). Use this for *newly created* instructions in the vector
583  /// loop.
585 
586  /// Similar to the previous function but it adds the metadata to a
587  /// vector of instructions.
589 
590  /// The original loop.
592 
593  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
594  /// dynamic knowledge to simplify SCEV expressions and converts them to a
595  /// more usable form.
597 
598  /// Loop Info.
600 
601  /// Dominator Tree.
603 
604  /// Alias Analysis.
606 
607  /// Target Library Info.
609 
610  /// Target Transform Info.
612 
613  /// Assumption Cache.
615 
616  /// Interface to emit optimization remarks.
618 
619  /// LoopVersioning. It's only set up (non-null) if memchecks were
620  /// used.
621  ///
622  /// This is currently only used to add no-alias metadata based on the
623  /// memchecks. The actual versioning is performed manually.
624  std::unique_ptr<LoopVersioning> LVer;
625 
626  /// The vectorization SIMD factor to use. Each vector will have this many
627  /// vector elements.
628  unsigned VF;
629 
630  /// The vectorization unroll factor to use. Each scalar is vectorized to this
631  /// many different vector instructions.
632  unsigned UF;
633 
634  /// The builder that we use
636 
637  // --- Vectorization state ---
638 
639  /// The vector-loop preheader.
641 
642  /// The scalar-loop preheader.
644 
645  /// Middle Block between the vector and the scalar.
647 
648  /// The ExitBlock of the scalar loop.
650 
651  /// The vector loop body.
653 
654  /// The scalar loop body.
656 
657  /// A list of all bypass blocks. The first block is the entry of the loop.
659 
660  /// The new Induction variable which was added to the new block.
661  PHINode *Induction = nullptr;
662 
663  /// The induction variable of the old basic block.
664  PHINode *OldInduction = nullptr;
665 
666  /// Maps values from the original loop to their corresponding values in the
667  /// vectorized loop. A key value can map to either vector values, scalar
668  /// values or both kinds of values, depending on whether the key was
669  /// vectorized and scalarized.
671 
672  /// Store instructions that were predicated.
674 
675  /// Trip count of the original loop.
676  Value *TripCount = nullptr;
677 
678  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
679  Value *VectorTripCount = nullptr;
680 
681  /// The legality analysis.
683 
684  /// The profitability analysis.
686 
687  // Record whether runtime checks are added.
688  bool AddedSafetyChecks = false;
689 
690  // Holds the end values for each induction variable. We save the end values
691  // so we can later fix-up the external users of the induction variables.
693 
694  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
695  // fixed up at the end of vector code generation.
697 };
698 
699 class InnerLoopUnroller : public InnerLoopVectorizer {
700 public:
703  const TargetLibraryInfo *TLI,
705  OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
708  : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
709  UnrollFactor, LVL, CM) {}
710 
711 private:
712  Value *getBroadcastInstrs(Value *V) override;
713  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
714  Instruction::BinaryOps Opcode =
715  Instruction::BinaryOpsEnd) override;
716  Value *reverseVector(Value *Vec) override;
717 };
718 
719 } // end namespace llvm
720 
721 /// Look for a meaningful debug location on the instruction or its
722 /// operands.
724  if (!I)
725  return I;
726 
727  DebugLoc Empty;
728  if (I->getDebugLoc() != Empty)
729  return I;
730 
731  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
732  if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
733  if (OpInst->getDebugLoc() != Empty)
734  return OpInst;
735  }
736 
737  return I;
738 }
739 
741  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
742  const DILocation *DIL = Inst->getDebugLoc();
743  if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
744  !isa<DbgInfoIntrinsic>(Inst))
746  else
748  } else
750 }
751 
752 #ifndef NDEBUG
753 /// \return string containing a file name and a line # for the given loop.
754 static std::string getDebugLocString(const Loop *L) {
755  std::string Result;
756  if (L) {
757  raw_string_ostream OS(Result);
758  if (const DebugLoc LoopDbgLoc = L->getStartLoc())
759  LoopDbgLoc.print(OS);
760  else
761  // Just print the module name.
762  OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
763  OS.flush();
764  }
765  return Result;
766 }
767 #endif
768 
770  const Instruction *Orig) {
771  // If the loop was versioned with memchecks, add the corresponding no-alias
772  // metadata.
773  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
774  LVer->annotateInstWithNoAlias(To, Orig);
775 }
776 
778  Instruction *From) {
779  propagateMetadata(To, From);
780  addNewMetadata(To, From);
781 }
782 
784  Instruction *From) {
785  for (Value *V : To) {
786  if (Instruction *I = dyn_cast<Instruction>(V))
787  addMetadata(I, From);
788  }
789 }
790 
791 static void emitMissedWarning(Function *F, Loop *L,
792  const LoopVectorizeHints &LH,
793  OptimizationRemarkEmitter *ORE) {
794  LH.emitRemarkWithHints();
795 
796  if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
797  if (LH.getWidth() != 1)
798  ORE->emit(DiagnosticInfoOptimizationFailure(
799  DEBUG_TYPE, "FailedRequestedVectorization",
800  L->getStartLoc(), L->getHeader())
801  << "loop not vectorized: "
802  << "failed explicitly specified loop vectorization");
803  else if (LH.getInterleave() != 1)
804  ORE->emit(DiagnosticInfoOptimizationFailure(
805  DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(),
806  L->getHeader())
807  << "loop not interleaved: "
808  << "failed explicitly specified loop interleaving");
809  }
810 }
811 
812 namespace llvm {
813 
814 /// LoopVectorizationCostModel - estimates the expected speedups due to
815 /// vectorization.
816 /// In many cases vectorization is not profitable. This can happen because of
817 /// a number of reasons. In this class we mainly attempt to predict the
818 /// expected speedup/slowdowns due to the supported instruction set. We use the
819 /// TargetTransformInfo to query the different backends for the cost of
820 /// different operations.
821 class LoopVectorizationCostModel {
822 public:
825  const TargetTransformInfo &TTI,
826  const TargetLibraryInfo *TLI, DemandedBits *DB,
829  const LoopVectorizeHints *Hints,
831  : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
832  AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
833 
834  /// \return An upper bound for the vectorization factor, or None if
835  /// vectorization should be avoided up front.
836  Optional<unsigned> computeMaxVF(bool OptForSize);
837 
838  /// \return The most profitable vectorization factor and the cost of that VF.
839  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
840  /// then this vectorization factor will be selected if vectorization is
841  /// possible.
842  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
843 
844  /// Setup cost-based decisions for user vectorization factor.
845  void selectUserVectorizationFactor(unsigned UserVF) {
846  collectUniformsAndScalars(UserVF);
847  collectInstsToScalarize(UserVF);
848  }
849 
850  /// \return The size (in bits) of the smallest and widest types in the code
851  /// that needs to be vectorized. We ignore values that remain scalar such as
852  /// 64 bit loop indices.
853  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
854 
855  /// \return The desired interleave count.
856  /// If interleave count has been specified by metadata it will be returned.
857  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
858  /// are the selected vectorization factor and the cost of the selected VF.
859  unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
860  unsigned LoopCost);
861 
862  /// A memory access instruction may be vectorized in more than one way;
863  /// the form it takes after vectorization depends on cost.
864  /// This function makes cost-based decisions for Load/Store instructions
865  /// and collects them in a map. This decision map is used for building
866  /// the lists of loop-uniform and loop-scalar instructions.
867  /// The calculated cost is saved with the widening decision in order to
868  /// avoid redundant calculations.
869  void setCostBasedWideningDecision(unsigned VF);
870 
871  /// A struct that represents some properties of the register usage
872  /// of a loop.
873  struct RegisterUsage {
874  /// Holds the number of loop invariant values that are used in the loop.
876 
877  /// Holds the maximum number of concurrent live intervals in the loop.
878  unsigned MaxLocalUsers;
879  };
880 
881  /// \return Returns information about the register usages of the loop for the
882  /// given vectorization factors.
883  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
884 
885  /// Collect values we want to ignore in the cost model.
886  void collectValuesToIgnore();
887 
888  /// \returns The smallest bitwidth each instruction can be represented with.
889  /// The vector equivalents of these instructions should be truncated to this
890  /// type.
892  return MinBWs;
893  }
894 
895  /// \returns True if it is more profitable to scalarize instruction \p I for
896  /// vectorization factor \p VF.
897  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
898  assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
899 
900  // Cost model is not run in the VPlan-native path - return conservative
901  // result until this changes.
902  if (EnableVPlanNativePath)
903  return false;
904 
905  auto Scalars = InstsToScalarize.find(VF);
906  assert(Scalars != InstsToScalarize.end() &&
907  "VF not yet analyzed for scalarization profitability");
908  return Scalars->second.find(I) != Scalars->second.end();
909  }
910 
911  /// Returns true if \p I is known to be uniform after vectorization.
912  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
913  if (VF == 1)
914  return true;
915 
916  // Cost model is not run in the VPlan-native path - return conservative
917  // result until this changes.
918  if (EnableVPlanNativePath)
919  return false;
920 
921  auto UniformsPerVF = Uniforms.find(VF);
922  assert(UniformsPerVF != Uniforms.end() &&
923  "VF not yet analyzed for uniformity");
924  return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
925  }
926 
927  /// Returns true if \p I is known to be scalar after vectorization.
928  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
929  if (VF == 1)
930  return true;
931 
932  // Cost model is not run in the VPlan-native path - return conservative
933  // result until this changes.
934  if (EnableVPlanNativePath)
935  return false;
936 
937  auto ScalarsPerVF = Scalars.find(VF);
938  assert(ScalarsPerVF != Scalars.end() &&
939  "Scalar values are not calculated for VF");
940  return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
941  }
942 
943  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
944  /// for vectorization factor \p VF.
945  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
946  return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
947  !isProfitableToScalarize(I, VF) &&
948  !isScalarAfterVectorization(I, VF);
949  }
950 
951  /// Decision that was taken during cost calculation for memory instruction.
952  enum InstWidening {
953  CM_Unknown,
954  CM_Widen, // For consecutive accesses with stride +1.
955  CM_Widen_Reverse, // For consecutive accesses with stride -1.
956  CM_Interleave,
957  CM_GatherScatter,
958  CM_Scalarize
959  };
960 
961  /// Save vectorization decision \p W and \p Cost taken by the cost model for
962  /// instruction \p I and vector width \p VF.
963  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
964  unsigned Cost) {
965  assert(VF >= 2 && "Expected VF >=2");
966  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
967  }
968 
969  /// Save vectorization decision \p W and \p Cost taken by the cost model for
970  /// interleaving group \p Grp and vector width \p VF.
971  void setWideningDecision(const InterleaveGroup *Grp, unsigned VF,
972  InstWidening W, unsigned Cost) {
973  assert(VF >= 2 && "Expected VF >=2");
974  /// Broadcast this decision to all instructions inside the group.
975  /// But the cost will be assigned to one instruction only.
976  for (unsigned i = 0; i < Grp->getFactor(); ++i) {
977  if (auto *I = Grp->getMember(i)) {
978  if (Grp->getInsertPos() == I)
979  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
980  else
981  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
982  }
983  }
984  }
985 
986  /// Return the cost model decision for the given instruction \p I and vector
987  /// width \p VF. Return CM_Unknown if this instruction did not pass
988  /// through the cost modeling.
989  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
990  assert(VF >= 2 && "Expected VF >=2");
991 
992  // Cost model is not run in the VPlan-native path - return conservative
993  // result until this changes.
994  if (EnableVPlanNativePath)
995  return CM_GatherScatter;
996 
997  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
998  auto Itr = WideningDecisions.find(InstOnVF);
999  if (Itr == WideningDecisions.end())
1000  return CM_Unknown;
1001  return Itr->second.first;
1002  }
1003 
1004  /// Return the vectorization cost for the given instruction \p I and vector
1005  /// width \p VF.
1006  unsigned getWideningCost(Instruction *I, unsigned VF) {
1007  assert(VF >= 2 && "Expected VF >=2");
1008  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1009  assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1010  "The cost is not calculated");
1011  return WideningDecisions[InstOnVF].second;
1012  }
1013 
1014  /// Return True if instruction \p I is an optimizable truncate whose operand
1015  /// is an induction variable. Such a truncate will be removed by adding a new
1016  /// induction variable with the destination type.
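  /// For example, a truncate of an i64 induction variable to i32 can be
  /// replaced by a new i32 induction variable with the same start and step,
  /// so the truncate itself disappears from the vectorized loop.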
1017  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1018  // If the instruction is not a truncate, return false.
1019  auto *Trunc = dyn_cast<TruncInst>(I);
1020  if (!Trunc)
1021  return false;
1022 
1023  // Get the source and destination types of the truncate.
1024  Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1025  Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1026 
1027  // If the truncate is free for the given types, return false. Replacing a
1028  // free truncate with an induction variable would add an induction variable
1029  // update instruction to each iteration of the loop. We exclude from this
1030  // check the primary induction variable since it will need an update
1031  // instruction regardless.
1032  Value *Op = Trunc->getOperand(0);
1033  if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1034  return false;
1035 
1036  // If the truncated value is not an induction variable, return false.
1037  return Legal->isInductionPhi(Op);
1038  }
1039 
1040  /// Collects the instructions to scalarize for each predicated instruction in
1041  /// the loop.
1042  void collectInstsToScalarize(unsigned VF);
1043 
1044  /// Collect Uniform and Scalar values for the given \p VF.
1045  /// The sets depend on CM decision for Load/Store instructions
1046  /// that may be vectorized as interleave, gather-scatter or scalarized.
1047  void collectUniformsAndScalars(unsigned VF) {
1048  // Do the analysis once.
1049  if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1050  return;
1051  setCostBasedWideningDecision(VF);
1052  collectLoopUniforms(VF);
1053  collectLoopScalars(VF);
1054  }
1055 
1056  /// Returns true if the target machine supports masked store operation
1057  /// for the given \p DataType and kind of access to \p Ptr.
1058  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
1059  return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
1060  }
1061 
1062  /// Returns true if the target machine supports masked load operation
1063  /// for the given \p DataType and kind of access to \p Ptr.
1064  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
1065  return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
1066  }
1067 
1068  /// Returns true if the target machine supports masked scatter operation
1069  /// for the given \p DataType.
1070  bool isLegalMaskedScatter(Type *DataType) {
1071  return TTI.isLegalMaskedScatter(DataType);
1072  }
1073 
1074  /// Returns true if the target machine supports masked gather operation
1075  /// for the given \p DataType.
1076  bool isLegalMaskedGather(Type *DataType) {
1077  return TTI.isLegalMaskedGather(DataType);
1078  }
1079 
1080  /// Returns true if the target machine can represent \p V as a masked gather
1081  /// or scatter operation.
1082  bool isLegalGatherOrScatter(Value *V) {
1083  bool LI = isa<LoadInst>(V);
1084  bool SI = isa<StoreInst>(V);
1085  if (!LI && !SI)
1086  return false;
1087  auto *Ty = getMemInstValueType(V);
1088  return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1089  }
1090 
1091  /// Returns true if \p I is an instruction that will be scalarized with
1092  /// predication. Such instructions include conditional stores and
1093  /// instructions that may divide by zero.
1094  /// If a non-zero VF has been calculated, we check if I will be scalarized
1095  /// with predication for that VF.
1096  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1097 
1098  // Returns true if \p I is an instruction that will be predicated either
1099  // through scalar predication or masked load/store or masked gather/scatter.
1100  // Superset of instructions that return true for isScalarWithPredication.
1103  return false;
1104  // Loads and stores that need some form of masked operation are predicated
1105  // instructions.
1106  if (isa<LoadInst>(I) || isa<StoreInst>(I))
1107  return Legal->isMaskRequired(I);
1108  return isScalarWithPredication(I);
1109  }
1110 
1111  /// Returns true if \p I is a memory instruction with consecutive memory
1112  /// access that can be widened.
1113  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1114 
1115  /// Check if \p Instr belongs to any interleaved access group.
1116  bool isAccessInterleaved(Instruction *Instr) {
1117  return InterleaveInfo.isInterleaved(Instr);
1118  }
1119 
1120  /// Get the interleaved access group that \p Instr belongs to.
1121  const InterleaveGroup *getInterleaveGroup(Instruction *Instr) {
1122  return InterleaveInfo.getInterleaveGroup(Instr);
1123  }
1124 
1125  /// Returns true if an interleaved group requires a scalar iteration
1126  /// to handle accesses with gaps.
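  /// For example, if a group only accesses some members of an interleaved
  /// stride-3 access pattern, the wide loads for the last full vector
  /// iteration could touch memory past the original trip count, so those
  /// final iterations are executed in the scalar epilogue instead.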
1127  bool requiresScalarEpilogue() const {
1128  return InterleaveInfo.requiresScalarEpilogue();
1129  }
1130 
1131 private:
1132  unsigned NumPredStores = 0;
1133 
1134  /// \return An upper bound for the vectorization factor, larger than zero.
1135  /// One is returned if vectorization should best be avoided due to cost.
1136  unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
1137 
1138  /// The vectorization cost is a combination of the cost itself and a boolean
1139  /// indicating whether any of the contributing operations will actually
1140  /// operate on
1141  /// vector values after type legalization in the backend. If this latter value
1142  /// is
1143  /// false, then all operations will be scalarized (i.e. no vectorization has
1144  /// actually taken place).
1145  using VectorizationCostTy = std::pair<unsigned, bool>;
1146 
1147  /// Returns the expected execution cost. The unit of the cost does
1148  /// not matter because we use the 'cost' units to compare different
1149  /// vector widths. The cost that is returned is *not* normalized by
1150  /// the factor width.
1151  VectorizationCostTy expectedCost(unsigned VF);
1152 
1153  /// Returns the execution time cost of an instruction for a given vector
1154  /// width. Vector width of one means scalar.
1155  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1156 
1157  /// The cost-computation logic from getInstructionCost which provides
1158  /// the vector type as an output parameter.
1159  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1160 
1161  /// Calculate vectorization cost of memory instruction \p I.
1162  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1163 
1164  /// The cost computation for scalarized memory instruction.
1165  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1166 
1167  /// The cost computation for interleaving group of memory instructions.
1168  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1169 
1170  /// The cost computation for Gather/Scatter instruction.
1171  unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1172 
1173  /// The cost computation for widening instruction \p I with consecutive
1174  /// memory access.
1175  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1176 
1177  /// The cost calculation for Load instruction \p I with uniform pointer -
1178  /// scalar load + broadcast.
1179  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1180 
1181  /// Returns whether the instruction is a load or store and will be emitted
1182  /// as a vector operation.
1183  bool isConsecutiveLoadOrStore(Instruction *I);
1184 
1185  /// Returns true if an artificially high cost for emulated masked memrefs
1186  /// should be used.
1187  bool useEmulatedMaskMemRefHack(Instruction *I);
1188 
1189  /// Create an analysis remark that explains why vectorization failed
1190  ///
1191  /// \p RemarkName is the identifier for the remark. \return the remark object
1192  /// that can be streamed to.
1193  OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
1194  return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(),
1195  RemarkName, TheLoop);
1196  }
1197 
1198  /// Map of scalar integer values to the smallest bitwidth they can be legally
1199  /// represented as. The vector equivalents of these values should be truncated
1200  /// to this type.
1202 
1203  /// A type representing the costs for instructions if they were to be
1204  /// scalarized rather than vectorized. The entries are Instruction-Cost
1205  /// pairs.
1207 
1208  /// A set containing all BasicBlocks that are known to be present after
1209  /// vectorization as predicated blocks.
1210  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1211 
1212  /// A map holding scalar costs for different vectorization factors. The
1213  /// presence of a cost for an instruction in the mapping indicates that the
1214  /// instruction will be scalarized when vectorizing with the associated
1215  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1216  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1217 
1218  /// Holds the instructions known to be uniform after vectorization.
1219  /// The data is collected per VF.
1221 
1222  /// Holds the instructions known to be scalar after vectorization.
1223  /// The data is collected per VF.
1225 
1226  /// Holds the instructions (address computations) that are forced to be
1227  /// scalarized.
1229 
1230  /// Returns the expected difference in cost from scalarizing the expression
1231  /// feeding a predicated instruction \p PredInst. The instructions to
1232  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1233  /// non-negative return value implies the expression will be scalarized.
1234  /// Currently, only single-use chains are considered for scalarization.
1235  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1236  unsigned VF);
1237 
1238  /// Collect the instructions that are uniform after vectorization. An
1239  /// instruction is uniform if we represent it with a single scalar value in
1240  /// the vectorized loop corresponding to each vector iteration. Examples of
1241  /// uniform instructions include pointer operands of consecutive or
1242  /// interleaved memory accesses. Note that although uniformity implies an
1243  /// instruction will be scalar, the reverse is not true. In general, a
1244  /// scalarized instruction will be represented by VF scalar values in the
1245  /// vectorized loop, each corresponding to an iteration of the original
1246  /// scalar loop.
1247  void collectLoopUniforms(unsigned VF);
1248 
1249  /// Collect the instructions that are scalar after vectorization. An
1250  /// instruction is scalar if it is known to be uniform or will be scalarized
1251  /// during vectorization. Non-uniform scalarized instructions will be
1252  /// represented by VF values in the vectorized loop, each corresponding to an
1253  /// iteration of the original scalar loop.
1254  void collectLoopScalars(unsigned VF);
1255 
1256  /// Keeps cost model vectorization decision and cost for instructions.
1257  /// Right now it is used for memory instructions only.
1259  std::pair<InstWidening, unsigned>>;
1260 
1261  DecisionList WideningDecisions;
1262 
1263 public:
1264  /// The loop that we evaluate.
1266 
1267  /// Predicated scalar evolution analysis.
1269 
1270  /// Loop Info analysis.
1272 
1273  /// Vectorization legality.
1275 
1276  /// Vector target information.
1278 
1279  /// Target Library Info.
1281 
1282  /// Demanded bits analysis.
1284 
1285  /// Assumption cache.
1287 
1288  /// Interface to emit optimization remarks.
1290 
1292 
1293  /// Loop Vectorize Hint.
1295 
1296  /// The interleave access information contains groups of interleaved accesses
1297  /// with the same stride and close to each other.
1299 
1300  /// Values to ignore in the cost model.
1302 
1303  /// Values to ignore in the cost model when VF > 1.
1305 };
1306 
1307 } // end namespace llvm
1308 
1309 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1310 // vectorization. The loop needs to be annotated with #pragma omp simd
1311 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
1312 // vector length information is not provided, vectorization is not considered
1313 // explicit. Interleave hints are not allowed either. These limitations will be
1314 // relaxed in the future.
1315 // Please note that we are currently forced to abuse the pragma 'clang
1316 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1317 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1318 // provides *explicit vectorization hints* (LV can bypass legal checks and
1319 // assume that vectorization is legal). However, both hints are implemented
1320 // using the same metadata (llvm.loop.vectorize, processed by
1321 // LoopVectorizeHints). This will be fixed in the future when the native IR
1322 // representation for pragma 'omp simd' is introduced.
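//
// For example, an outer loop annotated like this (illustrative source) is
// treated as explicitly vectorizable with VF = 4:
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       ...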
1323 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1324  OptimizationRemarkEmitter *ORE) {
1325  assert(!OuterLp->empty() && "This is not an outer loop");
1326  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1327 
1328  // Only outer loops with an explicit vectorization hint are supported.
1329  // Unannotated outer loops are ignored.
1330  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1331  return false;
1332 
1333  Function *Fn = OuterLp->getHeader()->getParent();
1334  if (!Hints.allowVectorization(Fn, OuterLp, false /*AlwaysVectorize*/)) {
1335  LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1336  return false;
1337  }
1338 
1339  if (!Hints.getWidth()) {
1340  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n");
1341  emitMissedWarning(Fn, OuterLp, Hints, ORE);
1342  return false;
1343  }
1344 
1345  if (Hints.getInterleave() > 1) {
1346  // TODO: Interleave support is future work.
1347  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1348  "outer loops.\n");
1349  emitMissedWarning(Fn, OuterLp, Hints, ORE);
1350  return false;
1351  }
1352 
1353  return true;
1354 }
1355 
1356 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1357  OptimizationRemarkEmitter *ORE,
1358  SmallVectorImpl<Loop *> &V) {
1359  // Collect inner loops and outer loops without irreducible control flow. For
1360  // now, only collect outer loops that have explicit vectorization hints. If we
1361  // are stress testing the VPlan H-CFG construction, we collect the outermost
1362  // loop of every loop nest.
1363  if (L.empty() || VPlanBuildStressTest ||
1364  (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1365  LoopBlocksRPO RPOT(&L);
1366  RPOT.perform(LI);
1367  if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1368  V.push_back(&L);
1369  // TODO: Collect inner loops inside marked outer loops in case
1370  // vectorization fails for the outer loop. Do not invoke
1371  // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1372  // already known to be reducible. We can use an inherited attribute for
1373  // that.
1374  return;
1375  }
1376  }
1377  for (Loop *InnerL : L)
1378  collectSupportedLoops(*InnerL, LI, ORE, V);
1379 }
1380 
1381 namespace {
1382 
1383 /// The LoopVectorize Pass.
1384 struct LoopVectorize : public FunctionPass {
1385  /// Pass identification, replacement for typeid
1386  static char ID;
1387 
1388  LoopVectorizePass Impl;
1389 
1390  explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
1391  : FunctionPass(ID) {
1392  Impl.DisableUnrolling = NoUnrolling;
1393  Impl.AlwaysVectorize = AlwaysVectorize;
1395  }
1396 
1397  bool runOnFunction(Function &F) override {
1398  if (skipFunction(F))
1399  return false;
1400 
1401  auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1402  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1403  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1404  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1405  auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1406  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1407  auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
1408  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1409  auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1410  auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1411  auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1412  auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1413 
1414  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1415  [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1416 
1417  return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1418  GetLAA, *ORE);
1419  }
1420 
1421  void getAnalysisUsage(AnalysisUsage &AU) const override {
1432 
1433  // We currently do not preserve loopinfo/dominator analyses with outer loop
1434  // vectorization. Until this is addressed, mark these analyses as preserved
1435  // only for non-VPlan-native path.
1436  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1437  if (!EnableVPlanNativePath) {
1440  }
1441 
1444  }
1445 };
1446 
1447 } // end anonymous namespace
1448 
1449 //===----------------------------------------------------------------------===//
1450 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1451 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1452 //===----------------------------------------------------------------------===//
1453 
1454 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1455  // We need to place the broadcast of invariant variables outside the loop,
1456  // but only if it's proven safe to do so. Else, broadcast will be inside
1457  // vector loop body.
1458  Instruction *Instr = dyn_cast<Instruction>(V);
1459  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1460  (!Instr ||
1462  // Place the code for broadcasting invariant variables in the new preheader.
1464  if (SafeToHoist)
1466 
1467  // Broadcast the scalar into all locations in the vector.
1468  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1469 
1470  return Shuf;
1471 }
1472 
1473 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1474  const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1475  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1476  "Expected either an induction phi-node or a truncate of it!");
1477  Value *Start = II.getStartValue();
1478 
1479  // Construct the initial value of the vector IV in the vector loop preheader
1480  auto CurrIP = Builder.saveIP();
1482  if (isa<TruncInst>(EntryVal)) {
1483  assert(Start->getType()->isIntegerTy() &&
1484  "Truncation requires an integer type");
1485  auto *TruncType = cast<IntegerType>(EntryVal->getType());
1486  Step = Builder.CreateTrunc(Step, TruncType);
1487  Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1488  }
1489  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1490  Value *SteppedStart =
1491  getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1492 
1493  // We create vector phi nodes for both integer and floating-point induction
1494  // variables. Here, we determine the kind of arithmetic we will perform.
1495  Instruction::BinaryOps AddOp;
1496  Instruction::BinaryOps MulOp;
1497  if (Step->getType()->isIntegerTy()) {
1498  AddOp = Instruction::Add;
1499  MulOp = Instruction::Mul;
1500  } else {
1501  AddOp = II.getInductionOpcode();
1502  MulOp = Instruction::FMul;
1503  }
1504 
1505  // Multiply the vectorization factor by the step using integer or
1506  // floating-point arithmetic as appropriate.
1507  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1508  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1509 
1510  // Create a vector splat to use in the induction update.
1511  //
1512  // FIXME: If the step is non-constant, we create the vector splat with
1513  // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1514  // handle a constant vector splat.
1515  Value *SplatVF = isa<Constant>(Mul)
1516  ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1517  : Builder.CreateVectorSplat(VF, Mul);
1518  Builder.restoreIP(CurrIP);
1519 
1520  // We may need to add the step a number of times, depending on the unroll
1521  // factor. The last of those goes into the PHI.
1522  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1523  &*LoopVectorBody->getFirstInsertionPt());
1524  VecInd->setDebugLoc(EntryVal->getDebugLoc());
1525  Instruction *LastInduction = VecInd;
1526  for (unsigned Part = 0; Part < UF; ++Part) {
1527  VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1528 
1529  if (isa<TruncInst>(EntryVal))
1530  addMetadata(LastInduction, EntryVal);
1531  recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1532 
1533  LastInduction = cast<Instruction>(addFastMathFlag(
1534  Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1535  LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1536  }
1537 
1538  // Move the last step to the end of the latch block. This ensures consistent
1539  // placement of all induction updates.
1540  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1541  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1542  auto *ICmp = cast<Instruction>(Br->getCondition());
1543  LastInduction->moveBefore(ICmp);
1544  LastInduction->setName("vec.ind.next");
1545 
1546  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1547  VecInd->addIncoming(LastInduction, LoopVectorLatch);
1548 }
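// Illustrative sketch (not part of the original source): for an i32 induction
// starting at 0 with step 1, VF = 4 and UF = 2, the PHI built above looks
// roughly like:
//   %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
//                            [ %vec.ind.next, %vector.body ]
//   %step.add = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
//   %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>
// where %vec.ind serves unroll part 0 and %step.add serves part 1.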
1549 
1550 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1551  return Cost->isScalarAfterVectorization(I, VF) ||
1552  Cost->isProfitableToScalarize(I, VF);
1553 }
1554 
1555 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1556  if (shouldScalarizeInstruction(IV))
1557  return true;
1558  auto isScalarInst = [&](User *U) -> bool {
1559  auto *I = cast<Instruction>(U);
1560  return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1561  };
1562  return llvm::any_of(IV->users(), isScalarInst);
1563 }
1564 
1565 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1566  const InductionDescriptor &ID, const Instruction *EntryVal,
1567  Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1568  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1569  "Expected either an induction phi-node or a truncate of it!");
1570 
1571  // This induction variable is not the phi from the original loop but a
1572  // newly-created IV, based on the proof that the casted phi is equal to the
1573  // uncasted phi in the vectorized loop (possibly under a runtime guard). It
1574  // re-uses the same InductionDescriptor that the original IV uses, but we
1575  // don't have to do any recording in this case - that is done when the
1576  // original IV is processed.
1577  if (isa<TruncInst>(EntryVal))
1578  return;
1579 
1580  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1581  if (Casts.empty())
1582  return;
1583  // Only the first Cast instruction in the Casts vector is of interest.
1584  // The rest of the Casts (if any) have no uses outside the
1585  // induction update chain itself.
1586  Instruction *CastInst = *Casts.begin();
1587  if (Lane < UINT_MAX)
1588  VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1589  else
1590  VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1591 }
1592 
1593 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1594  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1595  "Primary induction variable must have an integer type");
1596 
1597  auto II = Legal->getInductionVars()->find(IV);
1598  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1599 
1600  auto ID = II->second;
1601  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1602 
1603  // The scalar value to broadcast. This will be derived from the canonical
1604  // induction variable.
1605  Value *ScalarIV = nullptr;
1606 
1607  // The value from the original loop to which we are mapping the new induction
1608  // variable.
1609  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1610 
1611  // True if we have vectorized the induction variable.
1612  auto VectorizedIV = false;
1613 
1614  // Determine if we want a scalar version of the induction variable. This is
1615  // true if the induction variable itself is not widened, or if it has at
1616  // least one user in the loop that is not widened.
1617  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1618 
1619  // Generate code for the induction step. Note that induction steps are
1620  // required to be loop-invariant
1621  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1622  "Induction step should be loop invariant");
1623  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1624  Value *Step = nullptr;
1625  if (PSE.getSE()->isSCEVable(IV->getType())) {
1626  SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1627  Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1628  &*Builder.GetInsertPoint());
1629  } else {
1630  Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1631  }
1632 
1633  // Try to create a new independent vector induction variable. If we can't
1634  // create the phi node, we will splat the scalar induction variable in each
1635  // loop iteration.
1636  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1637  createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1638  VectorizedIV = true;
1639  }
1640 
1641  // If we haven't yet vectorized the induction variable, or if we will create
1642  // a scalar one, we need to define the scalar induction variable and step
1643  // values. If we were given a truncation type, truncate the canonical
1644  // induction variable and step. Otherwise, derive these values from the
1645  // induction descriptor.
1646  if (!VectorizedIV || NeedsScalarIV) {
1647  ScalarIV = Induction;
1648  if (IV != OldInduction) {
1649  ScalarIV = IV->getType()->isIntegerTy()
1650  ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1651  : Builder.CreateCast(Instruction::SIToFP, Induction,
1652  IV->getType());
1653  ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1654  ScalarIV->setName("offset.idx");
1655  }
1656  if (Trunc) {
1657  auto *TruncType = cast<IntegerType>(Trunc->getType());
1658  assert(Step->getType()->isIntegerTy() &&
1659  "Truncation requires an integer step");
1660  ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1661  Step = Builder.CreateTrunc(Step, TruncType);
1662  }
1663  }
1664 
1665  // If we haven't yet vectorized the induction variable, splat the scalar
1666  // induction variable, and build the necessary step vectors.
1667  // TODO: Don't do it unless the vectorized IV is really required.
1668  if (!VectorizedIV) {
1669  Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1670  for (unsigned Part = 0; Part < UF; ++Part) {
1671  Value *EntryPart =
1672  getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1673  VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1674  if (Trunc)
1675  addMetadata(EntryPart, Trunc);
1676  recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1677  }
1678  }
1679 
1680  // If an induction variable is only used for counting loop iterations or
1681  // calculating addresses, it doesn't need to be widened. Create scalar steps
1682  // that can be used by instructions we will later scalarize. Note that the
1683  // addition of the scalar steps will not increase the number of instructions
1684  // in the loop in the common case prior to InstCombine. We will be trading
1685  // one vector extract for each scalar step.
1686  if (NeedsScalarIV)
1687  buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1688 }
1689 
1690 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1691  Instruction::BinaryOps BinOp) {
1692  // Create and check the types.
1693  assert(Val->getType()->isVectorTy() && "Must be a vector");
1694  int VLen = Val->getType()->getVectorNumElements();
1695 
1696  Type *STy = Val->getType()->getScalarType();
1697  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1698  "Induction Step must be an integer or FP");
1699  assert(Step->getType() == STy && "Step has wrong type");
1700 
1701  SmallVector<Constant *, 8> Indices;
1702 
1703  if (STy->isIntegerTy()) {
1704  // Create a vector of consecutive numbers from zero to VF.
1705  for (int i = 0; i < VLen; ++i)
1706  Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1707 
1708  // Add the consecutive indices to the vector value.
1709  Constant *Cv = ConstantVector::get(Indices);
1710  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1711  Step = Builder.CreateVectorSplat(VLen, Step);
1712  assert(Step->getType() == Val->getType() && "Invalid step vec");
1713  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1714  // which can be found from the original scalar operations.
1715  Step = Builder.CreateMul(Cv, Step);
1716  return Builder.CreateAdd(Val, Step, "induction");
1717  }
1718 
1719  // Floating point induction.
1720  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1721  "Binary Opcode should be specified for FP induction");
1722  // Create a vector of consecutive numbers from zero to VF.
1723  for (int i = 0; i < VLen; ++i)
1724  Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1725 
1726  // Add the consecutive indices to the vector value.
1727  Constant *Cv = ConstantVector::get(Indices);
1728 
1729  Step = Builder.CreateVectorSplat(VLen, Step);
1730 
1731  // Floating point operations had to be 'fast' to enable the induction.
1732  FastMathFlags Flags;
1733  Flags.setFast();
1734 
1735  Value *MulOp = Builder.CreateFMul(Cv, Step);
1736  if (isa<Instruction>(MulOp))
1737  // We have to check because MulOp may be a constant.
1738  cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1739 
1740  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1741  if (isa<Instruction>(BOp))
1742  cast<Instruction>(BOp)->setFastMathFlags(Flags);
1743  return BOp;
1744 }
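// Illustrative sketch (not part of the original source): for an integer step
// and VLen = 4, getStepVector(Val, StartIdx, Step) computes, element-wise,
//   Val + <StartIdx+0, StartIdx+1, StartIdx+2, StartIdx+3> * Step
// e.g. with StartIdx = 0 and Step = 2 it adds <0, 2, 4, 6> to Val.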
1745 
1746 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1747  Instruction *EntryVal,
1748  const InductionDescriptor &ID) {
1749  // We shouldn't have to build scalar steps if we aren't vectorizing.
1750  assert(VF > 1 && "VF should be greater than one");
1751 
1752  // Get the value type and ensure it and the step have the same integer type.
1753  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1754  assert(ScalarIVTy == Step->getType() &&
1755  "Val and Step should have the same type");
1756 
1757  // We build scalar steps for both integer and floating-point induction
1758  // variables. Here, we determine the kind of arithmetic we will perform.
1759  Instruction::BinaryOps AddOp;
1760  Instruction::BinaryOps MulOp;
1761  if (ScalarIVTy->isIntegerTy()) {
1762  AddOp = Instruction::Add;
1763  MulOp = Instruction::Mul;
1764  } else {
1765  AddOp = ID.getInductionOpcode();
1766  MulOp = Instruction::FMul;
1767  }
1768 
1769  // Determine the number of scalars we need to generate for each unroll
1770  // iteration. If EntryVal is uniform, we only need to generate the first
1771  // lane. Otherwise, we generate all VF values.
1772  unsigned Lanes =
1773  Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1774  : VF;
1775  // Compute the scalar steps and save the results in VectorLoopValueMap.
1776  for (unsigned Part = 0; Part < UF; ++Part) {
1777  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1778  auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1779  auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1780  auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1781  VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1782  recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1783  }
1784  }
1785 }
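// Illustrative sketch (not part of the original source): with VF = 4, UF = 2
// and an integer induction, the scalar steps recorded above are
//   ScalarIV + (VF * Part + Lane) * Step
// so part 0 covers offsets 0..3 * Step and part 1 covers offsets 4..7 * Step
// (only lane 0 is generated when the value is uniform after vectorization).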
1786 
1787 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1788  assert(V != Induction && "The new induction variable should not be used.");
1789  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1790  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1791 
1792  // If we have a stride that is replaced by one, do it here. Defer this for
1793  // the VPlan-native path until we start running Legal checks in that path.
1794  if (!EnableVPlanNativePath && Legal->hasStride(V))
1795  V = ConstantInt::get(V->getType(), 1);
1796 
1797  // If we have a vector mapped to this value, return it.
1798  if (VectorLoopValueMap.hasVectorValue(V, Part))
1799  return VectorLoopValueMap.getVectorValue(V, Part);
1800 
1801  // If the value has not been vectorized, check if it has been scalarized
1802  // instead. If it has been scalarized, and we actually need the value in
1803  // vector form, we will construct the vector values on demand.
1804  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
1805  Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
1806 
1807  // If we've scalarized a value, that value should be an instruction.
1808  auto *I = cast<Instruction>(V);
1809 
1810  // If we aren't vectorizing, we can just copy the scalar map values over to
1811  // the vector map.
1812  if (VF == 1) {
1813  VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
1814  return ScalarValue;
1815  }
1816 
1817  // Get the last scalar instruction we generated for V and Part. If the value
1818  // is known to be uniform after vectorization, this corresponds to lane zero
1819  // of the Part unroll iteration. Otherwise, the last instruction is the one
1820  // we created for the last vector lane of the Part unroll iteration.
1821  unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
1822  auto *LastInst = cast<Instruction>(
1823  VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
1824 
1825  // Set the insert point after the last scalarized instruction. This ensures
1826  // the insertelement sequence will directly follow the scalar definitions.
1827  auto OldIP = Builder.saveIP();
1828  auto NewIP = std::next(BasicBlock::iterator(LastInst));
1829  Builder.SetInsertPoint(&*NewIP);
1830 
1831  // However, if we are vectorizing, we need to construct the vector values.
1832  // If the value is known to be uniform after vectorization, we can just
1833  // broadcast the scalar value corresponding to lane zero for each unroll
1834  // iteration. Otherwise, we construct the vector values using insertelement
1835  // instructions. Since the resulting vectors are stored in
1836  // VectorLoopValueMap, we will only generate the insertelements once.
1837  Value *VectorValue = nullptr;
1838  if (Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) {
1839  VectorValue = getBroadcastInstrs(ScalarValue);
1840  VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
1841  } else {
1842  // Initialize packing with insertelements to start from undef.
1843  Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
1844  VectorLoopValueMap.setVectorValue(V, Part, Undef);
1845  for (unsigned Lane = 0; Lane < VF; ++Lane)
1846  packScalarIntoVectorValue(V, {Part, Lane});
1847  VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
1848  }
1849  Builder.restoreIP(OldIP);
1850  return VectorValue;
1851  }
1852 
1853  // If this scalar is unknown, assume that it is a constant or that it is
1854  // loop invariant. Broadcast V and save the value for future uses.
1855  Value *B = getBroadcastInstrs(V);
1856  VectorLoopValueMap.setVectorValue(V, Part, B);
1857  return B;
1858 }
1859 
1860 Value *
1861 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
1862  const VPIteration &Instance) {
1863  // If the value is not an instruction contained in the loop, it should
1864  // already be scalar.
1865  if (OrigLoop->isLoopInvariant(V))
1866  return V;
1867 
1868  assert(Instance.Lane > 0
1869  ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
1870  : true && "Uniform values only have lane zero");
1871 
1872  // If the value from the original loop has not been vectorized, it is
1873  // represented by UF x VF scalar values in the new loop. Return the requested
1874  // scalar value.
1875  if (VectorLoopValueMap.hasScalarValue(V, Instance))
1876  return VectorLoopValueMap.getScalarValue(V, Instance);
1877 
1878  // If the value has not been scalarized, get its entry in VectorLoopValueMap
1879  // for the given unroll part. If this entry is not a vector type (i.e., the
1880  // vectorization factor is one), there is no need to generate an
1881  // extractelement instruction.
1882  auto *U = getOrCreateVectorValue(V, Instance.Part);
1883  if (!U->getType()->isVectorTy()) {
1884  assert(VF == 1 && "Value not scalarized has non-vector type");
1885  return U;
1886  }
1887 
1888  // Otherwise, the value from the original loop has been vectorized and is
1889  // represented by UF vector values. Extract and return the requested scalar
1890  // value from the appropriate vector lane.
1891  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
1892 }
1893 
1894 void InnerLoopVectorizer::packScalarIntoVectorValue(
1895  Value *V, const VPIteration &Instance) {
1896  assert(V != Induction && "The new induction variable should not be used.");
1897  assert(!V->getType()->isVectorTy() && "Can't pack a vector");
1898  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1899 
1900  Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
1901  Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
1902  VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
1903  Builder.getInt32(Instance.Lane));
1904  VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
1905 }
1906 
1907 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
1908  assert(Vec->getType()->isVectorTy() && "Invalid type");
1909  SmallVector<Constant *, 8> ShuffleMask;
1910  for (unsigned i = 0; i < VF; ++i)
1911  ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
1912 
1913  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
1914  ConstantVector::get(ShuffleMask),
1915  "reverse");
1916 }
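// Illustrative sketch (not part of the original source): for VF = 4 the
// reverse uses the shuffle mask <3, 2, 1, 0>, i.e.
//   %reverse = shufflevector <4 x i32> %vec, <4 x i32> undef,
//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>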
1917 
1918 // Try to vectorize the interleave group that \p Instr belongs to.
1919 //
1920 // E.g. Translate following interleaved load group (factor = 3):
1921 // for (i = 0; i < N; i+=3) {
1922 // R = Pic[i]; // Member of index 0
1923 // G = Pic[i+1]; // Member of index 1
1924 // B = Pic[i+2]; // Member of index 2
1925 // ... // do something to R, G, B
1926 // }
1927 // To:
1928 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
1929 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
1930 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
1931 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
1932 //
1933 // Or translate following interleaved store group (factor = 3):
1934 // for (i = 0; i < N; i+=3) {
1935 // ... do something to R, G, B
1936 // Pic[i] = R; // Member of index 0
1937 // Pic[i+1] = G; // Member of index 1
1938 // Pic[i+2] = B; // Member of index 2
1939 // }
1940 // To:
1941 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
1942 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
1943 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
1944 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
1945 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
1946 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
1947  const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr);
1948  assert(Group && "Fail to get an interleaved access group.");
1949 
1950  // Skip if current instruction is not the insert position.
1951  if (Instr != Group->getInsertPos())
1952  return;
1953 
1954  const DataLayout &DL = Instr->getModule()->getDataLayout();
1955  Value *Ptr = getLoadStorePointerOperand(Instr);
1956 
1957  // Prepare for the vector type of the interleaved load/store.
1958  Type *ScalarTy = getMemInstValueType(Instr);
1959  unsigned InterleaveFactor = Group->getFactor();
1960  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
1961  Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
1962 
1963  // Prepare for the new pointers.
1964  setDebugLocFromInst(Builder, Ptr);
1965  SmallVector<Value *, 2> NewPtrs;
1966  unsigned Index = Group->getIndex(Instr);
1967 
1968  // If the group is reverse, adjust the index to refer to the last vector lane
1969  // instead of the first. We adjust the index from the first vector lane,
1970  // rather than directly getting the pointer for lane VF - 1, because the
1971  // pointer operand of the interleaved access is supposed to be uniform. For
1972  // uniform instructions, we're only required to generate a value for the
1973  // first vector lane in each unroll iteration.
1974  if (Group->isReverse())
1975  Index += (VF - 1) * Group->getFactor();
1976 
1977  bool InBounds = false;
1978  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
1979  InBounds = gep->isInBounds();
1980 
1981  for (unsigned Part = 0; Part < UF; Part++) {
1982  Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
1983 
1984  // Note that the current instruction could be a member at any index. We need
1985  // to adjust the address down to the member at index 0.
1986  //
1987  // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
1988  // b = A[i]; // Member of index 0
1989  // The current pointer points to A[i+1]; adjust it to A[i].
1990  //
1991  // E.g. A[i+1] = a; // Member of index 1
1992  // A[i] = b; // Member of index 0
1993  // A[i+2] = c; // Member of index 2 (Current instruction)
1994  // The current pointer points to A[i+2]; adjust it to A[i].
1995  NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
1996  if (InBounds)
1997  cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
1998 
1999  // Cast to the vector pointer type.
2000  NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2001  }
2002 
2003  setDebugLocFromInst(Builder, Instr);
2004  Value *UndefVec = UndefValue::get(VecTy);
2005 
2006  // Vectorize the interleaved load group.
2007  if (isa<LoadInst>(Instr)) {
2008  // For each unroll part, create a wide load for the group.
2009  SmallVector<Value *, 2> NewLoads;
2010  for (unsigned Part = 0; Part < UF; Part++) {
2011  auto *NewLoad = Builder.CreateAlignedLoad(
2012  NewPtrs[Part], Group->getAlignment(), "wide.vec");
2013  Group->addMetadata(NewLoad);
2014  NewLoads.push_back(NewLoad);
2015  }
2016 
2017  // For each member in the group, shuffle out the appropriate data from the
2018  // wide loads.
2019  for (unsigned I = 0; I < InterleaveFactor; ++I) {
2020  Instruction *Member = Group->getMember(I);
2021 
2022  // Skip the gaps in the group.
2023  if (!Member)
2024  continue;
2025 
2026  Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2027  for (unsigned Part = 0; Part < UF; Part++) {
2028  Value *StridedVec = Builder.CreateShuffleVector(
2029  NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2030 
2031  // If this member has a different type, cast the result to that type.
2032  if (Member->getType() != ScalarTy) {
2033  VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2034  StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2035  }
2036 
2037  if (Group->isReverse())
2038  StridedVec = reverseVector(StridedVec);
2039 
2040  VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2041  }
2042  }
2043  return;
2044  }
2045 
2046  // The sub vector type for current instruction.
2047  VectorType *SubVT = VectorType::get(ScalarTy, VF);
2048 
2049  // Vectorize the interleaved store group.
2050  for (unsigned Part = 0; Part < UF; Part++) {
2051  // Collect the stored vector from each member.
2052  SmallVector<Value *, 4> StoredVecs;
2053  for (unsigned i = 0; i < InterleaveFactor; i++) {
2054  // An interleaved store group doesn't allow gaps, so each index has a member.
2055  Instruction *Member = Group->getMember(i);
2056  assert(Member && "Fail to get a member from an interleaved store group");
2057 
2058  Value *StoredVec = getOrCreateVectorValue(
2059  cast<StoreInst>(Member)->getValueOperand(), Part);
2060  if (Group->isReverse())
2061  StoredVec = reverseVector(StoredVec);
2062 
2063  // If this member has a different type, cast it to a unified type.
2064 
2065  if (StoredVec->getType() != SubVT)
2066  StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2067 
2068  StoredVecs.push_back(StoredVec);
2069  }
2070 
2071  // Concatenate all vectors into a wide vector.
2072  Value *WideVec = concatenateVectors(Builder, StoredVecs);
2073 
2074  // Interleave the elements in the wide vector.
2075  Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2076  Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2077  "interleaved.vec");
2078 
2079  Instruction *NewStoreInstr =
2080  Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
2081 
2082  Group->addMetadata(NewStoreInstr);
2083  }
2084 }
2085 
2086 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2087  VectorParts *BlockInMask) {
2088  // Attempt to issue a wide load.
2089  LoadInst *LI = dyn_cast<LoadInst>(Instr);
2090  StoreInst *SI = dyn_cast<StoreInst>(Instr);
2091 
2092  assert((LI || SI) && "Invalid Load/Store instruction");
2093 
2094  LoopVectorizationCostModel::InstWidening Decision =
2095  Cost->getWideningDecision(Instr, VF);
2096  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2097  "CM decision should be taken at this point");
2098  if (Decision == LoopVectorizationCostModel::CM_Interleave)
2099  return vectorizeInterleaveGroup(Instr);
2100 
2101  Type *ScalarDataTy = getMemInstValueType(Instr);
2102  Type *DataTy = VectorType::get(ScalarDataTy, VF);
2103  Value *Ptr = getLoadStorePointerOperand(Instr);
2104  unsigned Alignment = getLoadStoreAlignment(Instr);
2105  // An alignment of 0 means target abi alignment. We need to use the scalar's
2106  // target abi alignment in such a case.
2107  const DataLayout &DL = Instr->getModule()->getDataLayout();
2108  if (!Alignment)
2109  Alignment = DL.getABITypeAlignment(ScalarDataTy);
2110  unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2111 
2112  // Determine if the pointer operand of the access is either consecutive or
2113  // reverse consecutive.
2114  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2115  bool ConsecutiveStride =
2116  Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2117  bool CreateGatherScatter =
2118  (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2119 
2120  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2121  // gather/scatter. Otherwise Decision should have been to Scalarize.
2122  assert((ConsecutiveStride || CreateGatherScatter) &&
2123  "The instruction should be scalarized");
2124 
2125  // Handle consecutive loads/stores.
2126  if (ConsecutiveStride)
2127  Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2128 
2129  VectorParts Mask;
2130  bool isMaskRequired = BlockInMask;
2131  if (isMaskRequired)
2132  Mask = *BlockInMask;
2133 
2134  bool InBounds = false;
2135  if (auto *gep = dyn_cast<GetElementPtrInst>(
2136  getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2137  InBounds = gep->isInBounds();
2138 
2139  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2140  // Calculate the pointer for the specific unroll-part.
2141  GetElementPtrInst *PartPtr = nullptr;
2142 
2143  if (Reverse) {
2144  // If the address is consecutive but reversed, then the
2145  // wide store needs to start at the last vector element.
2146  PartPtr = cast<GetElementPtrInst>(
2147  Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)));
2148  PartPtr->setIsInBounds(InBounds);
2149  PartPtr = cast<GetElementPtrInst>(
2150  Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)));
2151  PartPtr->setIsInBounds(InBounds);
2152  if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2153  Mask[Part] = reverseVector(Mask[Part]);
2154  } else {
2155  PartPtr = cast<GetElementPtrInst>(
2156  Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)));
2157  PartPtr->setIsInBounds(InBounds);
2158  }
2159 
2160  return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2161  };
2162 
2163  // Handle Stores:
2164  if (SI) {
2165  setDebugLocFromInst(Builder, SI);
2166 
2167  for (unsigned Part = 0; Part < UF; ++Part) {
2168  Instruction *NewSI = nullptr;
2169  Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2170  if (CreateGatherScatter) {
2171  Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2172  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2173  NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2174  MaskPart);
2175  } else {
2176  if (Reverse) {
2177  // If we store to reverse consecutive memory locations, then we need
2178  // to reverse the order of elements in the stored value.
2179  StoredVal = reverseVector(StoredVal);
2180  // We don't want to update the value in the map as it might be used in
2181  // another expression. So don't call resetVectorValue(StoredVal).
2182  }
2183  auto *VecPtr = CreateVecPtr(Part, Ptr);
2184  if (isMaskRequired)
2185  NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2186  Mask[Part]);
2187  else
2188  NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2189  }
2190  addMetadata(NewSI, SI);
2191  }
2192  return;
2193  }
2194 
2195  // Handle loads.
2196  assert(LI && "Must have a load instruction");
2197  setDebugLocFromInst(Builder, LI);
2198  for (unsigned Part = 0; Part < UF; ++Part) {
2199  Value *NewLI;
2200  if (CreateGatherScatter) {
2201  Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2202  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2203  NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2204  nullptr, "wide.masked.gather");
2205  addMetadata(NewLI, LI);
2206  } else {
2207  auto *VecPtr = CreateVecPtr(Part, Ptr);
2208  if (isMaskRequired)
2209  NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2210  UndefValue::get(DataTy),
2211  "wide.masked.load");
2212  else
2213  NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
2214 
2215  // Add metadata to the load, but setVectorValue to the reverse shuffle.
2216  addMetadata(NewLI, LI);
2217  if (Reverse)
2218  NewLI = reverseVector(NewLI);
2219  }
2220  VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2221  }
2222 }
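// Illustrative sketch (not part of the original source): for a reverse
// consecutive i32 access with VF = 4 and unroll part 1, CreateVecPtr above
// produces roughly
//   %p0 = getelementptr i32, i32* %ptr, i32 -4   ; -(Part * VF) with Part = 1
//   %p1 = getelementptr i32, i32* %p0, i32 -3    ; 1 - VF
//   %vp = bitcast i32* %p1 to <4 x i32>*
// so each unroll part's wide access starts at its last-accessed element.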
2223 
2224 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2225  const VPIteration &Instance,
2226  bool IfPredicateInstr) {
2227  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2228 
2229  setDebugLocFromInst(Builder, Instr);
2230 
2231  // Does this instruction return a value ?
2232  bool IsVoidRetTy = Instr->getType()->isVoidTy();
2233 
2234  Instruction *Cloned = Instr->clone();
2235  if (!IsVoidRetTy)
2236  Cloned->setName(Instr->getName() + ".cloned");
2237 
2238  // Replace the operands of the cloned instructions with their scalar
2239  // equivalents in the new loop.
2240  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2241  auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2242  Cloned->setOperand(op, NewOp);
2243  }
2244  addNewMetadata(Cloned, Instr);
2245 
2246  // Place the cloned scalar in the new loop.
2247  Builder.Insert(Cloned);
2248 
2249  // Add the cloned scalar to the scalar map entry.
2250  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2251 
2252  // If we just cloned a new assumption, add it the assumption cache.
2253  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2254  if (II->getIntrinsicID() == Intrinsic::assume)
2255  AC->registerAssumption(II);
2256 
2257  // End if-block.
2258  if (IfPredicateInstr)
2259  PredicatedInstructions.push_back(Cloned);
2260 }
2261 
2262 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2263  Value *End, Value *Step,
2264  Instruction *DL) {
2265  BasicBlock *Header = L->getHeader();
2266  BasicBlock *Latch = L->getLoopLatch();
2267  // As we're just creating this loop, it's possible no latch exists
2268  // yet. If so, use the header as this will be a single block loop.
2269  if (!Latch)
2270  Latch = Header;
2271 
2272  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2273  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2274  setDebugLocFromInst(Builder, OldInst);
2275  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2276 
2277  Builder.SetInsertPoint(Latch->getTerminator());
2278  setDebugLocFromInst(Builder, OldInst);
2279 
2280  // Create i+1 and fill the PHINode.
2281  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2282  Induction->addIncoming(Start, L->getLoopPreheader());
2283  Induction->addIncoming(Next, Latch);
2284  // Create the compare.
2285  Value *ICmp = Builder.CreateICmpEQ(Next, End);
2286  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2287 
2288  // Now we have two terminators. Remove the old one from the block.
2289  Latch->getTerminator()->eraseFromParent();
2290 
2291  return Induction;
2292 }
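// Illustrative sketch (not part of the original source): the canonical
// induction variable created above has roughly this shape:
//   %index = phi i64 [ %start, %vector.ph ], [ %index.next, %latch ]
//   %index.next = add i64 %index, %step
//   %cmp = icmp eq i64 %index.next, %end
//   br i1 %cmp, label %exit, label %header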
2293 
2294 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2295  if (TripCount)
2296  return TripCount;
2297 
2298  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2299  // Find the loop boundaries.
2300  ScalarEvolution *SE = PSE.getSE();
2301  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2302  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2303  "Invalid loop count");
2304 
2305  Type *IdxTy = Legal->getWidestInductionType();
2306  assert(IdxTy && "No type for induction");
2307 
2308  // The exit count might have the type of i64 while the phi is i32. This can
2309  // happen if we have an induction variable that is sign extended before the
2310  // compare. The only way we get a backedge-taken count is if the
2311  // induction variable was signed and as such will not overflow. In such a
2312  // case truncation is legal.
2313  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2314  IdxTy->getPrimitiveSizeInBits())
2315  BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2316  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2317 
2318  // Get the total trip count from the count by adding 1.
2319  const SCEV *ExitCount = SE->getAddExpr(
2320  BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2321 
2322  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2323 
2324  // Expand the trip count and place the new instructions in the preheader.
2325  // Notice that the pre-header does not change, only the loop body.
2326  SCEVExpander Exp(*SE, DL, "induction");
2327 
2328  // Count holds the overall loop count (N).
2329  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2330  L->getLoopPreheader()->getTerminator());
2331 
2332  if (TripCount->getType()->isPointerTy())
2333  TripCount =
2334  CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2335  L->getLoopPreheader()->getTerminator());
2336 
2337  return TripCount;
2338 }
2339 
2340 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2341  if (VectorTripCount)
2342  return VectorTripCount;
2343 
2344  Value *TC = getOrCreateTripCount(L);
2345  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2346 
2347  // Now we need to generate the expression for the part of the loop that the
2348  // vectorized body will execute. This is equal to N - (N % Step) if scalar
2349  // iterations are not required for correctness, or N - Step, otherwise. Step
2350  // is equal to the vectorization factor (number of SIMD elements) times the
2351  // unroll factor (number of SIMD instructions).
2352  Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
2353  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2354 
2355  // If there is a non-reversed interleaved group that may speculatively access
2356  // memory out-of-bounds, we need to ensure that there will be at least one
2357  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2358  // the trip count, we set the remainder to be equal to the step. If the step
2359  // does not evenly divide the trip count, no adjustment is necessary since
2360  // there will already be scalar iterations. Note that the minimum iterations
2361  // check ensures that N >= Step.
2362  if (VF > 1 && Cost->requiresScalarEpilogue()) {
2363  auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2364  R = Builder.CreateSelect(IsZero, Step, R);
2365  }
2366 
2367  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2368 
2369  return VectorTripCount;
2370 }
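// Worked example (not part of the original source): with a trip count of 1000,
// VF = 4 and UF = 2, Step is 8 and R = 1000 % 8 = 0, so n.vec = 1000 and no
// scalar iterations remain. If a scalar epilogue is required, R is bumped from
// 0 to 8 and n.vec becomes 992, guaranteeing at least one scalar iteration.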
2371 
2372 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2373  const DataLayout &DL) {
2374  // Verify that V is a vector type with same number of elements as DstVTy.
2375  unsigned VF = DstVTy->getNumElements();
2376  VectorType *SrcVecTy = cast<VectorType>(V->getType());
2377  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2378  Type *SrcElemTy = SrcVecTy->getElementType();
2379  Type *DstElemTy = DstVTy->getElementType();
2380  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2381  "Vector elements must have same size");
2382 
2383  // Do a direct cast if element types are castable.
2384  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2385  return Builder.CreateBitOrPointerCast(V, DstVTy);
2386  }
2387  // V cannot be directly casted to desired vector type.
2388  // May happen when V is a floating point vector but DstVTy is a vector of
2389  // pointers or vice-versa. Handle this using a two-step bitcast using an
2390  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2391  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2392  "Only one type should be a pointer type");
2393  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2394  "Only one type should be a floating point type");
2395  Type *IntTy =
2396  IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2397  VectorType *VecIntTy = VectorType::get(IntTy, VF);
2398  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2399  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2400 }
2401 
2402 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2403  BasicBlock *Bypass) {
2404  Value *Count = getOrCreateTripCount(L);
2405  BasicBlock *BB = L->getLoopPreheader();
2406  IRBuilder<> Builder(BB->getTerminator());
2407 
2408  // Generate code to check if the loop's trip count is less than VF * UF, or
2409  // equal to it in case a scalar epilogue is required; this implies that the
2410  // vector trip count is zero. This check also covers the case where adding one
2411  // to the backedge-taken count overflowed leading to an incorrect trip count
2412  // of zero. In this case we will also jump to the scalar loop.
2413  auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2414  : ICmpInst::ICMP_ULT;
2415  Value *CheckMinIters = Builder.CreateICmp(
2416  P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
2417 
2418  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2419  // Update dominator tree immediately if the generated block is a
2420  // LoopBypassBlock because SCEV expansions to generate loop bypass
2421  // checks may query it before the current function is finished.
2422  DT->addNewBlock(NewBB, BB);
2423  if (L->getParentLoop())
2424  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2425  ReplaceInstWithInst(BB->getTerminator(),
2426  BranchInst::Create(Bypass, NewBB, CheckMinIters));
2427  LoopBypassBlocks.push_back(BB);
2428 }
2429 
2430 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2431  BasicBlock *BB = L->getLoopPreheader();
2432 
2433  // Generate the code to check the SCEV assumptions that we made.
2434  // We want the new basic block to start at the first instruction in a
2435  // sequence of instructions that form a check.
2436  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2437  "scev.check");
2438  Value *SCEVCheck =
2439  Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2440 
2441  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2442  if (C->isZero())
2443  return;
2444 
2445  // Create a new block containing the stride check.
2446  BB->setName("vector.scevcheck");
2447  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2448  // Update dominator tree immediately if the generated block is a
2449  // LoopBypassBlock because SCEV expansions to generate loop bypass
2450  // checks may query it before the current function is finished.
2451  DT->addNewBlock(NewBB, BB);
2452  if (L->getParentLoop())
2453  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2454  ReplaceInstWithInst(BB->getTerminator(),
2455  BranchInst::Create(Bypass, NewBB, SCEVCheck));
2456  LoopBypassBlocks.push_back(BB);
2457  AddedSafetyChecks = true;
2458 }
2459 
2460 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2461  // VPlan-native path does not do any analysis for runtime checks currently.
2462  if (EnableVPlanNativePath)
2463  return;
2464 
2465  BasicBlock *BB = L->getLoopPreheader();
2466 
2467  // Generate the code that checks in runtime if arrays overlap. We put the
2468  // checks into a separate block to make the more common case of few elements
2469  // faster.
2470  Instruction *FirstCheckInst;
2471  Instruction *MemRuntimeCheck;
2472  std::tie(FirstCheckInst, MemRuntimeCheck) =
2473  Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2474  if (!MemRuntimeCheck)
2475  return;
2476 
2477  // Create a new block containing the memory check.
2478  BB->setName("vector.memcheck");
2479  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2480  // Update dominator tree immediately if the generated block is a
2481  // LoopBypassBlock because SCEV expansions to generate loop bypass
2482  // checks may query it before the current function is finished.
2483  DT->addNewBlock(NewBB, BB);
2484  if (L->getParentLoop())
2485  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2486  ReplaceInstWithInst(BB->getTerminator(),
2487  BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2488  LoopBypassBlocks.push_back(BB);
2489  AddedSafetyChecks = true;
2490 
2491  // We currently don't use LoopVersioning for the actual loop cloning but we
2492  // still use it to add the noalias metadata.
2493  LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2494  PSE.getSE());
2495  LVer->prepareNoAliasMetadata();
2496 }
2497 
2498 Value *InnerLoopVectorizer::emitTransformedIndex(
2499  IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2500  const InductionDescriptor &ID) const {
2501 
2502  SCEVExpander Exp(*SE, DL, "induction");
2503  auto Step = ID.getStep();
2504  auto StartValue = ID.getStartValue();
2505  assert(Index->getType() == Step->getType() &&
2506  "Index type does not match StepValue type");
2507  switch (ID.getKind()) {
2508  case InductionDescriptor::IK_IntInduction: {
2509  assert(Index->getType() == StartValue->getType() &&
2510  "Index type does not match StartValue type");
2511 
2512  // FIXME: Theoretically, we can call getAddExpr() of ScalarEvolution
2513  // and calculate (Start + Index * Step) for all cases, without
2514  // special handling for "isOne" and "isMinusOne".
2515  // But in practice the resulting code is worse. We mix SCEV
2516  // expressions and ADD/SUB operations and end up with redundant
2517  // intermediate values being calculated in different ways, and
2518  // InstCombine is unable to reduce them all.
2519 
2520  if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2521  return B.CreateSub(StartValue, Index);
2522  if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne())
2523  return B.CreateAdd(StartValue, Index);
2524  const SCEV *S = SE->getAddExpr(SE->getSCEV(StartValue),
2525  SE->getMulExpr(Step, SE->getSCEV(Index)));
2526  return Exp.expandCodeFor(S, StartValue->getType(), &*B.GetInsertPoint());
2527  }
2528  case InductionDescriptor::IK_PtrInduction: {
2529  assert(isa<SCEVConstant>(Step) &&
2530  "Expected constant step for pointer induction");
2531  const SCEV *S = SE->getMulExpr(SE->getSCEV(Index), Step);
2532  Index = Exp.expandCodeFor(S, Index->getType(), &*B.GetInsertPoint());
2533  return B.CreateGEP(nullptr, StartValue, Index);
2534  }
2535  case InductionDescriptor::IK_FpInduction: {
2536  assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2537  auto InductionBinOp = ID.getInductionBinOp();
2538  assert(InductionBinOp &&
2539  (InductionBinOp->getOpcode() == Instruction::FAdd ||
2540  InductionBinOp->getOpcode() == Instruction::FSub) &&
2541  "Original bin op should be defined for FP induction");
2542 
2543  Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2544 
2545  // Floating point operations had to be 'fast' to enable the induction.
2546  FastMathFlags Flags;
2547  Flags.setFast();
2548 
2549  Value *MulExp = B.CreateFMul(StepValue, Index);
2550  if (isa<Instruction>(MulExp))
2551  // We have to check because MulExp may be a constant.
2552  cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2553 
2554  Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2555  "induction");
2556  if (isa<Instruction>(BOp))
2557  cast<Instruction>(BOp)->setFastMathFlags(Flags);
2558 
2559  return BOp;
2560  }
2561  case InductionDescriptor::IK_NoInduction:
2562  return nullptr;
2563  }
2564  llvm_unreachable("invalid enum");
2565 }
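// Worked example (not part of the original source): for an integer induction
// with start value 10 and step 3, the transformed index for Index = i is
// 10 + i * 3; the isOne/isMinusOne special cases above fold this into a single
// add or sub when the step is +1 or -1.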
2566 
2567 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2568  /*
2569  In this function we generate a new loop. The new loop will contain
2570  the vectorized instructions while the old loop will continue to run the
2571  scalar remainder.
2572 
2573  [ ] <-- loop iteration number check.
2574  / |
2575  / v
2576  | [ ] <-- vector loop bypass (may consist of multiple blocks).
2577  | / |
2578  | / v
2579  || [ ] <-- vector pre header.
2580  |/ |
2581  | v
2582  | [ ] \
2583  | [ ]_| <-- vector loop.
2584  | |
2585  | v
2586  | -[ ] <--- middle-block.
2587  | / |
2588  | / v
2589  -|- >[ ] <--- new preheader.
2590  | |
2591  | v
2592  | [ ] \
2593  | [ ]_| <-- old scalar loop to handle remainder.
2594  \ |
2595  \ v
2596  >[ ] <-- exit block.
2597  ...
2598  */
2599 
2600  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2601  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2602  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2603  assert(VectorPH && "Invalid loop structure");
2604  assert(ExitBlock && "Must have an exit block");
2605 
2606  // Some loops have a single integer induction variable, while other loops
2607  // don't. One example is c++ iterators that often have multiple pointer
2608  // induction variables. In the code below we also support a case where we
2609  // don't have a single induction variable.
2610  //
2611  // We try to obtain an induction variable from the original loop as hard
2612  // as possible. However if we don't find one that:
2613  // - is an integer
2614  // - counts from zero, stepping by one
2615  // - is the size of the widest induction variable type
2616  // then we create a new one.
2617  OldInduction = Legal->getPrimaryInduction();
2618  Type *IdxTy = Legal->getWidestInductionType();
2619 
2620  // Split the single block loop into the two loop structure described above.
2621  BasicBlock *VecBody =
2622  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2623  BasicBlock *MiddleBlock =
2624  VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2625  BasicBlock *ScalarPH =
2626  MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2627 
2628  // Create and register the new vector loop.
2629  Loop *Lp = LI->AllocateLoop();
2630  Loop *ParentLoop = OrigLoop->getParentLoop();
2631 
2632  // Insert the new loop into the loop nest and register the new basic blocks
2633  // before calling any utilities such as SCEV that require valid LoopInfo.
2634  if (ParentLoop) {
2635  ParentLoop->addChildLoop(Lp);
2636  ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2637  ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2638  } else {
2639  LI->addTopLevelLoop(Lp);
2640  }
2641  Lp->addBasicBlockToLoop(VecBody, *LI);
2642 
2643  // Find the loop boundaries.
2644  Value *Count = getOrCreateTripCount(Lp);
2645 
2646  Value *StartIdx = ConstantInt::get(IdxTy, 0);
2647 
2648  // Now, compare the new count to zero. If it is zero skip the vector loop and
2649  // jump to the scalar loop. This check also covers the case where the
2650  // backedge-taken count is uint##_max: adding one to it will overflow leading
2651  // to an incorrect trip count of zero. In this (rare) case we will also jump
2652  // to the scalar loop.
2653  emitMinimumIterationCountCheck(Lp, ScalarPH);
2654 
2655  // Generate the code to check any assumptions that we've made for SCEV
2656  // expressions.
2657  emitSCEVChecks(Lp, ScalarPH);
2658 
2659  // Generate the code that checks in runtime if arrays overlap. We put the
2660  // checks into a separate block to make the more common case of few elements
2661  // faster.
2662  emitMemRuntimeChecks(Lp, ScalarPH);
2663 
2664  // Generate the induction variable.
2665  // The loop step is equal to the vectorization factor (num of SIMD elements)
2666  // times the unroll factor (num of SIMD instructions).
2667  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2668  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2669  Induction =
2670  createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2671  getDebugLocFromInstOrOperands(OldInduction));
2672 
2673  // We are going to resume the execution of the scalar loop.
2674  // Go over all of the induction variables that we found and fix the
2675  // PHIs that are left in the scalar version of the loop.
2676  // The starting values of PHI nodes depend on the counter of the last
2677  // iteration in the vectorized loop.
2678  // If we come from a bypass edge then we need to start from the original
2679  // start value.
2680 
2681  // This variable saves the new starting index for the scalar loop. It is used
2682  // to test if there are any tail iterations left once the vector loop has
2683  // completed.
2684  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
2685  for (auto &InductionEntry : *List) {
2686  PHINode *OrigPhi = InductionEntry.first;
2687  InductionDescriptor II = InductionEntry.second;
2688 
2689  // Create phi nodes to merge from the backedge-taken check block.
2690  PHINode *BCResumeVal = PHINode::Create(
2691  OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
2692  // Copy original phi DL over to the new one.
2693  BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2694  Value *&EndValue = IVEndValues[OrigPhi];
2695  if (OrigPhi == OldInduction) {
2696  // We know what the end value is.
2697  EndValue = CountRoundDown;
2698  } else {
2699  IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());
2700  Type *StepType = II.getStep()->getType();
2701  Instruction::CastOps CastOp =
2702  CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
2703  Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
2704  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2705  EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
2706  EndValue->setName("ind.end");
2707  }
2708 
2709  // The new PHI merges the original incoming value, in case of a bypass,
2710  // or the value at the end of the vectorized loop.
2711  BCResumeVal->addIncoming(EndValue, MiddleBlock);
2712 
2713  // Fix the scalar body counter (PHI node).
2714  unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
2715 
2716  // The old induction's phi node in the scalar body needs the truncated
2717  // value.
2718  for (BasicBlock *BB : LoopBypassBlocks)
2719  BCResumeVal->addIncoming(II.getStartValue(), BB);
2720  OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
2721  }
2722 
2723  // Add a check in the middle block to see if we have completed
2724  // all of the iterations in the first vector loop.
2725  // If (N - N%VF) == N, then we *don't* need to run the remainder.
2726  Value *CmpN =
2727  CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
2728  CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
2729  ReplaceInstWithInst(MiddleBlock->getTerminator(),
2730  BranchInst::Create(ExitBlock, ScalarPH, CmpN));
2731 
2732  // Get ready to start creating new instructions into the vectorized body.
2733  Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
2734 
2735  // Save the state.
2736  LoopVectorPreHeader = Lp->getLoopPreheader();
2737  LoopScalarPreHeader = ScalarPH;
2738  LoopMiddleBlock = MiddleBlock;
2739  LoopExitBlock = ExitBlock;
2740  LoopVectorBody = VecBody;
2741  LoopScalarBody = OldBasicBlock;
2742 
2743  // Keep all loop hints from the original loop on the vector loop (we'll
2744  // replace the vectorizer-specific hints below).
2745  if (MDNode *LID = OrigLoop->getLoopID())
2746  Lp->setLoopID(LID);
2747 
2748  LoopVectorizeHints Hints(Lp, true, *ORE);
2749  Hints.setAlreadyVectorized();
2750 
2751  return LoopVectorPreHeader;
2752 }
2753 
2754 // Fix up external users of the induction variable. At this point, we are
2755 // in LCSSA form, with all external PHIs that use the IV having one input value,
2756 // coming from the remainder loop. We need those PHIs to also have a correct
2757 // value for the IV when arriving directly from the middle block.
2758 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
2759  const InductionDescriptor &II,
2760  Value *CountRoundDown, Value *EndValue,
2761  BasicBlock *MiddleBlock) {
2762  // There are two kinds of external IV usages - those that use the value
2763  // computed in the last iteration (the PHI) and those that use the penultimate
2764  // value (the value that feeds into the phi from the loop latch).
2765  // We allow both, but they, obviously, have different values.
2766 
2767  assert(OrigLoop->getExitBlock() && "Expected a single exit block");
2768 
2769  DenseMap<Value *, Value *> MissingVals;
2770 
2771  // An external user of the last iteration's value should see the value that
2772  // the remainder loop uses to initialize its own IV.
2773  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
2774  for (User *U : PostInc->users()) {
2775  Instruction *UI = cast<Instruction>(U);
2776  if (!OrigLoop->contains(UI)) {
2777  assert(isa<PHINode>(UI) && "Expected LCSSA form");
2778  MissingVals[UI] = EndValue;
2779  }
2780  }
2781 
2782  // An external user of the penultimate value needs to see EndValue - Step.
2783  // The simplest way to get this is to recompute it from the constituent SCEVs,
2784  // that is Start + (Step * (CRD - 1)).
2785  for (User *U : OrigPhi->users()) {
2786  auto *UI = cast<Instruction>(U);
2787  if (!OrigLoop->contains(UI)) {
2788  const DataLayout &DL =
2789  OrigLoop->getHeader()->getModule()->getDataLayout();
2790  assert(isa<PHINode>(UI) && "Expected LCSSA form");
2791 
2792  IRBuilder<> B(MiddleBlock->getTerminator());
2793  Value *CountMinusOne = B.CreateSub(
2794  CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
2795  Value *CMO =
2796  !II.getStep()->getType()->isIntegerTy()
2797  ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
2798  II.getStep()->getType())
2799  : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
2800  CMO->setName("cast.cmo");
2801  Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
2802  Escape->setName("ind.escape");
2803  MissingVals[UI] = Escape;
2804  }
2805  }
2806 
2807  for (auto &I : MissingVals) {
2808  PHINode *PHI = cast<PHINode>(I.first);
2809  // One corner case we have to handle is two IVs "chasing" each other,
2810  // that is %IV2 = phi [...], [ %IV1, %latch ]
2811  // In this case, if IV1 has an external use, we need to avoid adding both
2812  // "last value of IV1" and "penultimate value of IV2". So, verify that we
2813  // don't already have an incoming value for the middle block.
2814  if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
2815  PHI->addIncoming(I.second, MiddleBlock);
2816  }
2817 }
2818 
2819 namespace {
2820 
2821 struct CSEDenseMapInfo {
2822  static bool canHandle(const Instruction *I) {
2823  return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2824  isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2825  }
2826 
2827  static inline Instruction *getEmptyKey() {
2828  return DenseMapInfo<Instruction *>::getEmptyKey();
2829  }
2830 
2831  static inline Instruction *getTombstoneKey() {
2832  return DenseMapInfo<Instruction *>::getTombstoneKey();
2833  }
2834 
2835  static unsigned getHashValue(const Instruction *I) {
2836  assert(canHandle(I) && "Unknown instruction!");
2837  return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2838  I->value_op_end()));
2839  }
2840 
2841  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2842  if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2843  LHS == getTombstoneKey() || RHS == getTombstoneKey())
2844  return LHS == RHS;
2845  return LHS->isIdenticalTo(RHS);
2846  }
2847 };
2848 
2849 } // end anonymous namespace
2850 
2851 /// Perform CSE of induction variable instructions.
2852 static void cse(BasicBlock *BB) {
2853  // Perform simple cse.
2854  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2855  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
2856  Instruction *In = &*I++;
2857 
2858  if (!CSEDenseMapInfo::canHandle(In))
2859  continue;
2860 
2861  // Check if we can replace this instruction with any of the
2862  // visited instructions.
2863  if (Instruction *V = CSEMap.lookup(In)) {
2864  In->replaceAllUsesWith(V);
2865  In->eraseFromParent();
2866  continue;
2867  }
2868 
2869  CSEMap[In] = In;
2870  }
2871 }
2872 
2873 /// Estimate the overhead of scalarizing an instruction. This is a
2874 /// convenience wrapper for the type-based getScalarizationOverhead API.
2875 static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
2876  const TargetTransformInfo &TTI) {
2877  if (VF == 1)
2878  return 0;
2879 
2880  unsigned Cost = 0;
2881  Type *RetTy = ToVectorTy(I->getType(), VF);
2882  if (!RetTy->isVoidTy() &&
2883  (!isa<LoadInst>(I) ||
2884  !TTI.supportsEfficientVectorElementLoadStore()))
2885  Cost += TTI.getScalarizationOverhead(RetTy, true, false);
2886 
2887  if (CallInst *CI = dyn_cast<CallInst>(I)) {
2888  SmallVector<const Value *, 4> Operands(CI->arg_operands());
2889  Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
2890  }
2891  else if (!isa<StoreInst>(I) ||
2892  !TTI.supportsEfficientVectorElementLoadStore()) {
2893  SmallVector<const Value *, 4> Operands(I->operand_values());
2894  Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
2895  }
2896 
2897  return Cost;
2898 }
2899 
2900 // Estimate cost of a call instruction CI if it were vectorized with factor VF.
2901 // Return the cost of the instruction, including scalarization overhead if it's
2902 // needed. The flag NeedToScalarize shows if the call needs to be scalarized -
2903 // i.e. either a vector version isn't available or it is too expensive.
2904 static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
2905  const TargetTransformInfo &TTI,
2906  const TargetLibraryInfo *TLI,
2907  bool &NeedToScalarize) {
2908  Function *F = CI->getCalledFunction();
2909  StringRef FnName = CI->getCalledFunction()->getName();
2910  Type *ScalarRetTy = CI->getType();
2911  SmallVector<Type *, 4> Tys, ScalarTys;
2912  for (auto &ArgOp : CI->arg_operands())
2913  ScalarTys.push_back(ArgOp->getType());
2914 
2915  // Estimate cost of scalarized vector call. The source operands are assumed
2916  // to be vectors, so we need to extract individual elements from there,
2917  // execute VF scalar calls, and then gather the result into the vector return
2918  // value.
2919  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
2920  if (VF == 1)
2921  return ScalarCallCost;
2922 
2923  // Compute corresponding vector type for return value and arguments.
2924  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
2925  for (Type *ScalarTy : ScalarTys)
2926  Tys.push_back(ToVectorTy(ScalarTy, VF));
2927 
2928  // Compute costs of unpacking argument values for the scalar calls and
2929  // packing the return values to a vector.
2930  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
2931 
2932  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
2933 
2934  // If we can't emit a vector call for this function, then the currently found
2935  // cost is the cost we need to return.
2936  NeedToScalarize = true;
2937  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
2938  return Cost;
2939 
2940  // If the corresponding vector cost is cheaper, return its cost.
2941  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
2942  if (VectorCallCost < Cost) {
2943  NeedToScalarize = false;
2944  return VectorCallCost;
2945  }
2946  return Cost;
2947 }
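 // Illustrative cost comparison, with hypothetical numbers rather than real
 // target data: for VF = 4, a scalar call cost of 10 and a scalarization
 // overhead of 6, the scalarized estimate is 10 * 4 + 6 = 46. If TLI knows a
 // vectorized variant whose call cost is, say, 18, the function returns 18 and
 // clears NeedToScalarize; otherwise it returns 46 with NeedToScalarize set.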
2948 
2949 // Estimate cost of an intrinsic call instruction CI if it were vectorized with
2950 // factor VF. Return the cost of the instruction, including scalarization
2951 // overhead if it's needed.
2952 static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
2953  const TargetTransformInfo &TTI,
2954  const TargetLibraryInfo *TLI) {
2955  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2956  assert(ID && "Expected intrinsic call!");
2957 
2958  FastMathFlags FMF;
2959  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2960  FMF = FPMO->getFastMathFlags();
2961 
2962  SmallVector<Value *, 4> Operands(CI->arg_operands());
2963  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
2964 }
2965 
2966 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
2967  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
2968  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
2969  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
2970 }
2971 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
2972  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
2973  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
2974  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
2975 }
2976 
2977 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
2978  // For every instruction `I` in MinBWs, truncate the operands, create a
2979  // truncated version of `I` and reextend its result. InstCombine runs
2980  // later and will remove any ext/trunc pairs.
2981  SmallPtrSet<Value *, 4> Erased;
2982  for (const auto &KV : Cost->getMinimalBitwidths()) {
2983  // If the value wasn't vectorized, we must maintain the original scalar
2984  // type. The absence of the value from VectorLoopValueMap indicates that it
2985  // wasn't vectorized.
2986  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
2987  continue;
2988  for (unsigned Part = 0; Part < UF; ++Part) {
2989  Value *I = getOrCreateVectorValue(KV.first, Part);
2990  if (Erased.find(I) != Erased.end() || I->use_empty() ||
2991  !isa<Instruction>(I))
2992  continue;
2993  Type *OriginalTy = I->getType();
2994  Type *ScalarTruncatedTy =
2995  IntegerType::get(OriginalTy->getContext(), KV.second);
2996  Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
2997  OriginalTy->getVectorNumElements());
2998  if (TruncatedTy == OriginalTy)
2999  continue;
3000 
3001  IRBuilder<> B(cast<Instruction>(I));
3002  auto ShrinkOperand = [&](Value *V) -> Value * {
3003  if (auto *ZI = dyn_cast<ZExtInst>(V))
3004  if (ZI->getSrcTy() == TruncatedTy)
3005  return ZI->getOperand(0);
3006  return B.CreateZExtOrTrunc(V, TruncatedTy);
3007  };
3008 
3009  // The actual instruction modification depends on the instruction type,
3010  // unfortunately.
3011  Value *NewI = nullptr;
3012  if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3013  NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3014  ShrinkOperand(BO->getOperand(1)));
3015 
3016  // Any wrapping introduced by shrinking this operation shouldn't be
3017  // considered undefined behavior. So, we can't unconditionally copy
3018  // arithmetic wrapping flags to NewI.
3019  cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3020  } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3021  NewI =
3022  B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3023  ShrinkOperand(CI->getOperand(1)));
3024  } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3025  NewI = B.CreateSelect(SI->getCondition(),
3026  ShrinkOperand(SI->getTrueValue()),
3027  ShrinkOperand(SI->getFalseValue()));
3028  } else if (auto *CI = dyn_cast<CastInst>(I)) {
3029  switch (CI->getOpcode()) {
3030  default:
3031  llvm_unreachable("Unhandled cast!");
3032  case Instruction::Trunc:
3033  NewI = ShrinkOperand(CI->getOperand(0));
3034  break;
3035  case Instruction::SExt:
3036  NewI = B.CreateSExtOrTrunc(
3037  CI->getOperand(0),
3038  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3039  break;
3040  case Instruction::ZExt:
3041  NewI = B.CreateZExtOrTrunc(
3042  CI->getOperand(0),
3043  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3044  break;
3045  }
3046  } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3047  auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3048  auto *O0 = B.CreateZExtOrTrunc(
3049  SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3050  auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3051  auto *O1 = B.CreateZExtOrTrunc(
3052  SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3053 
3054  NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3055  } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3056  // Don't do anything with the operands, just extend the result.
3057  continue;
3058  } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3059  auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3060  auto *O0 = B.CreateZExtOrTrunc(
3061  IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3062  auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3063  NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3064  } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3065  auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3066  auto *O0 = B.CreateZExtOrTrunc(
3067  EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3068  NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3069  } else {
3070  // If we don't know what to do, be conservative and don't do anything.
3071  continue;
3072  }
3073 
3074  // Lastly, extend the result.
3075  NewI->takeName(cast<Instruction>(I));
3076  Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3077  I->replaceAllUsesWith(Res);
3078  cast<Instruction>(I)->eraseFromParent();
3079  Erased.insert(I);
3080  VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3081  }
3082  }
3083 
3084  // We'll have created a bunch of ZExts that are now parentless. Clean up.
3085  for (const auto &KV : Cost->getMinimalBitwidths()) {
3086  // If the value wasn't vectorized, we must maintain the original scalar
3087  // type. The absence of the value from VectorLoopValueMap indicates that it
3088  // wasn't vectorized.
3089  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3090  continue;
3091  for (unsigned Part = 0; Part < UF; ++Part) {
3092  Value *I = getOrCreateVectorValue(KV.first, Part);
3093  ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3094  if (Inst && Inst->use_empty()) {
3095  Value *NewI = Inst->getOperand(0);
3096  Inst->eraseFromParent();
3097  VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3098  }
3099  }
3100  }
3101 }
3102 
3103 void InnerLoopVectorizer::fixVectorizedLoop() {
3104  // Insert truncates and extends for any truncated instructions as hints to
3105  // InstCombine.
3106  if (VF > 1)
3107  truncateToMinimalBitwidths();
3108 
3109  // Fix widened non-induction PHIs by setting up the PHI operands.
3110  if (OrigPHIsToFix.size()) {
3111  assert(EnableVPlanNativePath &&
3112  "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3113  fixNonInductionPHIs();
3114  }
3115 
3116  // At this point every instruction in the original loop is widened to a
3117  // vector form. Now we need to fix the recurrences in the loop. These PHI
3118  // nodes are currently empty because we did not want to introduce cycles.
3119  // This is the second stage of vectorizing recurrences.
3120  fixCrossIterationPHIs();
3121 
3122  // Update the dominator tree.
3123  //
3124  // FIXME: After creating the structure of the new loop, the dominator tree is
3125  // no longer up-to-date, and it remains that way until we update it
3126  // here. An out-of-date dominator tree is problematic for SCEV,
3127  // because SCEVExpander uses it to guide code generation. The
3128  // vectorizer uses SCEVExpander in several places. Instead, we should
3129  // keep the dominator tree up-to-date as we go.
3130  updateAnalysis();
3131 
3132  // Fix-up external users of the induction variables.
3133  for (auto &Entry : *Legal->getInductionVars())
3134  fixupIVUsers(Entry.first, Entry.second,
3135  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3136  IVEndValues[Entry.first], LoopMiddleBlock);
3137 
3138  fixLCSSAPHIs();
3139  for (Instruction *PI : PredicatedInstructions)
3140  sinkScalarOperands(&*PI);
3141 
3142  // Remove redundant induction instructions.
3143  cse(LoopVectorBody);
3144 }
3145 
3146 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3147  // In order to support recurrences we need to be able to vectorize Phi nodes.
3148  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3149  // stage #2: We now need to fix the recurrences by adding incoming edges to
3150  // the currently empty PHI nodes. At this point every instruction in the
3151  // original loop is widened to a vector form so we can use them to construct
3152  // the incoming edges.
3153  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3154  // Handle first-order recurrences and reductions that need to be fixed.
3155  if (Legal->isFirstOrderRecurrence(&Phi))
3156  fixFirstOrderRecurrence(&Phi);
3157  else if (Legal->isReductionVariable(&Phi))
3158  fixReduction(&Phi);
3159  }
3160 }
3161 
3162 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3163  // This is the second phase of vectorizing first-order recurrences. An
3164  // overview of the transformation is described below. Suppose we have the
3165  // following loop.
3166  //
3167  // for (int i = 0; i < n; ++i)
3168  // b[i] = a[i] - a[i - 1];
3169  //
3170  // There is a first-order recurrence on "a". For this loop, the shorthand
3171  // scalar IR looks like:
3172  //
3173  // scalar.ph:
3174  // s_init = a[-1]
3175  // br scalar.body
3176  //
3177  // scalar.body:
3178  // i = phi [0, scalar.ph], [i+1, scalar.body]
3179  // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3180  // s2 = a[i]
3181  // b[i] = s2 - s1
3182  // br cond, scalar.body, ...
3183  //
3184  // In this example, s1 is a recurrence because its value depends on the
3185  // previous iteration. In the first phase of vectorization, we created a
3186  // temporary value for s1. We now complete the vectorization and produce the
3187  // shorthand vector IR shown below (for VF = 4, UF = 1).
3188  //
3189  // vector.ph:
3190  // v_init = vector(..., ..., ..., a[-1])
3191  // br vector.body
3192  //
3193  // vector.body
3194  // i = phi [0, vector.ph], [i+4, vector.body]
3195  // v1 = phi [v_init, vector.ph], [v2, vector.body]
3196  // v2 = a[i, i+1, i+2, i+3];
3197  // v3 = vector(v1(3), v2(0, 1, 2))
3198  // b[i, i+1, i+2, i+3] = v2 - v3
3199  // br cond, vector.body, middle.block
3200  //
3201  // middle.block:
3202  // x = v2(3)
3203  // br scalar.ph
3204  //
3205  // scalar.ph:
3206  // s_init = phi [x, middle.block], [a[-1], otherwise]
3207  // br scalar.body
3208  //
3209  // After the vector loop completes execution, we extract the next value of
3210  // the recurrence (x) to use as the initial value in the scalar loop.
3211 
3212  // Get the original loop preheader and single loop latch.
3213  auto *Preheader = OrigLoop->getLoopPreheader();
3214  auto *Latch = OrigLoop->getLoopLatch();
3215 
3216  // Get the initial and previous values of the scalar recurrence.
3217  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3218  auto *Previous = Phi->getIncomingValueForBlock(Latch);
3219 
3220  // Create a vector from the initial value.
3221  auto *VectorInit = ScalarInit;
3222  if (VF > 1) {
3223  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3224  VectorInit = Builder.CreateInsertElement(
3225  UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3226  Builder.getInt32(VF - 1), "vector.recur.init");
3227  }
3228 
3229  // We constructed a temporary phi node in the first phase of vectorization.
3230  // This phi node will eventually be deleted.
3231  Builder.SetInsertPoint(
3232  cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3233 
3234  // Create a phi node for the new recurrence. The current value will either be
3235  // the initial value inserted into a vector or loop-varying vector value.
3236  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3237  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3238 
3239  // Get the vectorized previous value of the last part UF - 1. It appears last
3240  // among all unrolled iterations, due to the order of their construction.
3241  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3242 
3243  // Set the insertion point after the previous value if it is an instruction.
3244  // Note that the previous value may have been constant-folded so it is not
3245  // guaranteed to be an instruction in the vector loop. Also, if the previous
3246  // value is a phi node, we should insert after all the phi nodes to avoid
3247  // breaking basic block verification.
3248  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3249  isa<PHINode>(PreviousLastPart))
3250  Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3251  else
3252  Builder.SetInsertPoint(
3253  &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3254 
3255  // We will construct a vector for the recurrence by combining the values for
3256  // the current and previous iterations. This is the required shuffle mask.
3257  SmallVector<Constant *, 8> ShuffleMask(VF);
3258  ShuffleMask[0] = Builder.getInt32(VF - 1);
3259  for (unsigned I = 1; I < VF; ++I)
3260  ShuffleMask[I] = Builder.getInt32(I + VF - 1);
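 // Illustrative note, not part of the original source: with VF = 4 the mask
 // built above is <3, 4, 5, 6>, i.e. lane 0 takes the last element of the
 // first shuffle operand (the recurrence vector, v1 in the sketch above) and
 // lanes 1-3 take the first three elements of the second operand (the current
 // value, v2), matching the v3 = vector(v1(3), v2(0, 1, 2)) step.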
3261 
3262  // The vector from which to take the initial value for the current iteration
3263  // (actual or unrolled). Initially, this is the vector phi node.
3264  Value *Incoming = VecPhi;
3265 
3266  // Shuffle the current and previous vector and update the vector parts.
3267  for (unsigned Part = 0; Part < UF; ++Part) {
3268  Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3269  Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3270  auto *Shuffle =
3271  VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3272  ConstantVector::get(ShuffleMask))
3273  : Incoming;
3274  PhiPart->replaceAllUsesWith(Shuffle);
3275  cast<Instruction>(PhiPart)->eraseFromParent();
3276  VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3277  Incoming = PreviousPart;
3278  }
3279 
3280  // Fix the latch value of the new recurrence in the vector loop.
3281  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3282 
3283  // Extract the last vector element in the middle block. This will be the
3284  // initial value for the recurrence when jumping to the scalar loop.
3285  auto *ExtractForScalar = Incoming;
3286  if (VF > 1) {
3287  Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3288  ExtractForScalar = Builder.CreateExtractElement(
3289  ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3290  }
3291  // Extract the second last element in the middle block if the
3292  // Phi is used outside the loop. We need to extract the phi itself
3293  // and not the last element (the phi update in the current iteration). This
3294  // will be the value when jumping to the exit block from the LoopMiddleBlock,
3295  // when the scalar loop is not run at all.
3296  Value *ExtractForPhiUsedOutsideLoop = nullptr;
3297  if (VF > 1)
3298  ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3299  Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3300  // When the loop is unrolled without vectorizing, initialize
3301  // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
3302  // `Incoming`. This is analogous to the vectorized case above: extracting the
3303  // second last element when VF > 1.
3304  else if (UF > 1)
3305  ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3306 
3307  // Fix the initial value of the original recurrence in the scalar loop.
3308  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3309  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3310  for (auto *BB : predecessors(LoopScalarPreHeader)) {
3311  auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3312  Start->addIncoming(Incoming, BB);
3313  }
3314 
3315  Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start);
3316  Phi->setName("scalar.recur");
3317 
3318  // Finally, fix users of the recurrence outside the loop. The users will need
3319  // either the last value of the scalar recurrence or the last value of the
3320  // vector recurrence we extracted in the middle block. Since the loop is in
3321  // LCSSA form, we just need to find all the phi nodes for the original scalar
3322  // recurrence in the exit block, and then add an edge for the middle block.
3323  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3324  if (LCSSAPhi.getIncomingValue(0) == Phi) {
3325  LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3326  }
3327  }
3328 }
3329 
3330 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3331  Constant *Zero = Builder.getInt32(0);
3332 
3333  // Get its reduction variable descriptor.
3334  assert(Legal->isReductionVariable(Phi) &&
3335  "Unable to find the reduction variable");
3336  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3337 
3338  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3339  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3340  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3341  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3342  RdxDesc.getMinMaxRecurrenceKind();
3343  setDebugLocFromInst(Builder, ReductionStartValue);
3344 
3345  // We need to generate a reduction vector from the incoming scalar.
3346  // To do so, we need to generate the 'identity' vector and override
3347  // one of the elements with the incoming scalar reduction. We need
3348  // to do it in the vector-loop preheader.
3349  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3350 
3351  // This is the vector-clone of the value that leaves the loop.
3352  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3353 
3354  // Find the reduction identity variable. Zero for addition, or, xor,
3355  // one for multiplication, -1 for And.
3356  Value *Identity;
3357  Value *VectorStart;
3358  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3359  RK == RecurrenceDescriptor::RK_FloatMinMax) {
3360  // MinMax reductions have the start value as their identity.
3361  if (VF == 1) {
3362  VectorStart = Identity = ReductionStartValue;
3363  } else {
3364  VectorStart = Identity =
3365  Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3366  }
3367  } else {
3368  // Handle other reduction kinds:
3369  Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3370  RK, VecTy->getScalarType());
3371  if (VF == 1) {
3372  Identity = Iden;
3373  // This vector is the Identity vector where the first element is the
3374  // incoming scalar reduction.
3375  VectorStart = ReductionStartValue;
3376  } else {
3377  Identity = ConstantVector::getSplat(VF, Iden);
3378 
3379  // This vector is the Identity vector where the first element is the
3380  // incoming scalar reduction.
3381  VectorStart =
3382  Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3383  }
3384  }
3385 
3386  // Fix the vector-loop phi.
3387 
3388  // Reductions do not have to start at zero. They can start with
3389  // any loop invariant values.
3390  BasicBlock *Latch = OrigLoop->getLoopLatch();
3391  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3392  for (unsigned Part = 0; Part < UF; ++Part) {
3393  Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3394  Value *Val = getOrCreateVectorValue(LoopVal, Part);
3395  // Make sure to add the reduction start value only to the
3396  // first unroll part.
3397  Value *StartVal = (Part == 0) ? VectorStart : Identity;
3398  cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3399  cast<PHINode>(VecRdxPhi)
3400  ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3401  }
3402 
3403  // Before each round, move the insertion point right between
3404  // the PHIs and the values we are going to write.
3405  // This allows us to write both PHINodes and the extractelement
3406  // instructions.
3407  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3408 
3409  setDebugLocFromInst(Builder, LoopExitInst);
3410 
3411  // If the vector reduction can be performed in a smaller type, we truncate
3412  // then extend the loop exit value to enable InstCombine to evaluate the
3413  // entire expression in the smaller type.
3414  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3415  Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3416  Builder.SetInsertPoint(
3417  LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3418  VectorParts RdxParts(UF);
3419  for (unsigned Part = 0; Part < UF; ++Part) {
3420  RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3421  Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3422  Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3423  : Builder.CreateZExt(Trunc, VecTy);
3424  for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3425  UI != RdxParts[Part]->user_end();)
3426  if (*UI != Trunc) {
3427  (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3428  RdxParts[Part] = Extnd;
3429  } else {
3430  ++UI;
3431  }
3432  }
3433  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3434  for (unsigned Part = 0; Part < UF; ++Part) {
3435  RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3436  VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3437  }
3438  }
3439 
3440  // Reduce all of the unrolled parts into a single vector.
3441  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3442  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3443  setDebugLocFromInst(Builder, ReducedPartRdx);
3444  for (unsigned Part = 1; Part < UF; ++Part) {
3445  Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3446  if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3447  // Floating point operations had to be 'fast' to enable the reduction.
3448  ReducedPartRdx = addFastMathFlag(
3449  Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3450  ReducedPartRdx, "bin.rdx"));
3451  else
3452  ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3453  RdxPart);
3454  }
3455 
3456  if (VF > 1) {
3457  bool NoNaN = Legal->hasFunNoNaNAttr();
3458  ReducedPartRdx =
3459  createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3460  // If the reduction can be performed in a smaller type, we need to extend
3461  // the reduction to the wider type before we branch to the original loop.
3462  if (Phi->getType() != RdxDesc.getRecurrenceType())
3463  ReducedPartRdx =
3464  RdxDesc.isSigned()
3465  ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3466  : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3467  }
3468 
3469  // Create a phi node that merges control-flow from the backedge-taken check
3470  // block and the middle block.
3471  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3472  LoopScalarPreHeader->getTerminator());
3473  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3474  BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3475  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3476 
3477  // Now, we need to fix the users of the reduction variable
3478  // inside and outside of the scalar remainder loop.
3479  // We know that the loop is in LCSSA form. We need to update the
3480  // PHI nodes in the exit blocks.
3481  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3482  // All PHINodes need to have a single entry edge, or two if
3483  // we already fixed them.
3484  assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3485 
3486  // We found a reduction value exit-PHI. Update it with the
3487  // incoming bypass edge.
3488  if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3489  LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3490  } // end of the LCSSA phi scan.
3491 
3492  // Fix the scalar loop reduction variable with the incoming reduction sum
3493  // from the vector body and from the backedge value.
3494  int IncomingEdgeBlockIdx =
3495  Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3496  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3497  // Pick the other block.
3498  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3499  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3500  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3501 }
3502 
3503 void InnerLoopVectorizer::fixLCSSAPHIs() {
3504  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3505  if (LCSSAPhi.getNumIncomingValues() == 1) {
3506  auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3507  // Non-instruction incoming values will have only one value.
3508  unsigned LastLane = 0;
3509  if (isa<Instruction>(IncomingValue))
3510  LastLane = Cost->isUniformAfterVectorization(
3511  cast<Instruction>(IncomingValue), VF)
3512  ? 0
3513  : VF - 1;
3514  // Can be a loop invariant incoming value or the last scalar value to be
3515  // extracted from the vectorized loop.
3516  Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3517  Value *lastIncomingValue =
3518  getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3519  LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3520  }
3521  }
3522 }
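 // Illustrative note, not part of the original source: for an incoming value
 // that is uniform after vectorization, lane 0 of the last unrolled part is
 // used above; otherwise lane VF - 1 is used, i.e. the value produced by the
 // last lane of the last part.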
3523 
3524 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3525  // The basic block and loop containing the predicated instruction.
3526  auto *PredBB = PredInst->getParent();
3527  auto *VectorLoop = LI->getLoopFor(PredBB);
3528 
3529  // Initialize a worklist with the operands of the predicated instruction.
3530  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3531 
3532  // Holds instructions that we need to analyze again. An instruction may be
3533  // reanalyzed if we don't yet know if we can sink it or not.
3534  SmallVector<Instruction *, 8> InstsToReanalyze;
3535 
3536  // Returns true if a given use occurs in the predicated block. Phi nodes use
3537  // their operands in their corresponding predecessor blocks.
3538  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3539  auto *I = cast<Instruction>(U.getUser());
3540  BasicBlock *BB = I->getParent();
3541  if (auto *Phi = dyn_cast<PHINode>(I))
3542  BB = Phi->getIncomingBlock(
3543  PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3544  return BB == PredBB;
3545  };
3546 
3547  // Iteratively sink the scalarized operands of the predicated instruction
3548  // into the block we created for it. When an instruction is sunk, its
3549  // operands are then added to the worklist. The algorithm ends after one pass
3550  // through the worklist doesn't sink a single instruction.
3551  bool Changed;
3552  do {
3553  // Add the instructions that need to be reanalyzed to the worklist, and
3554  // reset the changed indicator.
3555  Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3556  InstsToReanalyze.clear();
3557  Changed = false;
3558 
3559  while (!Worklist.empty()) {
3560  auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3561 
3562  // We can't sink an instruction if it is a phi node, is already in the
3563  // predicated block, is not in the loop, or may have side effects.
3564  if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3565  !VectorLoop->contains(I) || I->mayHaveSideEffects())
3566  continue;
3567 
3568  // It's legal to sink the instruction if all its uses occur in the
3569  // predicated block. Otherwise, there's nothing to do yet, and we may
3570  // need to reanalyze the instruction.
3571  if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3572  InstsToReanalyze.push_back(I);
3573  continue;
3574  }
3575 
3576  // Move the instruction to the beginning of the predicated block, and add
3577  // its operands to the worklist.
3578  I->moveBefore(&*PredBB->getFirstInsertionPt());
3579  Worklist.insert(I->op_begin(), I->op_end());
3580 
3581  // The sinking may have enabled other instructions to be sunk, so we will
3582  // need to iterate.
3583  Changed = true;
3584  }
3585  } while (Changed);
3586 }
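 // Illustrative note, not part of the original source: the sinking runs to a
 // fixed point because an instruction that cannot be sunk on one pass (some
 // use is still outside the predicated block) may become sinkable once that
 // user is itself sunk; such instructions are re-queued via InstsToReanalyze,
 // and the do/while loop stops after a pass that sinks nothing.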
3587 
3588 void InnerLoopVectorizer::fixNonInductionPHIs() {
3589  for (PHINode *OrigPhi : OrigPHIsToFix) {
3590  PHINode *NewPhi =
3591  cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3592  unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3593 
3594  SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3595  predecessors(OrigPhi->getParent()));
3596  SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3597  predecessors(NewPhi->getParent()));
3598  assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3599  "Scalar and Vector BB should have the same number of predecessors");
3600 
3601  // The insertion point in Builder may be invalidated by the time we get
3602  // here. Force the Builder insertion point to something valid so that we do
3603  // not run into issues during insertion point restore in
3604  // getOrCreateVectorValue calls below.
3605  Builder.SetInsertPoint(NewPhi);
3606 
3607  // The predecessor order is preserved and we can rely on mapping between
3608  // scalar and vector block predecessors.
3609  for (unsigned i = 0; i < NumIncomingValues; ++i) {
3610  BasicBlock *NewPredBB = VectorBBPredecessors[i];
3611 
3612  // When looking up the new scalar/vector values to fix up, use incoming
3613  // values from original phi.
3614  Value *ScIncV =
3615  OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3616 
3617  // Scalar incoming value may need a broadcast
3618  Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3619  NewPhi->addIncoming(NewIncV, NewPredBB);
3620  }
3621  }
3622 }
3623 
3624 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
3625  unsigned VF) {
3626  PHINode *P = cast<PHINode>(PN);
3627  if (EnableVPlanNativePath) {
3628  // Currently we enter here in the VPlan-native path for non-induction
3629  // PHIs where all control flow is uniform. We simply widen these PHIs.
3630  // Create a vector phi with no operands - the vector phi operands will be
3631  // set at the end of vector code generation.
3632  Type *VecTy =
3633  (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3634  Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3635  VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3636  OrigPHIsToFix.push_back(P);
3637 
3638  return;
3639  }
3640 
3641  assert(PN->getParent() == OrigLoop->getHeader() &&
3642  "Non-header phis should have been handled elsewhere");
3643 
3644  // In order to support recurrences we need to be able to vectorize Phi nodes.
3645  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3646  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3647  // this value when we vectorize all of the instructions that use the PHI.
3648  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
3649  for (unsigned Part = 0; Part < UF; ++Part) {
3650  // This is phase one of vectorizing PHIs.
3651  Type *VecTy =
3652  (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3653  Value *EntryPart = PHINode::Create(
3654  VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3655  VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3656  }
3657  return;
3658  }
3659 
3661 
3662  // This PHINode must be an induction variable.
3663  // Make sure that we know about it.
3664  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
3665 
3666  InductionDescriptor II = Legal->getInductionVars()->lookup(P);
3667  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3668 
3669  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
3670  // which can be found from the original scalar operations.
3671  switch (II.getKind()) {
3672  case InductionDescriptor::IK_NoInduction:
3673  llvm_unreachable("Unknown induction");
3674  case InductionDescriptor::IK_IntInduction:
3675  case InductionDescriptor::IK_FpInduction:
3676  llvm_unreachable("Integer/fp induction is handled elsewhere.");
3677  case InductionDescriptor::IK_PtrInduction: {
3678  // Handle the pointer induction variable case.
3679  assert(P->getType()->isPointerTy() && "Unexpected type.");
3680  // This is the normalized GEP that starts counting at zero.
3681  Value *PtrInd = Induction;
3682  PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
3683  // Determine the number of scalars we need to generate for each unroll
3684  // iteration. If the instruction is uniform, we only need to generate the
3685  // first lane. Otherwise, we generate all VF values.
3686  unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
3687  // These are the scalar results. Notice that we don't generate vector GEPs
3688  // because scalar GEPs result in better code.
3689  for (unsigned Part = 0; Part < UF; ++Part) {
3690  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
3691  Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
3692  Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
3693  Value *SclrGep =
3694  emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
3695  SclrGep->setName("next.gep");
3696  VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
3697  }
3698  }
3699  return;
3700  }
3701  }
3702 }
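 // Illustrative note, not part of the original source: for a pointer induction
 // with VF = 4 and UF = 2, the loop above emits up to eight scalar "next.gep"
 // values at offsets Lane + Part * VF = 0..3 (part 0) and 4..7 (part 1), or
 // just the lane-0 value per part when the PHI is uniform after vectorization,
 // instead of a single wide vector GEP.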
3703 
3704 /// A helper function for checking whether an integer division-related
3705 /// instruction may divide by zero (in which case it must be predicated if
3706 /// executed conditionally in the scalar code).
3707 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
3708 /// Non-zero divisors that are non compile-time constants will not be
3709 /// converted into multiplication, so we will still end up scalarizing
3710 /// the division, but can do so w/o predication.
3712  assert((I.getOpcode() == Instruction::UDiv ||
3713  I.getOpcode() == Instruction::SDiv ||
3714  I.getOpcode() == Instruction::URem ||
3715  I.getOpcode() == Instruction::SRem) &&
3716  "Unexpected instruction");
3717  Value *Divisor = I.getOperand(1);
3718  auto *CInt = dyn_cast<ConstantInt>(Divisor);
3719  return !CInt || CInt->isZero();
3720 }
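 // Illustrative note, not part of the original source: for "udiv i32 %x, %n"
 // the divisor is not a ConstantInt, so this conservatively returns true; for
 // "udiv i32 %x, 7" it returns false and no predication is needed on that
 // account.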
3721 
3722 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
3723  switch (I.getOpcode()) {
3724  case Instruction::Br:
3725  case Instruction::PHI:
3726  llvm_unreachable("This instruction is handled by a different recipe.");
3727  case Instruction::GetElementPtr: {
3728  // Construct a vector GEP by widening the operands of the scalar GEP as
3729  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
3730  // results in a vector of pointers when at least one operand of the GEP
3731  // is vector-typed. Thus, to keep the representation compact, we only use
3732  // vector-typed operands for loop-varying values.
3733  auto *GEP = cast<GetElementPtrInst>(&I);
3734 
3735  if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
3736  // If we are vectorizing, but the GEP has only loop-invariant operands,
3737  // the GEP we build (by only using vector-typed operands for
3738  // loop-varying values) would be a scalar pointer. Thus, to ensure we
3739  // produce a vector of pointers, we need to either arbitrarily pick an
3740  // operand to broadcast, or broadcast a clone of the original GEP.
3741  // Here, we broadcast a clone of the original.
3742  //
3743  // TODO: If at some point we decide to scalarize instructions having
3744  // loop-invariant operands, this special case will no longer be
3745  // required. We would add the scalarization decision to
3746  // collectLoopScalars() and teach getVectorValue() to broadcast
3747  // the lane-zero scalar value.
3748  auto *Clone = Builder.Insert(GEP->clone());
3749  for (unsigned Part = 0; Part < UF; ++Part) {
3750  Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
3751  VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
3752  addMetadata(EntryPart, GEP);
3753  }
3754  } else {
3755  // If the GEP has at least one loop-varying operand, we are sure to
3756  // produce a vector of pointers. But if we are only unrolling, we want
3757  // to produce a scalar GEP for each unroll part. Thus, the GEP we
3758  // produce with the code below will be scalar (if VF == 1) or vector
3759  // (otherwise). Note that for the unroll-only case, we still maintain
3760  // values in the vector mapping with initVector, as we do for other
3761  // instructions.
3762  for (unsigned Part = 0; Part < UF; ++Part) {
3763  // The pointer operand of the new GEP. If it's loop-invariant, we
3764  // won't broadcast it.
3765  auto *Ptr =
3766  OrigLoop->isLoopInvariant(GEP->getPointerOperand())
3767  ? GEP->getPointerOperand()
3768  : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
3769 
3770  // Collect all the indices for the new GEP. If any index is
3771  // loop-invariant, we won't broadcast it.
3772  SmallVector<Value *, 4> Indices;
3773  for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
3774  if (OrigLoop->isLoopInvariant(U.get()))
3775  Indices.push_back(U.get());
3776  else
3777  Indices.push_back(getOrCreateVectorValue(U.get(), Part));
3778  }
3779 
3780  // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
3781  // but it should be a vector, otherwise.
3782  auto *NewGEP = GEP->isInBounds()
3783  ? Builder.CreateInBoundsGEP(Ptr, Indices)
3784  : Builder.CreateGEP(Ptr, Indices);
3785  assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
3786  "NewGEP is not a pointer vector");
3787  VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
3788  addMetadata(NewGEP, GEP);
3789  }
3790  }
3791 
3792  break;
3793  }
3794  case Instruction::UDiv:
3795  case Instruction::SDiv:
3796  case Instruction::SRem:
3797  case Instruction::URem:
3798  case Instruction::Add:
3799  case Instruction::FAdd:
3800  case Instruction::Sub:
3801  case Instruction::FSub:
3802  case Instruction::Mul:
3803  case Instruction::FMul:
3804  case Instruction::FDiv:
3805  case Instruction::FRem:
3806  case Instruction::Shl:
3807  case Instruction::LShr:
3808  case Instruction::AShr:
3809  case Instruction::And:
3810  case Instruction::Or:
3811  case Instruction::Xor: {
3812  // Just widen binops.
3813  auto *BinOp = cast<BinaryOperator>(&I);
3814  setDebugLocFromInst(Builder, BinOp);
3815 
3816  for (unsigned Part = 0; Part < UF; ++Part) {
3817  Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part);
3818  Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part);
3819  Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
3820 
3821  if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
3822  VecOp->copyIRFlags(BinOp);
3823 
3824  // Use this vector value for all users of the original instruction.
3825  VectorLoopValueMap.setVectorValue(&I, Part, V);
3826  addMetadata(V, BinOp);
3827  }
3828 
3829  break;
3830  }
3831  case Instruction::Select: {
3832  // Widen selects.
3833  // If the selector is loop invariant we can create a select
3834  // instruction with a scalar condition. Otherwise, use vector-select.
3835  auto *SE = PSE.getSE();
3836  bool InvariantCond =
3837  SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
3838  setDebugLocFromInst(Builder, &I);
3839 
3840  // The condition can be loop invariant but still defined inside the
3841  // loop. This means that we can't just use the original 'cond' value.
3842  // We have to take the 'vectorized' value and pick the first lane.
3843  // Instcombine will make this a no-op.
3844 
3845  auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
3846 
3847  for (unsigned Part = 0; Part < UF; ++Part) {
3848  Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
3849  Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
3850  Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
3851  Value *Sel =
3852  Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
3853  VectorLoopValueMap.setVectorValue(&I, Part, Sel);
3854  addMetadata(Sel, &I);
3855  }
3856 
3857  break;
3858  }
3859 
3860  case Instruction::ICmp:
3861  case Instruction::FCmp: {
3862  // Widen compares. Generate vector compares.
3863  bool FCmp = (I.getOpcode() == Instruction::FCmp);
3864  auto *Cmp = dyn_cast<CmpInst>(&I);
3865  setDebugLocFromInst(Builder, Cmp);
3866  for (unsigned Part = 0; Part < UF; ++Part) {
3867  Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
3868  Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
3869  Value *C = nullptr;
3870  if (FCmp) {
3871  // Propagate fast math flags.
3872  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
3873  Builder.setFastMathFlags(Cmp->getFastMathFlags());
3874  C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
3875  } else {
3876  C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
3877  }
3878  VectorLoopValueMap.setVectorValue(&I, Part, C);
3879  addMetadata(C, &I);
3880  }
3881 
3882  break;
3883  }
3884 
3885  case Instruction::ZExt:
3886  case Instruction::SExt:
3887  case Instruction::FPToUI:
3888  case Instruction::FPToSI:
3889  case Instruction::FPExt:
3890  case Instruction::PtrToInt:
3891  case Instruction::IntToPtr:
3892  case Instruction::SIToFP:
3893  case Instruction::UIToFP:
3894  case Instruction::Trunc:
3895  case Instruction::FPTrunc:
3896  case Instruction::BitCast: {
3897  auto *CI = dyn_cast<CastInst>(&I);
3899 
3900  /// Vectorize casts.
3901  Type *DestTy =
3902  (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
3903 
3904  for (unsigned Part = 0; Part < UF; ++Part) {
3905  Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
3906  Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
3907  VectorLoopValueMap.setVectorValue(&I, Part, Cast);
3908  addMetadata(Cast, &I);
3909  }
3910  break;
3911  }
3912 
3913  case Instruction::Call: {
3914  // Ignore dbg intrinsics.
3915  if (isa<DbgInfoIntrinsic>(I))
3916  break;
3918 
3919  Module *M = I.getParent()->getParent()->getParent();
3920  auto *CI = cast<CallInst>(&I);
3921 
3922  StringRef FnName = CI->getCalledFunction()->getName();
3923  Function *F = CI->getCalledFunction();
3924  Type *RetTy = ToVectorTy(CI->getType(), VF);
3925  SmallVector<Type *, 4> Tys;
3926  for (Value *ArgOperand : CI->arg_operands())
3927  Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
3928 
3929  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3930 
3931  // The flag shows whether we use Intrinsic or a usual Call for vectorized
3932  // version of the instruction.
3933  // Is it beneficial to perform an intrinsic call compared to a lib call?
3934  bool NeedToScalarize;
3935  unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
3936  bool UseVectorIntrinsic =
3937  ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
3938  assert((UseVectorIntrinsic || !NeedToScalarize) &&
3939  "Instruction should be scalarized elsewhere.");
3940 
3941  for (unsigned Part = 0; Part < UF; ++Part) {
3942  SmallVector<Value *, 4> Args;
3943  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
3944  Value *Arg = CI->getArgOperand(i);
3945  // Some intrinsics have a scalar argument - don't replace it with a
3946  // vector.
3947  if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
3948  Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
3949  Args.push_back(Arg);
3950  }
3951 
3952  Function *VectorF;
3953  if (UseVectorIntrinsic) {
3954  // Use vector version of the intrinsic.
3955  Type *TysForDecl[] = {CI->getType()};
3956  if (VF > 1)
3957  TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
3958  VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
3959  } else {
3960  // Use vector version of the library call.
3961  StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
3962  assert(!VFnName.empty() && "Vector function name is empty.");
3963  VectorF = M->getFunction(VFnName);
3964  if (!VectorF) {
3965  // Generate a declaration
3966  FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
3967  VectorF =
3968  Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
3969  VectorF->copyAttributesFrom(F);
3970  }
3971  }
3972  assert(VectorF && "Can't create vector function.");
3973 
3973 
3974  SmallVector<OperandBundleDef, 1> OpBundles;
3975  CI->getOperandBundlesAsDefs(OpBundles);
3976  CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
3977 
3978  if (isa<FPMathOperator>(V))
3979  V->copyFastMathFlags(CI);
3980 
3981  VectorLoopValueMap.setVectorValue(&I, Part, V);
3982  addMetadata(V, &I);
3983  }
3984 
3985  break;
3986  }
3987 
3988  default:
3989  // This instruction is not vectorized by simple widening.
3990  LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
3991  llvm_unreachable("Unhandled instruction!");
3992  } // end of switch.
3993 }
3994 
3995 void InnerLoopVectorizer::updateAnalysis() {
3996  // Forget the original basic block.
3997  PSE.getSE()->forgetLoop(OrigLoop);
3998 
3999  // DT is not kept up-to-date for outer loop vectorization
4000  if (EnableVPlanNativePath)
4001  return;
4002 
4003  // Update the dominator tree information.
4004  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4005  "Entry does not dominate exit.");
4006 
4013 }
4014 
4015 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4016  // We should not collect Scalars more than once per VF. Right now, this
4017  // function is called from collectUniformsAndScalars(), which already does
4018  // this check. Collecting Scalars for VF=1 does not make any sense.
4019  assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4020  "This function should not be visited twice for the same VF");
4021 
4022  SmallSetVector<Instruction *, 8> Worklist;
4023 
4024  // These sets are used to seed the analysis with pointers used by memory
4025  // accesses that will remain scalar.
4026  SmallSetVector<Instruction *, 8> ScalarPtrs;
4027  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4028 
4029  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4030  // The pointer operands of loads and stores will be scalar as long as the
4031  // memory access is not a gather or scatter operation. The value operand of a
4032  // store will remain scalar if the store is scalarized.
4033  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4034  InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4035  assert(WideningDecision != CM_Unknown &&
4036  "Widening decision should be ready at this moment");
4037  if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4038  if (Ptr == Store->getValueOperand())
4039  return WideningDecision == CM_Scalarize;
4040  assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4041  "Ptr is neither a value nor a pointer operand");
4042  return WideningDecision != CM_GatherScatter;
4043  };
4044 
4045  // A helper that returns true if the given value is a bitcast or
4046  // getelementptr instruction contained in the loop.
4047  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4048  return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4049  isa<GetElementPtrInst>(V)) &&
4050  !TheLoop->isLoopInvariant(V);
4051  };
4052 
4053  // A helper that evaluates a memory access's use of a pointer. If the use
4054  // will be a scalar use, and the pointer is only used by memory accesses, we
4055  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4056  // PossibleNonScalarPtrs.
4057  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4058  // We only care about bitcast and getelementptr instructions contained in
4059  // the loop.
4060  if (!isLoopVaryingBitCastOrGEP(Ptr))
4061  return;
4062 
4063  // If the pointer has already been identified as scalar (e.g., if it was
4064  // also identified as uniform), there's nothing to do.
4065  auto *I = cast<Instruction>(Ptr);
4066  if (Worklist.count(I))
4067  return;
4068 
4069  // If the use of the pointer will be a scalar use, and all users of the
4070  // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4071  // place the pointer in PossibleNonScalarPtrs.
4072  if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4073  return isa<LoadInst>(U) || isa<StoreInst>(U);
4074  }))
4075  ScalarPtrs.insert(I);
4076  else
4077  PossibleNonScalarPtrs.insert(I);
4078  };
4079 
4080  // We seed the scalars analysis with three classes of instructions: (1)
4081  // instructions marked uniform-after-vectorization, (2) bitcast and
4082  // getelementptr instructions used by memory accesses requiring a scalar use,
4083  // and (3) pointer induction variables and their update instructions (we
4084  // currently only scalarize these).
4085  //
4086  // (1) Add to the worklist all instructions that have been identified as
4087  // uniform-after-vectorization.
4088  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4089 
4090  // (2) Add to the worklist all bitcast and getelementptr instructions used by
4091  // memory accesses requiring a scalar use. The pointer operands of loads and
4092  // stores will be scalar as long as the memory access is not a gather or
4093  // scatter operation. The value operand of a store will remain scalar if the
4094  // store is scalarized.
4095  for (auto *BB : TheLoop->blocks())
4096  for (auto &I : *BB) {
4097  if (auto *Load = dyn_cast<LoadInst>(&I)) {
4098  evaluatePtrUse(Load, Load->getPointerOperand());
4099  } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4100  evaluatePtrUse(Store, Store->getPointerOperand());
4101  evaluatePtrUse(Store, Store->getValueOperand());
4102  }
4103  }
4104  for (auto *I : ScalarPtrs)
4105  if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4106  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4107  Worklist.insert(I);
4108  }
4109 
4110  // (3) Add to the worklist all pointer induction variables and their update
4111  // instructions.
4112  //
4113  // TODO: Once we are able to vectorize pointer induction variables we should
4114  // no longer insert them into the worklist here.
4115  auto *Latch = TheLoop->getLoopLatch();
4116  for (auto &Induction : *Legal->getInductionVars()) {
4117  auto *Ind = Induction.first;
4118  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4119  if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4120  continue;
4121  Worklist.insert(Ind);
4122  Worklist.insert(IndUpdate);
4123  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4124  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4125  << "\n");
4126  }
4127 
4128  // Insert the forced scalars.
4129  // FIXME: Currently widenPHIInstruction() often creates a dead vector
4130  // induction variable when the PHI user is scalarized.
4131  auto ForcedScalar = ForcedScalars.find(VF);
4132  if (ForcedScalar != ForcedScalars.end())
4133  for (auto *I : ForcedScalar->second)
4134  Worklist.insert(I);
4135 
4136  // Expand the worklist by looking through any bitcasts and getelementptr
4137  // instructions we've already identified as scalar. This is similar to the
4138  // expansion step in collectLoopUniforms(); however, here we're only
4139  // expanding to include additional bitcasts and getelementptr instructions.
4140  unsigned Idx = 0;
4141  while (Idx != Worklist.size()) {
4142  Instruction *Dst = Worklist[Idx++];
4143  if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4144  continue;
4145  auto *Src = cast<Instruction>(Dst->getOperand(0));
4146  if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4147  auto *J = cast<Instruction>(U);
4148  return !TheLoop->contains(J) || Worklist.count(J) ||
4149  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4150  isScalarUse(J, Src));
4151  })) {
4152  Worklist.insert(Src);
4153  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4154  }
4155  }
4156 
4157  // An induction variable will remain scalar if all users of the induction
4158  // variable and induction variable update remain scalar.
4159  for (auto &Induction : *Legal->getInductionVars()) {
4160  auto *Ind = Induction.first;
4161  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4162 
4163  // We already considered pointer induction variables, so there's no reason
4164  // to look at their users again.
4165  //
4166  // TODO: Once we are able to vectorize pointer induction variables we
4167  // should no longer skip over them here.
4168  if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4169  continue;
4170 
4171  // Determine if all users of the induction variable are scalar after
4172  // vectorization.
4173  auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4174  auto *I = cast<Instruction>(U);
4175  return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4176  });
4177  if (!ScalarInd)
4178  continue;
4179 
4180  // Determine if all users of the induction variable update instruction are
4181  // scalar after vectorization.
4182  auto ScalarIndUpdate =
4183  llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4184  auto *I = cast<Instruction>(U);
4185  return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4186  });
4187  if (!ScalarIndUpdate)
4188  continue;
4189 
4190  // The induction variable and its update instruction will remain scalar.
4191  Worklist.insert(Ind);
4192  Worklist.insert(IndUpdate);
4193  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4194  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4195  << "\n");
4196  }
4197 
4198  Scalars[VF].insert(Worklist.begin(), Worklist.end());
4199 }
4200 
4201 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4202  if (!blockNeedsPredication(I->getParent()))
4203  return false;
4204  switch(I->getOpcode()) {
4205  default:
4206  break;
4207  case Instruction::Load:
4208  case Instruction::Store: {
4209  if (!Legal->isMaskRequired(I))
4210  return false;
4211  auto *Ptr = getLoadStorePointerOperand(I);
4212  auto *Ty = getMemInstValueType(I);
4213  // We have already decided how to vectorize this instruction, get that
4214  // result.
4215  if (VF > 1) {
4216  InstWidening WideningDecision = getWideningDecision(I, VF);
4217  assert(WideningDecision != CM_Unknown &&
4218  "Widening decision should be ready at this moment");
4219  return WideningDecision == CM_Scalarize;
4220  }
4221  return isa<LoadInst>(I) ?
4222  !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
4223  : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4224  }
4225  case Instruction::UDiv:
4226  case Instruction::SDiv:
4227  case Instruction::SRem:
4228  case Instruction::URem:
4229  return mayDivideByZero(*I);
4230  }
4231  return false;
4232 }
4233 
4234 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4235  unsigned VF) {
4236  // Get and ensure we have a valid memory instruction.
4237  LoadInst *LI = dyn_cast<LoadInst>(I);
4238  StoreInst *SI = dyn_cast<StoreInst>(I);
4239  assert((LI || SI) && "Invalid memory instruction");
4240 
4241  auto *Ptr = getLoadStorePointerOperand(I);
4242 
4243  // In order to be widened, the pointer should be consecutive, first of all.
4244  if (!Legal->isConsecutivePtr(Ptr))
4245  return false;
4246 
4247  // If the instruction is a store located in a predicated block, it will be
4248  // scalarized.
4249  if (isScalarWithPredication(I))
4250  return false;
4251 
4252  // If the instruction's allocated size doesn't equal its type size, it
4253  // requires padding and will be scalarized.
4254  auto &DL = I->getModule()->getDataLayout();
4255  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4256  if (hasIrregularType(ScalarTy, DL, VF))
4257  return false;
4258 
4259  return true;
4260 }
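 // Illustrative note, not part of the original source: i1 is a typical case of
 // the irregular-type check above, since its allocated size (one byte) differs
 // from its type size (one bit); widening such an access could touch padding
 // bits, so it is scalarized instead.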
4261 
4262 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4263  // We should not collect Uniforms more than once per VF. Right now,
4264  // this function is called from collectUniformsAndScalars(), which
4265  // already does this check. Collecting Uniforms for VF=1 does not make any
4266  // sense.
4267 
4268  assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4269  "This function should not be visited twice for the same VF");
4270 
4271  // Visit the list of Uniforms. If we do not find any uniform value, we will
4272  // not analyze it again; Uniforms.count(VF) will still return 1.
4273  Uniforms[VF].clear();
4274 
4275  // We now know that the loop is vectorizable!
4276  // Collect instructions inside the loop that will remain uniform after
4277  // vectorization.
4278 
4279  // Global values, params and instructions outside of the current loop are out of
4280  // scope.
4281  auto isOutOfScope = [&](Value *V) -> bool {
4282  Instruction *I = dyn_cast<Instruction>(V);
4283  return (!I || !TheLoop->contains(I));
4284  };
4285 
4286  SetVector<Instruction *> Worklist;
4287  BasicBlock *Latch = TheLoop->getLoopLatch();
4288 
4289  // Start with the conditional branch. If the branch condition is an
4290  // instruction contained in the loop that is only used by the branch, it is
4291  // uniform.
4292  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4293  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4294  Worklist.insert(Cmp);
4295  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4296  }
4297 
4298  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4299  // are pointers that are treated like consecutive pointers during
4300  // vectorization. The pointer operands of interleaved accesses are an
4301  // example.
4302  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4303 
4304  // Holds pointer operands of instructions that are possibly non-uniform.
4305  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4306 
4307  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4308  InstWidening WideningDecision = getWideningDecision(I, VF);
4309  assert(WideningDecision != CM_Unknown &&
4310  "Widening decision should be ready at this moment");
4311 
4312  return (WideningDecision == CM_Widen ||
4313  WideningDecision == CM_Widen_Reverse ||
4314  WideningDecision == CM_Interleave);
4315  };
4316  // Iterate over the instructions in the loop, and collect all
4317  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4318  // that a consecutive-like pointer operand will be scalarized, we collect it
4319  // in PossibleNonUniformPtrs instead. We use two sets here because a single
4320  // getelementptr instruction can be used by both vectorized and scalarized
4321  // memory instructions. For example, if a loop loads and stores from the same
4322  // location, but the store is conditional, the store will be scalarized, and
4323  // the getelementptr won't remain uniform.
4324  for (auto *BB : TheLoop->blocks())
4325  for (auto &I : *BB) {
4326  // If there's no pointer operand, there's nothing to do.
4327  auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4328  if (!Ptr)
4329  continue;
4330 
4331  // True if all users of Ptr are memory accesses that have Ptr as their
4332  // pointer operand.
4333  auto UsersAreMemAccesses =
4334  llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4335  return getLoadStorePointerOperand(U) == Ptr;
4336  });
4337 
4338  // Ensure the memory instruction will not be scalarized or used by
4339  // gather/scatter, making its pointer operand non-uniform. If the pointer
4340  // operand is used by any instruction other than a memory access, we
4341  // conservatively assume the pointer operand may be non-uniform.
4342  if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4343  PossibleNonUniformPtrs.insert(Ptr);
4344 
4345  // If the memory instruction will be vectorized and its pointer operand
4346  // is consecutive-like or part of an interleaved access, the pointer
4347  // operand should remain uniform.
4348  else
4349  ConsecutiveLikePtrs.insert(Ptr);
4350  }
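 // Sketch of the two-set interaction described above (hypothetical IR, not
 // taken from this file): given
 //   %gep = getelementptr inbounds i32, i32* %a, i64 %iv
 //   %v   = load i32, i32* %gep      ; consecutive, will be widened
 //   ...                             ; and, in a predicated block:
 //   store i32 %v, i32* %gep         ; predicated, will be scalarized
 // the widened load places %gep in ConsecutiveLikePtrs while the scalarized
 // store places it in PossibleNonUniformPtrs, so %gep is not added to the
 // uniform worklist below.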
4351 
4352  // Add to the Worklist all consecutive and consecutive-like pointers that
4353  // aren't also identified as possibly non-uniform.
4354  for (auto *V : ConsecutiveLikePtrs)
4355  if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4356  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4357  Worklist.insert(V);
4358  }
4359 
4360  // Expand Worklist in topological order: whenever a new instruction
4361  // is added, its users should already be inside Worklist. This ensures that
4362  // a uniform instruction will only be used by uniform instructions.
4363  unsigned idx = 0;
4364  while (idx != Worklist.size()) {
4365  Instruction *I = Worklist[idx++];
4366 
4367  for (auto OV : I->operand_values()) {
4368  // isOutOfScope operands cannot be uniform instructions.
4369  if (isOutOfScope(OV))
4370  continue;
4371  // First-order recurrence PHIs should typically be considered
4372  // non-uniform.
4373  auto *OP = dyn_cast<PHINode>(OV);
4374  if (OP && Legal->isFirstOrderRecurrence(OP))
4375  continue;
4376  // If all the users of the operand are uniform, then add the
4377  // operand into the uniform worklist.
4378  auto *OI = cast<Instruction>(OV);
4379  if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4380  auto *J = cast<Instruction>(U);
4381  return Worklist.count(J) ||
4382  (OI == getLoadStorePointerOperand(J) &&
4383  isUniformDecision(J, VF));
4384  })) {
4385  Worklist.insert(OI);
4386  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4387  }
4388  }
4389  }
4390 
4391  // Returns true if Ptr is the pointer operand of a memory access instruction
4392  // I, and I is known to not require scalarization.
4393  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4394  return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4395  };
4396 
4397  // For an instruction to be added into Worklist above, all its users inside
4398  // the loop should also be in Worklist. However, this condition cannot be
4399  // true for phi nodes that form a cyclic dependence. We must process phi
4400  // nodes separately. An induction variable will remain uniform if all users
4401  // of the induction variable and induction variable update remain uniform.
4402  // The code below handles both pointer and non-pointer induction variables.
4403  for (auto &Induction : *Legal->getInductionVars()) {
4404  auto *Ind = Induction.first;
4405  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4406 
4407  // Determine if all users of the induction variable are uniform after
4408  // vectorization.
4409  auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4410  auto *I = cast<Instruction>(U);
4411  return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4412  isVectorizedMemAccessUse(I, Ind);
4413  });
4414  if (!UniformInd)
4415  continue;
4416 
4417  // Determine if all users of the induction variable update instruction are
4418  // uniform after vectorization.
4419  auto UniformIndUpdate =
4420  llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4421  auto *I = cast<Instruction>(U);
4422  return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4423  isVectorizedMemAccessUse(I, IndUpdate);
4424  });
4425  if (!UniformIndUpdate)
4426  continue;
4427 
4428  // The induction variable and its update instruction will remain uniform.
4429  Worklist.insert(Ind);
4430  Worklist.insert(IndUpdate);
4431  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4432  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4433  << "\n");
4434  }
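 // Example of the rule above (hypothetical loop): for "a[i] = x", the
 // induction variable i is used only by the consecutive store's address
 // computation and by its own increment, and the increment is used only by
 // the phi and the latch compare, so both remain uniform. If i were also
 // stored to memory as data, a vector of lane values would be needed and the
 // induction would not be added here.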
4435 
4436  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4437 }
4438 
4439 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
4440  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4441  // TODO: It may be useful to do this, since the check is still likely to be
4442  // dynamically uniform if the target can skip it.
4443  LLVM_DEBUG(
4444  dbgs() << "LV: Not inserting runtime ptr check for divergent target");
4445 
4446  ORE->emit(
4447  createMissedAnalysis("CantVersionLoopWithDivergentTarget")
4448  << "runtime pointer checks needed. Not enabled for divergent target");
4449 
4450  return None;
4451  }
4452 
4453  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4454  if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
4455  return computeFeasibleMaxVF(OptForSize, TC);
4456 
4457  if (Legal->getRuntimePointerChecking()->Need) {
4458  ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4459  << "runtime pointer checks needed. Enable vectorization of this "
4460  "loop with '#pragma clang loop vectorize(enable)' when "
4461  "compiling with -Os/-Oz");
4462  LLVM_DEBUG(
4463  dbgs()
4464  << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
4465  return None;
4466  }
4467 
4468  // If we optimize the program for size, avoid creating the tail loop.
4469  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4470 
4471  // If we don't know the precise trip count, don't try to vectorize.
4472  if (TC < 2) {
4473  ORE->emit(
4474  createMissedAnalysis("UnknownLoopCountComplexCFG")
4475  << "unable to calculate the loop count due to complex control flow");
4476  LLVM_DEBUG(
4477  dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
4478  return None;
4479  }
4480 
4481  unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
4482 
4483  if (TC % MaxVF != 0) {
4484  // If the trip count that we found modulo the vectorization factor is not
4485  // zero, then we require a tail.
4486  // FIXME: look for a smaller MaxVF that does divide TC rather than give up.
4487  // FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a
4488  // smaller MaxVF that does not require a scalar epilog.
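 // For example (hypothetical numbers): with TC = 30 and MaxVF = 8 the
 // remainder is 6, so a scalar tail loop would be needed, which is not
 // acceptable when optimizing for size; vectorization is therefore rejected.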
4489 
4490  ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
4491  << "cannot optimize for size and vectorize at the "
4492  "same time. Enable vectorization of this loop "
4493  "with '#pragma clang loop vectorize(enable)' "
4494  "when compiling with -Os/-Oz");
4495  LLVM_DEBUG(
4496  dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
4497  return None;
4498  }
4499 
4500  return MaxVF;
4501 }
4502 
4503 unsigned
4504 LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
4505  unsigned ConstTripCount) {
4506  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4507  unsigned SmallestType, WidestType;
4508  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4509  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4510 
4511  // Get the maximum safe dependence distance in bits computed by LAA.
4512  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
4513  // the memory access that is most restrictive (involved in the smallest
4514  // dependence distance).
4515  unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4516 
4517  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4518 
4519  unsigned MaxVectorSize = WidestRegister / WidestType;
4520 
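 // Worked example (hypothetical target): with a 128-bit widest register and a
 // widest element type of i32, MaxVectorSize = 128 / 32 = 4 lanes.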
4521  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4522  << " / " << WidestType << " bits.\n");
4523  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4524  << WidestRegister << " bits.\n");
4525 
4526  assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4527  " into one vector!");
4528  if (MaxVectorSize == 0) {
4529  LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4530  MaxVectorSize = 1;
4531  return MaxVectorSize;
4532  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4533  isPowerOf2_32(ConstTripCount)) {
4534  // We need to clamp the VF to be the ConstTripCount. There is no point in
4535  // choosing a higher viable VF as done in the loop below.
4536  LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4537  << ConstTripCount << "\n");
4538  MaxVectorSize = ConstTripCount;
4539  return MaxVectorSize;
4540  }
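 // For instance (hypothetical values): a constant trip count of 4 with
 // MaxVectorSize = 16 is clamped to a VF of 4, since wider vectors could
 // never be filled by the loop.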
4541 
4542  unsigned MaxVF = MaxVectorSize;
4543  if (TTI.shouldMaximizeVectorBandwidth(OptForSize) ||
4544  (MaximizeBandwidth && !OptForSize)) {
4545  // Collect all viable vectorization factors larger than the default MaxVF
4546  // (i.e. MaxVectorSize).
4547  SmallVector<unsigned, 8> VFs;
4548  unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4549  for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4550  VFs.push_back(VS);
4551 
4552  // For each VF calculate its register usage.
4553  auto RUs = calculateRegisterUsage(VFs);
4554 
4555  // Select the largest VF which doesn't require more registers than existing
4556  // ones.
4557  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
4558  for (int i = RUs.size() - 1; i >= 0; --i) {
4559  if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
4560  MaxVF = VFs[i];
4561  break;
4562  }
4563  }
4564  if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
4565  if (MaxVF < MinVF) {
4566  LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4567  << ") with target's minimum: " << MinVF << '\n');
4568  MaxVF = MinVF;
4569  }
4570  }
4571  }
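 // Sketch with hypothetical numbers: with a 256-bit register, SmallestType =
 // i8 and WidestType = i32, the default MaxVectorSize is 256 / 32 = 8, and
 // the candidate VFs collected here are 16 and 32 (up to 256 / 8); the
 // largest one whose register usage fits the target is chosen, subject to the
 // target's minimum VF.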
4572  return MaxVF;
4573 }
4574 
4575 VectorizationFactor
4576 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
4577  float Cost = expectedCost(1).first;
4578  const float ScalarCost = Cost;
4579  unsigned Width = 1;
4580  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
4581 
4582  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
4583  if (ForceVectorization && MaxVF > 1) {
4584  // Ignore scalar width, because the user explicitly wants vectorization.
4585  // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4586  // evaluation.
4587  Cost = std::numeric_limits<float>::max();
4588  }
4589 
4590  for (unsigned i = 2; i <= MaxVF; i *= 2) {
4591  // Notice that the vector loop needs to be executed fewer times, so
4592  // we need to divide the cost of the vector loop by the width of
4593  // the vector elements.
4594  VectorizationCostTy C = expectedCost(i);
4595  float VectorCost = C.first / (float)i;
4596  LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
4597  << " costs: " << (int)VectorCost << ".\n");
4598  if (!C.second && !ForceVectorization) {
4599  LLVM_DEBUG(
4600  dbgs() << "LV: Not considering vector loop of width " << i
4601  << " because it will not generate any vector instructions.\n");
4602  continue;
4603  }
4604  if (VectorCost < Cost) {
4605  Cost = VectorCost;
4606  Width = i;
4607  }
4608  }
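 // Worked example (hypothetical costs): if the scalar loop costs 8 and the
 // VF = 4 loop costs 20, the per-lane vector cost is 20 / 4 = 5 < 8, so VF = 4
 // is preferred unless a wider VF is cheaper still.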
4609 
4610  if (!EnableCondStoresVectorization && NumPredStores) {
4611  ORE->emit(createMissedAnalysis("ConditionalStore")
4612  << "store that is conditionally executed prevents vectorization");
4613  LLVM_DEBUG(
4614  dbgs() << "LV: No vectorization. There are conditional stores.\n");
4615  Width = 1;
4616  Cost = ScalarCost;
4617  }
4618 
4619  LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
4620  << "LV: Vectorization seems to be not beneficial, "
4621  << "but was forced by a user.\n");
4622  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
4623  VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
4624  return Factor;
4625 }
4626 
4627 std::pair<unsigned, unsigned>
4628 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4629  unsigned MinWidth = -1U;
4630  unsigned MaxWidth = 8;
4631  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
4632 
4633  // For each block.
4634  for (BasicBlock *BB : TheLoop->blocks()) {
4635  // For each instruction in the loop.
4636  for (Instruction &I : *BB) {
4637  Type *T = I.getType();
4638 
4639  // Skip ignored values.
4640  if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
4641  continue;
4642 
4643  // Only examine Loads, Stores and PHINodes.
4644  if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4645  continue;
4646 
4647  // Examine PHI nodes that are reduction variables. Update the type to
4648  // account for the recurrence type.
4649  if (auto *PN = dyn_cast<PHINode>(&I)) {
4650  if (!Legal->isReductionVariable(PN))
4651  continue;
4652  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
4653  T = RdxDesc.getRecurrenceType();
4654  }
4655 
4656  // Examine the stored values.
4657  if (auto *ST = dyn_cast<StoreInst>(&I))
4658  T = ST->getValueOperand()->getType();
4659 
4660  // Ignore loaded pointer types and stored pointer types that are not
4661  // vectorizable.
4662  //
4663  // FIXME: The check here attempts to predict whether a load or store will
4664  // be vectorized. We only know this for certain after a VF has
4665  // been selected. Here, we assume that if an access can be
4666  // vectorized, it will be. We should also look at extending this
4667  // optimization to non-pointer types.
4668  //
4669  if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
4670  !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
4671  continue;
4672 
4673  MinWidth = std::min(MinWidth,
4674  (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
4675  MaxWidth = std::max(MaxWidth,
4676  (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
4677  }
4678  }
4679 
4680  return {MinWidth, MaxWidth};
4681 }
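 // Example (hypothetical loop): a loop that loads i8 elements and accumulates
 // them into an i64 reduction would report {8, 64} here.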
4682 
4683 unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
4684  unsigned VF,
4685  unsigned LoopCost) {
4686  // -- The interleave heuristics --
4687  // We interleave the loop in order to expose ILP and reduce the loop overhead.
4688  // There are many micro-architectural considerations that we can't predict
4689  // at this level. For example, frontend pressure (on decode or fetch) due to
4690  // code size, or the number and capabilities of the execution ports.
4691  //
4692  // We use the following heuristics to select the interleave count:
4693  // 1. If the code has reductions, then we interleave to break the cross
4694  // iteration dependency.
4695  // 2. If the loop is really small, then we interleave to reduce the loop
4696  // overhead.
4697  // 3. We don't interleave if we think that we will spill registers to memory
4698  // due to the increased register pressure.
4699 
4700  // When we optimize for size, we don't interleave.
4701  if (OptForSize)
4702  return 1;
4703 
4704  // A maximum safe dependence distance exists and was already used to limit widening, so do not interleave.
4705  if (Legal->getMaxSafeDepDistBytes() != -1U)
4706  return 1;
4707 
4708  // Do not interleave loops with a relatively small trip count.
4709  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4710  if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
4711  return 1;
4712 
4713  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
4714  LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4715  << " registers\n");
4716 
4717  if (VF == 1) {
4718  if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4719  TargetNumRegisters = ForceTargetNumScalarRegs;
4720  } else {
4721  if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4722  TargetNumRegisters = ForceTargetNumVectorRegs;
4723  }
4724 
4725  RegisterUsage R = calculateRegisterUsage({VF})[0];
4726  // We divide by these values below, so assume that we have at least one
4727  // instruction that uses at least one register.
4728  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
4729 
4730  // We calculate the interleave count using the following formula.
4731  // Subtract the number of loop invariants from the number of available
4732  // registers. These registers are used by all of the interleaved instances.
4733  // Next, divide the remaining registers by the number of registers that is
4734  // required by the loop, in order to estimate how many parallel instances
4735  // fit without causing spills. All of this is rounded down if necessary to be
4736  // a power of two. We want power of two interleave count to simplify any
4737  // addressing operations or alignment considerations.
4738  unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
4739  R.MaxLocalUsers);
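 // Worked example (hypothetical numbers): with 16 target registers, 2
 // loop-invariant values and a maximum of 3 simultaneously live values,
 // IC = PowerOf2Floor((16 - 2) / 3) = PowerOf2Floor(4) = 4.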
4740 
4741  // Don't count the induction variable as interleaved.
4742  if (EnableIndVarRegisterHeur)
4743  IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
4744  std::max(1U, (R.MaxLocalUsers - 1)));
4745 
4746  // Clamp the interleave ranges to reasonable counts.
4747  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4748 
4749  // Check if the user has overridden the max.
4750  if (VF == 1) {
4751  if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4752  MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4753  } else {
4754  if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4755  MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4756  }
4757 
4758  // If we did not calculate the cost for VF (because the user selected the VF)
4759  // then we calculate the cost of VF here.
4760  if (LoopCost == 0)
4761  LoopCost = expectedCost(VF).first;
4762 
4763  // Clamp the calculated IC to be between 1 and the max interleave count
4764  // that the target allows.
4765  if (IC > MaxInterleaveCount)
4766  IC = MaxInterleaveCount;
4767  else if (IC < 1)
4768  IC = 1;
4769 
4770  // Interleave if we vectorized this loop and there is a reduction that could
4771  // benefit from interleaving.
4772  if (VF > 1 && !Legal->getReductionVars()->empty()) {
4773  LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
4774  return IC;
4775  }
4776 
4777  // Note that if we've already vectorized the loop we will have done the
4778  // runtime check and so interleaving won't require further checks.
4779  bool InterleavingRequiresRuntimePointerCheck =
4780  (VF == 1 && Legal->getRuntimePointerChecking()->Need);
4781 
4782  // We want to interleave small loops in order to reduce the loop overhead and
4783  // potentially expose ILP opportunities.
4784  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
4785  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
4786  // We assume that the cost overhead is 1 and we use the cost model
4787  // to estimate the cost of the loop and interleave until the cost of the
4788  // loop overhead is about 5% of the cost of the loop.
4789  unsigned SmallIC =
4790  std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
4791 
4792  // Interleave until store/load ports (estimated by max interleave count) are
4793  // saturated.
4794  unsigned NumStores = Legal->getNumStores();
4795  unsigned NumLoads = Legal->getNumLoads();
4796  unsigned StoresIC = IC / (NumStores ? NumStores : 1);
4797  unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
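 // For example (hypothetical counts): with IC = 8, two stores and one load in
 // the loop, StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 1 = 8; if the larger of
 // the two exceeds SmallIC, it is used instead so the memory ports stay busy.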
4798 
4799  // If we have a scalar reduction (vector reductions are already dealt with
4800  // by this point), we can increase the critical path length if the loop
4801  // we're interleaving is inside another loop. Limit this, by default, to 2,
4802  // so the critical path only gets increased by one reduction operation.
4803  if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
4804  unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
4805  SmallIC = std::min(SmallIC, F);
4806  StoresIC = std::min(StoresIC, F);
4807  LoadsIC = std::min(LoadsIC, F);
4808  }
4809 
4810  if (EnableLoadStoreRuntimeInterleave &&
4811  std::max(StoresIC, LoadsIC) > SmallIC) {
4812  LLVM_DEBUG(
4813  dbgs() << "LV: Interleaving to saturate store or load ports.\n");
4814  return std::max(StoresIC, LoadsIC);
4815  }
4816 
4817  LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
4818  return SmallIC;
4819  }
4820 
4821  // Interleave if this is a large loop (small loops are already dealt with by
4822  // this point) that could benefit from interleaving.
4823  bool HasReductions = !Legal->getReductionVars()->empty();
4824  if (TTI.enableAggressiveInterleaving(HasReductions)) {
4825  LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
4826  return IC;
4827  }
4828 
4829  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
4830  return 1;
4831 }
4832 
4833 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
4834 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
4835  // This function calculates the register usage by measuring the highest number
4836  // of values that are alive at a single location. Obviously, this is a very
4837  // rough estimation. We scan the loop in topological order and
4838  // assign a number to each instruction. We use RPO to ensure that defs are
4839  // met before their users. We assume that each instruction that has in-loop
4840  // users starts an interval. We record every time that an in-loop value is
4841  // used, so we have a list of the first and last occurrences of each
4842  // instruction. Next, we transpose this data structure into a multi map that
4843  // holds the list of intervals that *end* at a specific location. This multi
4844  // map allows us to perform a linear search. We scan the instructions linearly
4845  // and record each time that a new interval starts, by placing it in a set.
4846  // If we find this value in the multi-map then we remove it from the set.
4847  // The max register usage is the maximum size of the set.
4848  // We also search for instructions that are defined outside the loop, but are
4849  // used inside the loop. We need this number separately from the max-interval
4850  // usage number because, when we unroll, loop-invariant values do not take
4851  // more registers.
4852  LoopBlocksDFS DFS(TheLoop);
4853  DFS.perform(LI);
4854 
4855  RegisterUsage RU;
4856 
4857  // Each 'key' in the map opens a new interval. The values
4858  // of the map are the index of the 'last seen' usage of the
4859  // instruction that is the key.
4860  using IntervalMap = DenseMap<Instruction *, unsigned>;
4861 
4862  // Maps an instruction index to the instruction itself.
4863  DenseMap<unsigned, Instruction *> IdxToInstr;
4864  // Marks the end of each interval.
4865  IntervalMap EndPoint;
4866  // Saves the list of instructions that are used in the loop.
4867  SmallPtrSet<Instruction *, 8> Ends;
4868  // Saves the list of values that are used in the loop but are
4869  // defined outside the loop, such as arguments and constants.
4870  SmallPtrSet<Value *, 8> LoopInvariants;
4871 
4872  unsigned Index = 0;
4873  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
4874  for (Instruction &I : *BB) {
4875  IdxToInstr[Index++] = &I;
4876 
4877  // Save the end location of each USE.
4878  for (Value *U : I.operands()) {
4879  auto *Instr = dyn_cast<Instruction>(U);
4880 
4881  // Ignore non-instruction values such as arguments, constants, etc.
4882  if (!Instr)
4883  continue;
4884 
4885  // If this instruction is outside the loop then record it and continue.
4886  if (!TheLoop->contains(Instr)) {
4887  LoopInvariants.insert(Instr);
4888  continue;
4889  }
4890 
4891  // Overwrite previous end points.
4892  EndPoint[Instr] = Index;
4893  Ends.insert(Instr);
4894  }
4895  }
4896  }
4897 
4898  // Saves the list of intervals that end with the index in 'key'.
4899  using InstrList = SmallVector<Instruction *, 2>;
4900  DenseMap<unsigned, InstrList> TransposeEnds;
4901 
4902  // Transpose the EndPoints to a list of values that end at each index.
4903  for (auto &Interval : EndPoint)
4904  TransposeEnds[Interval.second].push_back(Interval.first);
4905 
4906  SmallPtrSet<Instruction *, 8> OpenIntervals;
4907 
4908  // Get the size of the widest register.
4909  unsigned MaxSafeDepDist = -1U;
4910  if (Legal->getMaxSafeDepDistBytes() != -1U)
4911  MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
4912  unsigned WidestRegister =
4913  std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
4914  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
4915 
4916  SmallVector<RegisterUsage, 8> RUs(VFs.size());
4917  SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
4918 
4919  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
4920 
4921  // A lambda that gets the register usage for the given type and VF.
4922  auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
4923  if (Ty->isTokenTy())
4924  return 0U;
4925  unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
4926  return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
4927  };
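 // Example of the lambda above (hypothetical target): with a 128-bit widest
 // register, an i32 value at VF = 8 occupies max(1, 8 * 32 / 128) = 2
 // registers, while an i8 value at VF = 8 still counts as one register.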
4928 
4929  for (unsigned int i = 0; i < Index; ++i) {
4930  Instruction *I = IdxToInstr[i];
4931 
4932  // Remove all of the instructions that end at this location.
4933  InstrList &List = TransposeEnds[i];
4934  for (Instruction *ToRemove : List)
4935  OpenIntervals.erase(ToRemove);
4936 
4937  // Ignore instructions that are never used within the loop.
4938  if (Ends.find(I) == Ends.end())
4939  continue;
4940 
4941  // Skip ignored values.
4942  if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
4943  continue;
4944 
4945  // For each VF find the maximum usage of registers.
4946  for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
4947  if (VFs[j] == 1) {
4948  MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
4949  continue;
4950  }
4951  collectUniformsAndScalars(VFs[j]);
4952  // Count the number of live intervals.
4953  unsigned RegUsage = 0;
4954  for (auto Inst : OpenIntervals) {
4955  // Skip ignored values for VF > 1.
4956  if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
4957  isScalarAfterVectorization(Inst, VFs[j]))
4958  continue;
4959  RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
4960  }
4961  MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
4962  }
4963 
4964  LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
4965  << OpenIntervals.size() << '\n');
4966 
4967  // Add the current instruction to the list of open intervals.
4968  OpenIntervals.insert(I);
4969  }
4970 
4971  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
4972  unsigned Invariant = 0;
4973  if (VFs[i] == 1)
4974  Invariant = LoopInvariants.size();
4975  else {
4976  for (auto Inst : LoopInvariants)
4977  Invariant += GetRegUsage(Inst->getType(), VFs[i]);
4978  }
4979 
4980  LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
4981  LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
4982  LLVM_DEBUG(