LLVM  10.0.0svn
LoopVectorize.cpp
1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
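// For example (an illustrative sketch, not taken from this file; A, B and n
// are hypothetical), with a vectorization factor of 4 a scalar loop such as
//
//   for (i = 0; i < n; ++i)
//     A[i] = B[i] + 42;
//
// is conceptually rewritten so that each wide iteration computes
//
//   A[i:i+3] = B[i:i+3] + <42, 42, 42, 42>;
//
// and the induction variable advances by 4 rather than by 1.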
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 // of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 // widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 // of vectorization. It decides on the optimal vector width, which
26 // can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46 // Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52 // Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanHCFGTransforms.h"
62 #include "VPlanPredicator.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
82 #include "llvm/Analysis/CFG.h"
88 #include "llvm/Analysis/LoopInfo.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/MathExtras.h"
141 #include <algorithm>
142 #include <cassert>
143 #include <cstdint>
144 #include <cstdlib>
145 #include <functional>
146 #include <iterator>
147 #include <limits>
148 #include <memory>
149 #include <string>
150 #include <tuple>
151 #include <utility>
152 #include <vector>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 /// @{
160 /// Metadata attribute names
161 static const char *const LLVMLoopVectorizeFollowupAll =
162  "llvm.loop.vectorize.followup_all";
163 static const char *const LLVMLoopVectorizeFollowupVectorized =
164  "llvm.loop.vectorize.followup_vectorized";
165 static const char *const LLVMLoopVectorizeFollowupEpilogue =
166  "llvm.loop.vectorize.followup_epilogue";
167 /// @}
168 
169 STATISTIC(LoopsVectorized, "Number of loops vectorized");
170 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171 
172 /// Loops with a known constant trip count below this number are vectorized only
173 /// if no scalar iteration overheads are incurred.
175  "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
176  cl::desc("Loops with a constant trip count that is smaller than this "
177  "value are vectorized only if no scalar iteration overheads "
178  "are incurred."));
179 
180 // Indicates that an epilogue is undesired and predication is preferred.
181 // This means that the vectorizer will try to fold the loop-tail (epilogue)
182 // into the loop and predicate the loop body accordingly.
184  "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
185  cl::desc("Indicate that an epilogue is undesired, predication should be "
186  "used instead."));
187 
189  "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
190  cl::desc("Maximize bandwidth when selecting vectorization factor which "
191  "will be determined by the smallest type in loop."));
192 
194  "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
195  cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
196 
197 /// An interleave-group may need masking if it resides in a block that needs
198 /// predication, or in order to mask away gaps.
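/// For example (an illustrative sketch), a group that loads A[3*i] and
/// A[3*i+1] but has no member for A[3*i+2] contains a gap; the wide load that
/// covers the whole group must mask away the unused third member.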
200  "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
201  cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
202 
203 /// We don't interleave loops with a known constant trip count below this
204 /// number.
205 static const unsigned TinyTripCountInterleaveThreshold = 128;
206 
208  "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
209  cl::desc("A flag that overrides the target's number of scalar registers."));
210 
212  "force-target-num-vector-regs", cl::init(0), cl::Hidden,
213  cl::desc("A flag that overrides the target's number of vector registers."));
214 
216  "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
217  cl::desc("A flag that overrides the target's max interleave factor for "
218  "scalar loops."));
219 
221  "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
222  cl::desc("A flag that overrides the target's max interleave factor for "
223  "vectorized loops."));
224 
226  "force-target-instruction-cost", cl::init(0), cl::Hidden,
227  cl::desc("A flag that overrides the target's expected cost for "
228  "an instruction to a single constant value. Mostly "
229  "useful for getting consistent testing."));
230 
232  "small-loop-cost", cl::init(20), cl::Hidden,
233  cl::desc(
234  "The cost of a loop that is considered 'small' by the interleaver."));
235 
237  "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
238  cl::desc("Enable the use of the block frequency analysis to access PGO "
239  "heuristics minimizing code growth in cold regions and being more "
240  "aggressive in hot regions."));
241 
242 // Runtime interleave loops for load/store throughput.
244  "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
245  cl::desc(
246  "Enable runtime interleaving until load/store ports are saturated"));
247 
248 /// The number of stores in a loop that are allowed to need predication.
250  "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
251  cl::desc("Max number of stores to be predicated behind an if."));
252 
254  "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
255  cl::desc("Count the induction variable only once when interleaving"));
256 
258  "enable-cond-stores-vec", cl::init(true), cl::Hidden,
259  cl::desc("Enable if predication of stores during vectorization."));
260 
262  "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
263  cl::desc("The maximum interleave count to use when interleaving a scalar "
264  "reduction in a nested loop."));
265 
266 cl::opt<bool> EnableVPlanNativePath(
267  "enable-vplan-native-path", cl::init(false), cl::Hidden,
268  cl::desc("Enable VPlan-native vectorization path with "
269  "support for outer loop vectorization."));
270 
271 // FIXME: Remove this switch once we have divergence analysis. Currently we
272 // assume divergent non-backedge branches when this switch is true.
274  "enable-vplan-predication", cl::init(false), cl::Hidden,
275  cl::desc("Enable VPlan-native vectorization path predicator with "
276  "support for outer loop vectorization."));
277 
278 // This flag enables the stress testing of the VPlan H-CFG construction in the
279 // VPlan-native vectorization path. It must be used in conjunction with
280 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
281 // verification of the H-CFGs built.
282 static cl::opt<bool> VPlanBuildStressTest(
283  "vplan-build-stress-test", cl::init(false), cl::Hidden,
284  cl::desc(
285  "Build VPlan for every supported loop nest in the function and bail "
286  "out right after the build (stress test the VPlan H-CFG construction "
287  "in the VPlan-native vectorization path)."));
288 
290  "interleave-loops", cl::init(true), cl::Hidden,
291  cl::desc("Enable loop interleaving in Loop vectorization passes"));
293  "vectorize-loops", cl::init(true), cl::Hidden,
294  cl::desc("Run the Loop vectorization passes"));
295 
296 /// A helper function for converting Scalar types to vector types.
297 /// If the incoming type is void, we return void. If the VF is 1, we return
298 /// the scalar type.
299 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
300  if (Scalar->isVoidTy() || VF == 1)
301  return Scalar;
302  return VectorType::get(Scalar, VF);
303 }
304 
305 /// A helper function that returns the type of loaded or stored value.
306 static Type *getMemInstValueType(Value *I) {
307  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
308  "Expected Load or Store instruction");
309  if (auto *LI = dyn_cast<LoadInst>(I))
310  return LI->getType();
311  return cast<StoreInst>(I)->getValueOperand()->getType();
312 }
313 
314 /// A helper function that returns true if the given type is irregular. The
315 /// type is irregular if its allocated size doesn't equal the store size of an
316 /// element of the corresponding vector type at the given vectorization factor.
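/// For example (assuming a typical x86-64 data layout), x86_fp80 is irregular:
/// a scalar element is allocated in 16 bytes, while each element of an
/// <VF x x86_fp80> vector occupies only 10 bytes of the vector's store size.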
317 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
318  // Determine if an array of VF elements of type Ty is "bitcast compatible"
319  // with a <VF x Ty> vector.
320  if (VF > 1) {
321  auto *VectorTy = VectorType::get(Ty, VF);
322  return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
323  }
324 
325  // If the vectorization factor is one, we just check if an array of type Ty
326  // requires padding between elements.
327  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
328 }
329 
330 /// A helper function that returns the reciprocal of the block probability of
331 /// predicated blocks. If we return X, we are assuming the predicated block
332 /// will execute once for every X iterations of the loop header.
333 ///
334 /// TODO: We should use actual block probability here, if available. Currently,
335 /// we always assume predicated blocks have a 50% chance of executing.
336 static unsigned getReciprocalPredBlockProb() { return 2; }
337 
338 /// A helper function that adds a 'fast' flag to floating-point operations.
340  if (isa<FPMathOperator>(V))
341  cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
342  return V;
343 }
344 
346  if (isa<FPMathOperator>(V))
347  cast<Instruction>(V)->setFastMathFlags(FMF);
348  return V;
349 }
350 
351 /// A helper function that returns an integer or floating-point constant with
352 /// value C.
353 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
354  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
355  : ConstantFP::get(Ty, C);
356 }
357 
358 /// Returns "best known" trip count for the specified loop \p L as defined by
359 /// the following procedure:
360 /// 1) Returns exact trip count if it is known.
361 /// 2) Returns expected trip count according to profile data if any.
362 /// 3) Returns upper bound estimate if it is known.
363 /// 4) Returns None if all of the above failed.
365  // Check if exact trip count is known.
366  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
367  return ExpectedTC;
368 
369  // Check if there is an expected trip count available from profile data.
371  if (auto EstimatedTC = getLoopEstimatedTripCount(L))
372  return EstimatedTC;
373 
374  // Check if upper bound estimate is known.
375  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
376  return ExpectedTC;
377 
378  return None;
379 }
380 
381 namespace llvm {
382 
383 /// InnerLoopVectorizer vectorizes loops which contain only one basic
384 /// block to a specified vectorization factor (VF).
385 /// This class performs the widening of scalars into vectors, or multiple
386 /// scalars. This class also implements the following features:
387 /// * It inserts an epilogue loop for handling loops that don't have iteration
388 /// counts that are known to be a multiple of the vectorization factor.
389 /// * It handles the code generation for reduction variables.
390 /// * Scalarization (implementation using scalars) of un-vectorizable
391 /// instructions.
392 /// InnerLoopVectorizer does not perform any vectorization-legality
393 /// checks, and relies on the caller to check for the different legality
394 /// aspects. The InnerLoopVectorizer relies on the
395 /// LoopVectorizationLegality class to provide information about the induction
396 /// and reduction variables that were found to a given vectorization factor.
397 class InnerLoopVectorizer {
398 public:
401  const TargetLibraryInfo *TLI,
403  OptimizationRemarkEmitter *ORE, unsigned VecWidth,
404  unsigned UnrollFactor, LoopVectorizationLegality *LVL,
406  : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
407  AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
408  Builder(PSE.getSE()->getContext()),
409  VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
410  virtual ~InnerLoopVectorizer() = default;
411 
412  /// Create a new empty loop. Unlink the old loop and connect the new one.
413  /// Return the pre-header block of the new loop.
415 
416  /// Widen a single instruction within the innermost loop.
418 
419  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
420  void fixVectorizedLoop();
421 
422  // Return true if any runtime check is added.
424 
425  /// A type for vectorized values in the new loop. Each value from the
426  /// original loop, when vectorized, is represented by UF vector values in the
427  /// new unrolled loop, where UF is the unroll factor.
429 
430  /// Vectorize a single PHINode in a block. This method handles the induction
431  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
432  /// arbitrary length vectors.
433  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
434 
435  /// A helper function to scalarize a single Instruction in the innermost loop.
436  /// Generates a sequence of scalar instances for each lane between \p MinLane
437  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
438  /// inclusive.
439  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
440  bool IfPredicateInstr);
441 
442  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
443  /// is provided, the integer induction variable will first be truncated to
444  /// the corresponding type.
445  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
446 
447  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
448  /// vector or scalar value on-demand if one is not yet available. When
449  /// vectorizing a loop, we visit the definition of an instruction before its
450  /// uses. When visiting the definition, we either vectorize or scalarize the
451  /// instruction, creating an entry for it in the corresponding map. (In some
452  /// cases, such as induction variables, we will create both vector and scalar
453  /// entries.) Then, as we encounter uses of the definition, we derive values
454  /// for each scalar or vector use unless such a value is already available.
455  /// For example, if we scalarize a definition and one of its uses is vector,
456  /// we build the required vector on-demand with an insertelement sequence
457  /// when visiting the use. Otherwise, if the use is scalar, we can use the
458  /// existing scalar definition.
459  ///
460  /// Return a value in the new loop corresponding to \p V from the original
461  /// loop at unroll index \p Part. If the value has already been vectorized,
462  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
463  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
464  /// a new vector value on-demand by inserting the scalar values into a vector
465  /// with an insertelement sequence. If the value has been neither vectorized
466  /// nor scalarized, it must be loop invariant, so we simply broadcast the
467  /// value into a vector.
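  /// For example (an illustrative sketch with hypothetical value names), if a
  /// definition %d was scalarized for VF = 4 and a later use needs it as a
  /// vector, the vector is assembled on demand roughly as:
  ///   %v.0 = insertelement <4 x i32> undef, i32 %d.lane0, i32 0
  ///   %v.1 = insertelement <4 x i32> %v.0,  i32 %d.lane1, i32 1
  ///   ... (and so on for lanes 2 and 3)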
468  Value *getOrCreateVectorValue(Value *V, unsigned Part);
469 
470  /// Return a value in the new loop corresponding to \p V from the original
471  /// loop at unroll and vector indices \p Instance. If the value has been
472  /// vectorized but not scalarized, the necessary extractelement instruction
473  /// will be generated.
474  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
475 
476  /// Construct the vector value of a scalarized value \p V one lane at a time.
477  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
478 
479  /// Try to vectorize the interleaved access group that \p Instr belongs to,
480  /// optionally masking the vector operations if \p BlockInMask is non-null.
482  VectorParts *BlockInMask = nullptr);
483 
484  /// Vectorize Load and Store instructions, optionally masking the vector
485  /// operations if \p BlockInMask is non-null.
487  VectorParts *BlockInMask = nullptr);
488 
489  /// Set the debug location in the builder using the debug location in
490  /// the instruction.
491  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
492 
493  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
494  void fixNonInductionPHIs();
495 
496 protected:
498 
499  /// A small list of PHINodes.
501 
502  /// A type for scalarized values in the new loop. Each value from the
503  /// original loop, when scalarized, is represented by UF x VF scalar values
504  /// in the new unrolled loop, where UF is the unroll factor and VF is the
505  /// vectorization factor.
507 
508  /// Set up the values of the IVs correctly when exiting the vector loop.
509  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
510  Value *CountRoundDown, Value *EndValue,
511  BasicBlock *MiddleBlock);
512 
513  /// Create a new induction variable inside L.
514  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
515  Value *Step, Instruction *DL);
516 
517  /// Handle all cross-iteration phis in the header.
518  void fixCrossIterationPHIs();
519 
520  /// Fix a first-order recurrence. This is the second phase of vectorizing
521  /// this phi node.
522  void fixFirstOrderRecurrence(PHINode *Phi);
523 
524  /// Fix a reduction cross-iteration phi. This is the second phase of
525  /// vectorizing this phi node.
526  void fixReduction(PHINode *Phi);
527 
528  /// The loop exit block may have single-value PHI nodes with some
529  /// incoming value. While vectorizing, we only handled real values
530  /// that were defined inside the loop, and we should have one value for
531  /// each predecessor of its parent basic block. See PR14725.
532  void fixLCSSAPHIs();
533 
534  /// Iteratively sink the scalarized operands of a predicated instruction into
535  /// the block that was created for it.
536  void sinkScalarOperands(Instruction *PredInst);
537 
538  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
539  /// represented as.
541 
542  /// Insert the new loop to the loop hierarchy and pass manager
543  /// and update the analysis passes.
544  void updateAnalysis();
545 
546  /// Create a broadcast instruction. This method generates a broadcast
547  /// instruction (shuffle) for loop invariant values and for the induction
548  /// value. If this is the induction variable, then we extend it to N, N+1, ...,
549  /// which is needed because each iteration in the loop corresponds to a SIMD
550  /// element.
551  virtual Value *getBroadcastInstrs(Value *V);
552 
553  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
554  /// to each vector element of Val. The sequence starts at StartIndex.
555  /// \p Opcode is relevant for FP induction variable.
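  /// For example (illustrative), with Val being a <4 x i32> splat of %iv,
  /// StartIdx = 0 and Step = 1, the result is <%iv, %iv+1, %iv+2, %iv+3>.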
556  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
557  Instruction::BinaryOps Opcode =
558  Instruction::BinaryOpsEnd);
559 
560  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
561  /// variable on which to base the steps, \p Step is the size of the step, and
562  /// \p EntryVal is the value from the original loop that maps to the steps.
563  /// Note that \p EntryVal doesn't have to be an induction variable - it
564  /// can also be a truncate instruction.
565  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
566  const InductionDescriptor &ID);
567 
568  /// Create a vector induction phi node based on an existing scalar one. \p
569  /// EntryVal is the value from the original loop that maps to the vector phi
570  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
571  /// truncate instruction, instead of widening the original IV, we widen a
572  /// version of the IV truncated to \p EntryVal's type.
574  Value *Step, Instruction *EntryVal);
575 
576  /// Returns true if an instruction \p I should be scalarized instead of
577  /// vectorized for the chosen vectorization factor.
579 
580  /// Returns true if we should generate a scalar version of \p IV.
581  bool needsScalarInduction(Instruction *IV) const;
582 
583  /// If there is a cast involved in the induction variable \p ID, which should
584  /// be ignored in the vectorized loop body, this function records the
585  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
586  /// cast. We had already proved that the casted Phi is equal to the uncasted
587  /// Phi in the vectorized loop (under a runtime guard), and therefore
588  /// there is no need to vectorize the cast - the same value can be used in the
589  /// vector loop for both the Phi and the cast.
590  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
591  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
592  ///
593  /// \p EntryVal is the value from the original loop that maps to the vector
594  /// phi node and is used to distinguish what is the IV currently being
595  /// processed - original one (if \p EntryVal is a phi corresponding to the
596  /// original IV) or the "newly-created" one based on the proof mentioned above
597  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
598  /// latter case \p EntryVal is a TruncInst and we must not record anything for
599  /// that IV, but it's error-prone to expect callers of this routine to care
600  /// about that, hence this explicit parameter.
602  const Instruction *EntryVal,
603  Value *VectorLoopValue,
604  unsigned Part,
605  unsigned Lane = UINT_MAX);
606 
607  /// Generate a shuffle sequence that will reverse the vector Vec.
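  /// For example (illustrative), a <4 x i32> vector is reversed with a
  /// shufflevector whose mask is <3, 2, 1, 0>.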
608  virtual Value *reverseVector(Value *Vec);
609 
610  /// Returns (and creates if needed) the original loop trip count.
611  Value *getOrCreateTripCount(Loop *NewLoop);
612 
613  /// Returns (and creates if needed) the trip count of the widened loop.
615 
616  /// Returns a bitcasted value to the requested vector type.
617  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
619  const DataLayout &DL);
620 
621  /// Emit a bypass check to see if the vector trip count is zero, including if
622  /// it overflows.
624 
625  /// Emit a bypass check to see if all of the SCEV assumptions we've
626  /// had to make are correct.
627  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
628 
629  /// Emit bypass checks to check any memory assumptions we may have made.
630  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
631 
632  /// Compute the transformed value of Index at offset StartValue using step
633  /// StepValue.
634  /// For integer induction, returns StartValue + Index * StepValue.
635  /// For pointer induction, returns StartValue[Index * StepValue].
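  /// For example (an illustrative sketch with hypothetical names), given
  /// StartValue %base, StepValue 4 and Index %i, an integer induction yields
  /// %base + %i * 4, while a pointer induction yields the address of
  /// %base[%i * 4].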
636  /// FIXME: The newly created binary instructions should contain nsw/nuw
637  /// flags, which can be found from the original scalar operations.
639  const DataLayout &DL,
640  const InductionDescriptor &ID) const;
641 
642  /// Add additional metadata to \p To that was not present on \p Orig.
643  ///
644  /// Currently this is used to add the noalias annotations based on the
645  /// inserted memchecks. Use this for instructions that are *cloned* into the
646  /// vector loop.
647  void addNewMetadata(Instruction *To, const Instruction *Orig);
648 
649  /// Add metadata from one instruction to another.
650  ///
651  /// This includes both the original MDs from \p From and additional ones (\see
652  /// addNewMetadata). Use this for *newly created* instructions in the vector
653  /// loop.
655 
656  /// Similar to the previous function but it adds the metadata to a
657  /// vector of instructions.
659 
660  /// The original loop.
662 
663  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
664  /// dynamic knowledge to simplify SCEV expressions and converts them to a
665  /// more usable form.
667 
668  /// Loop Info.
670 
671  /// Dominator Tree.
673 
674  /// Alias Analysis.
676 
677  /// Target Library Info.
679 
680  /// Target Transform Info.
682 
683  /// Assumption Cache.
685 
686  /// Interface to emit optimization remarks.
688 
689  /// LoopVersioning. It's only set up (non-null) if memchecks were
690  /// used.
691  ///
692  /// This is currently only used to add no-alias metadata based on the
693  /// memchecks. The actual versioning is performed manually.
694  std::unique_ptr<LoopVersioning> LVer;
695 
696  /// The vectorization SIMD factor to use. Each vector will have this many
697  /// vector elements.
698  unsigned VF;
699 
700  /// The vectorization unroll factor to use. Each scalar is vectorized to this
701  /// many different vector instructions.
702  unsigned UF;
703 
704  /// The builder that we use
706 
707  // --- Vectorization state ---
708 
709  /// The vector-loop preheader.
711 
712  /// The scalar-loop preheader.
714 
715  /// Middle Block between the vector and the scalar.
717 
718  /// The ExitBlock of the scalar loop.
720 
721  /// The vector loop body.
723 
724  /// The scalar loop body.
726 
727  /// A list of all bypass blocks. The first block is the entry of the loop.
729 
730  /// The new Induction variable which was added to the new block.
731  PHINode *Induction = nullptr;
732 
733  /// The induction variable of the old basic block.
734  PHINode *OldInduction = nullptr;
735 
736  /// Maps values from the original loop to their corresponding values in the
737  /// vectorized loop. A key value can map to either vector values, scalar
738  /// values or both kinds of values, depending on whether the key was
739  /// vectorized and scalarized.
741 
742  /// Store instructions that were predicated.
744 
745  /// Trip count of the original loop.
746  Value *TripCount = nullptr;
747 
748  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
749  Value *VectorTripCount = nullptr;
750 
751  /// The legality analysis.
753 
754  /// The profitability analysis.
756 
757  // Record whether runtime checks are added.
758  bool AddedSafetyChecks = false;
759 
760  // Holds the end values for each induction variable. We save the end values
761  // so we can later fix-up the external users of the induction variables.
763 
764  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
765  // fixed up at the end of vector code generation.
767 };
768 
769 class InnerLoopUnroller : public InnerLoopVectorizer {
770 public:
773  const TargetLibraryInfo *TLI,
775  OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
778  : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
779  UnrollFactor, LVL, CM) {}
780 
781 private:
782  Value *getBroadcastInstrs(Value *V) override;
783  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
784  Instruction::BinaryOps Opcode =
785  Instruction::BinaryOpsEnd) override;
786  Value *reverseVector(Value *Vec) override;
787 };
788 
789 } // end namespace llvm
790 
791 /// Look for a meaningful debug location on the instruction or its
792 /// operands.
794  if (!I)
795  return I;
796 
797  DebugLoc Empty;
798  if (I->getDebugLoc() != Empty)
799  return I;
800 
801  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
802  if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
803  if (OpInst->getDebugLoc() != Empty)
804  return OpInst;
805  }
806 
807  return I;
808 }
809 
811  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
812  const DILocation *DIL = Inst->getDebugLoc();
813  if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
814  !isa<DbgInfoIntrinsic>(Inst)) {
815  auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
816  if (NewDIL)
817  B.SetCurrentDebugLocation(NewDIL.getValue());
818  else
819  LLVM_DEBUG(dbgs()
820  << "Failed to create new discriminator: "
821  << DIL->getFilename() << " Line: " << DIL->getLine());
822  }
823  else
824  B.SetCurrentDebugLocation(DIL);
825  } else
826  B.SetCurrentDebugLocation(DebugLoc());
827 }
828 
829 /// Write a record \p DebugMsg about vectorization failure to the debug
830 /// output stream. If \p I is passed, it is an instruction that prevents
831 /// vectorization.
832 #ifndef NDEBUG
833 static void debugVectorizationFailure(const StringRef DebugMsg,
834  Instruction *I) {
835  dbgs() << "LV: Not vectorizing: " << DebugMsg;
836  if (I != nullptr)
837  dbgs() << " " << *I;
838  else
839  dbgs() << '.';
840  dbgs() << '\n';
841 }
842 #endif
843 
844 /// Create an analysis remark that explains why vectorization failed
845 ///
846 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
847 /// RemarkName is the identifier for the remark. If \p I is passed it is an
848 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for
849 /// the location of the remark. \return the remark object that can be
850 /// streamed to.
851 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
852  StringRef RemarkName, Loop *TheLoop, Instruction *I) {
853  Value *CodeRegion = TheLoop->getHeader();
854  DebugLoc DL = TheLoop->getStartLoc();
855 
856  if (I) {
857  CodeRegion = I->getParent();
858  // If there is no debug location attached to the instruction, revert back to
859  // using the loop's.
860  if (I->getDebugLoc())
861  DL = I->getDebugLoc();
862  }
863 
864  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
865  R << "loop not vectorized: ";
866  return R;
867 }
868 
869 namespace llvm {
870 
872  const StringRef OREMsg, const StringRef ORETag,
875  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
877  ORETag, TheLoop, I) << OREMsg);
878 }
879 
880 } // end namespace llvm
881 
882 #ifndef NDEBUG
883 /// \return string containing a file name and a line # for the given loop.
884 static std::string getDebugLocString(const Loop *L) {
885  std::string Result;
886  if (L) {
887  raw_string_ostream OS(Result);
888  if (const DebugLoc LoopDbgLoc = L->getStartLoc())
889  LoopDbgLoc.print(OS);
890  else
891  // Just print the module name.
892  OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
893  OS.flush();
894  }
895  return Result;
896 }
897 #endif
898 
900  const Instruction *Orig) {
901  // If the loop was versioned with memchecks, add the corresponding no-alias
902  // metadata.
903  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
904  LVer->annotateInstWithNoAlias(To, Orig);
905 }
906 
908  Instruction *From) {
909  propagateMetadata(To, From);
910  addNewMetadata(To, From);
911 }
912 
914  Instruction *From) {
915  for (Value *V : To) {
916  if (Instruction *I = dyn_cast<Instruction>(V))
917  addMetadata(I, From);
918  }
919 }
920 
921 namespace llvm {
922 
923 // Loop vectorization cost-model hints for how the scalar epilogue loop should
924 // be lowered.
926 
927  // The default: allowing scalar epilogues.
929 
930  // Vectorization with OptForSize: don't allow epilogues.
932 
933  // A special case of vectorization with OptForSize: loops with a very small
934  // trip count are considered for vectorization under OptForSize, thereby
935  // making sure the cost of their loop body is dominant, free of runtime
936  // guards and scalar iteration overheads.
938 
939  // Loop hint predicate indicating an epilogue is undesired.
941 };
942 
943 /// LoopVectorizationCostModel - estimates the expected speedups due to
944 /// vectorization.
945 /// In many cases vectorization is not profitable. This can happen because of
946 /// a number of reasons. In this class we mainly attempt to predict the
947 /// expected speedup/slowdowns due to the supported instruction set. We use the
948 /// TargetTransformInfo to query the different backends for the cost of
949 /// different operations.
950 class LoopVectorizationCostModel {
951 public:
955  const TargetTransformInfo &TTI,
956  const TargetLibraryInfo *TLI, DemandedBits *DB,
959  const LoopVectorizeHints *Hints,
961  : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
962  TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
963  Hints(Hints), InterleaveInfo(IAI) {}
964 
965  /// \return An upper bound for the vectorization factor, or None if
966  /// vectorization and interleaving should be avoided up front.
967  Optional<unsigned> computeMaxVF();
968 
969  /// \return True if runtime checks are required for vectorization, and false
970  /// otherwise.
971  bool runtimeChecksRequired();
972 
973  /// \return The most profitable vectorization factor and the cost of that VF.
974  /// This method checks every power of two up to MaxVF. If UserVF is nonzero,
975  /// that vectorization factor is selected, provided vectorization is
976  /// possible.
977  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
978 
979  /// Setup cost-based decisions for user vectorization factor.
980  void selectUserVectorizationFactor(unsigned UserVF) {
981  collectUniformsAndScalars(UserVF);
982  collectInstsToScalarize(UserVF);
983  }
984 
985  /// \return The size (in bits) of the smallest and widest types in the code
986  /// that needs to be vectorized. We ignore values that remain scalar such as
987  /// 64 bit loop indices.
988  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
989 
990  /// \return The desired interleave count.
991  /// If interleave count has been specified by metadata it will be returned.
992  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
993  /// are the selected vectorization factor and the cost of the selected VF.
994  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
995 
996  /// A memory access instruction may be vectorized in more than one way; the
997  /// form it takes after vectorization depends on cost. This function makes
998  /// cost-based decisions for Load/Store instructions and collects them in a
999  /// map. This decision map is used for building the lists of loop-uniform and
1000  /// loop-scalar instructions. The calculated cost is saved with the widening
1001  /// decision in order to avoid redundant calculations.
1003  void setCostBasedWideningDecision(unsigned VF);
1004 
1005  /// A struct that represents some properties of the register usage
1006  /// of a loop.
1007  struct RegisterUsage {
1008  /// Holds the number of loop invariant values that are used in the loop.
1009  /// The key is ClassID of target-provided register class.
1011  /// Holds the maximum number of concurrent live intervals in the loop.
1012  /// The key is ClassID of target-provided register class.
1014  };
1015 
1016  /// \return Returns information about the register usages of the loop for the
1017  /// given vectorization factors.
1018  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1019 
1020  /// Collect values we want to ignore in the cost model.
1021  void collectValuesToIgnore();
1022 
1023  /// \returns The smallest bitwidth each instruction can be represented with.
1024  /// The vector equivalents of these instructions should be truncated to this
1025  /// type.
1027  return MinBWs;
1028  }
1029 
1030  /// \returns True if it is more profitable to scalarize instruction \p I for
1031  /// vectorization factor \p VF.
1032  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1033  assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1034 
1035  // Cost model is not run in the VPlan-native path - return conservative
1036  // result until this changes.
1038  return false;
1039 
1040  auto Scalars = InstsToScalarize.find(VF);
1041  assert(Scalars != InstsToScalarize.end() &&
1042  "VF not yet analyzed for scalarization profitability");
1043  return Scalars->second.find(I) != Scalars->second.end();
1044  }
1045 
1046  /// Returns true if \p I is known to be uniform after vectorization.
1047  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1048  if (VF == 1)
1049  return true;
1050 
1051  // Cost model is not run in the VPlan-native path - return conservative
1052  // result until this changes.
1054  return false;
1055 
1056  auto UniformsPerVF = Uniforms.find(VF);
1057  assert(UniformsPerVF != Uniforms.end() &&
1058  "VF not yet analyzed for uniformity");
1059  return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1060  }
1061 
1062  /// Returns true if \p I is known to be scalar after vectorization.
1063  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1064  if (VF == 1)
1065  return true;
1066 
1067  // Cost model is not run in the VPlan-native path - return conservative
1068  // result until this changes.
1070  return false;
1071 
1072  auto ScalarsPerVF = Scalars.find(VF);
1073  assert(ScalarsPerVF != Scalars.end() &&
1074  "Scalar values are not calculated for VF");
1075  return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1076  }
1077 
1078  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1079  /// for vectorization factor \p VF.
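  /// For example (illustrative), an i32 add whose result is known to need only
  /// 8 bits (per MinBWs) can be performed on <VF x i8> instead of <VF x i32>.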
1080  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1081  return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1082  !isProfitableToScalarize(I, VF) &&
1083  !isScalarAfterVectorization(I, VF);
1084  }
1085 
1086  /// Decision that was taken during cost calculation for memory instruction.
1089  CM_Widen, // For consecutive accesses with stride +1.
1090  CM_Widen_Reverse, // For consecutive accesses with stride -1.
1093  CM_Scalarize
1094  };
1095 
1096  /// Save vectorization decision \p W and \p Cost taken by the cost model for
1097  /// instruction \p I and vector width \p VF.
1099  unsigned Cost) {
1100  assert(VF >= 2 && "Expected VF >=2");
1101  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1102  }
1103 
1104  /// Save vectorization decision \p W and \p Cost taken by the cost model for
1105  /// interleaving group \p Grp and vector width \p VF.
1107  InstWidening W, unsigned Cost) {
1108  assert(VF >= 2 && "Expected VF >=2");
1109  /// Broadcast this decision to all instructions inside the group.
1110  /// But the cost will be assigned to one instruction only.
1111  for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1112  if (auto *I = Grp->getMember(i)) {
1113  if (Grp->getInsertPos() == I)
1114  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1115  else
1116  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1117  }
1118  }
1119  }
1120 
1121  /// Return the cost model decision for the given instruction \p I and vector
1122  /// width \p VF. Return CM_Unknown if this instruction did not pass
1123  /// through the cost modeling.
1125  assert(VF >= 2 && "Expected VF >=2");
1126 
1127  // Cost model is not run in the VPlan-native path - return conservative
1128  // result until this changes.
1130  return CM_GatherScatter;
1131 
1132  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1133  auto Itr = WideningDecisions.find(InstOnVF);
1134  if (Itr == WideningDecisions.end())
1135  return CM_Unknown;
1136  return Itr->second.first;
1137  }
1138 
1139  /// Return the vectorization cost for the given instruction \p I and vector
1140  /// width \p VF.
1141  unsigned getWideningCost(Instruction *I, unsigned VF) {
1142  assert(VF >= 2 && "Expected VF >=2");
1143  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1144  assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1145  "The cost is not calculated");
1146  return WideningDecisions[InstOnVF].second;
1147  }
1148 
1149  /// Return True if instruction \p I is an optimizable truncate whose operand
1150  /// is an induction variable. Such a truncate will be removed by adding a new
1151  /// induction variable with the destination type.
1152  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1153  // If the instruction is not a truncate, return false.
1154  auto *Trunc = dyn_cast<TruncInst>(I);
1155  if (!Trunc)
1156  return false;
1157 
1158  // Get the source and destination types of the truncate.
1159  Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1160  Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1161 
1162  // If the truncate is free for the given types, return false. Replacing a
1163  // free truncate with an induction variable would add an induction variable
1164  // update instruction to each iteration of the loop. We exclude from this
1165  // check the primary induction variable since it will need an update
1166  // instruction regardless.
1167  Value *Op = Trunc->getOperand(0);
1168  if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1169  return false;
1170 
1171  // If the truncated value is not an induction variable, return false.
1172  return Legal->isInductionPhi(Op);
1173  }
1174 
1175  /// Collects the instructions to scalarize for each predicated instruction in
1176  /// the loop.
1177  void collectInstsToScalarize(unsigned VF);
1178 
1179  /// Collect Uniform and Scalar values for the given \p VF.
1180  /// The sets depend on CM decision for Load/Store instructions
1181  /// that may be vectorized as interleave, gather-scatter or scalarized.
1182  void collectUniformsAndScalars(unsigned VF) {
1183  // Do the analysis once.
1184  if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1185  return;
1186  setCostBasedWideningDecision(VF);
1187  collectLoopUniforms(VF);
1188  collectLoopScalars(VF);
1189  }
1190 
1191  /// Returns true if the target machine supports masked store operation
1192  /// for the given \p DataType and kind of access to \p Ptr.
1194  return Legal->isConsecutivePtr(Ptr) &&
1195  TTI.isLegalMaskedStore(DataType, Alignment);
1196  }
1197 
1198  /// Returns true if the target machine supports masked load operation
1199  /// for the given \p DataType and kind of access to \p Ptr.
1200  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1201  return Legal->isConsecutivePtr(Ptr) &&
1202  TTI.isLegalMaskedLoad(DataType, Alignment);
1203  }
1204 
1205  /// Returns true if the target machine supports masked scatter operation
1206  /// for the given \p DataType.
1208  return TTI.isLegalMaskedScatter(DataType);
1209  }
1210 
1211  /// Returns true if the target machine supports masked gather operation
1212  /// for the given \p DataType.
1214  return TTI.isLegalMaskedGather(DataType);
1215  }
1216 
1217  /// Returns true if the target machine can represent \p V as a masked gather
1218  /// or scatter operation.
1220  bool LI = isa<LoadInst>(V);
1221  bool SI = isa<StoreInst>(V);
1222  if (!LI && !SI)
1223  return false;
1224  auto *Ty = getMemInstValueType(V);
1225  return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1226  }
1227 
1228  /// Returns true if \p I is an instruction that will be scalarized with
1229  /// predication. Such instructions include conditional stores and
1230  /// instructions that may divide by zero.
1231  /// If a non-zero VF has been calculated, we check if I will be scalarized with
1232  /// predication for that VF.
1233  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1234 
1235  // Returns true if \p I is an instruction that will be predicated either
1236  // through scalar predication or masked load/store or masked gather/scatter.
1237  // Superset of instructions that return true for isScalarWithPredication.
1239  if (!blockNeedsPredication(I->getParent()))
1240  return false;
1241  // Loads and stores that need some form of masked operation are predicated
1242  // instructions.
1243  if (isa<LoadInst>(I) || isa<StoreInst>(I))
1244  return Legal->isMaskRequired(I);
1245  return isScalarWithPredication(I);
1246  }
1247 
1248  /// Returns true if \p I is a memory instruction with consecutive memory
1249  /// access that can be widened.
1250  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1251 
1252  /// Returns true if \p I is a memory instruction in an interleaved-group
1253  /// of memory accesses that can be vectorized with wide vector loads/stores
1254  /// and shuffles.
1255  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1256 
1257  /// Check if \p Instr belongs to any interleaved access group.
1259  return InterleaveInfo.isInterleaved(Instr);
1260  }
1261 
1262  /// Get the interleaved access group that \p Instr belongs to.
1265  return InterleaveInfo.getInterleaveGroup(Instr);
1266  }
1267 
1268  /// Returns true if an interleaved group requires a scalar iteration
1269  /// to handle accesses with gaps, and there is nothing preventing us from
1270  /// creating a scalar epilogue.
1271  bool requiresScalarEpilogue() const {
1272  return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1273  }
1274 
1275  /// Returns true if a scalar epilogue is not allowed due to optsize or a
1276  /// loop hint annotation.
1278  return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1279  }
1280 
1281  /// Returns true if all loop blocks should be masked to fold tail loop.
1282  bool foldTailByMasking() const { return FoldTailByMasking; }
1283 
1285  return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1286  }
1287 
1288  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1289  /// with factor VF. Return the cost of the instruction, including
1290  /// scalarization overhead if it's needed.
1291  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1292 
1293  /// Estimate cost of a call instruction CI if it were vectorized with factor
1294  /// VF. Return the cost of the instruction, including scalarization overhead
1295  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1296  /// scalarized, i.e., either a vector version isn't available or it is too
1297  /// expensive.
1298  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1299 
1300 private:
1301  unsigned NumPredStores = 0;
1302 
1303  /// \return An upper bound for the vectorization factor, larger than zero.
1304  /// One is returned if vectorization should best be avoided due to cost.
1305  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1306 
1307  /// The vectorization cost is a combination of the cost itself and a boolean
1308  /// indicating whether any of the contributing operations will actually
1309  /// operate on vector values after type legalization in the backend. If this
1310  /// latter value is false, then all operations will be scalarized (i.e., no
1311  /// vectorization has actually taken place).
1314  using VectorizationCostTy = std::pair<unsigned, bool>;
1315 
1316  /// Returns the expected execution cost. The unit of the cost does
1317  /// not matter because we use the 'cost' units to compare different
1318  /// vector widths. The cost that is returned is *not* normalized by
1319  /// the factor width.
1320  VectorizationCostTy expectedCost(unsigned VF);
1321 
1322  /// Returns the execution time cost of an instruction for a given vector
1323  /// width. Vector width of one means scalar.
1324  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1325 
1326  /// The cost-computation logic from getInstructionCost which provides
1327  /// the vector type as an output parameter.
1328  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1329 
1330  /// Calculate vectorization cost of memory instruction \p I.
1331  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1332 
1333  /// The cost computation for scalarized memory instruction.
1334  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1335 
1336  /// The cost computation for interleaving group of memory instructions.
1337  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1338 
1339  /// The cost computation for Gather/Scatter instruction.
1340  unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1341 
1342  /// The cost computation for widening instruction \p I with consecutive
1343  /// memory access.
1344  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1345 
1346  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1347  /// Load: scalar load + broadcast.
1348  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1349  /// element)
1350  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1351 
1352  /// Estimate the overhead of scalarizing an instruction. This is a
1353  /// convenience wrapper for the type-based getScalarizationOverhead API.
1354  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1355 
1356  /// Returns whether the instruction is a load or store and will be emitted
1357  /// as a vector operation.
1358  bool isConsecutiveLoadOrStore(Instruction *I);
1359 
1360  /// Returns true if an artificially high cost for emulated masked memrefs
1361  /// should be used.
1362  bool useEmulatedMaskMemRefHack(Instruction *I);
1363 
1364  /// Map of scalar integer values to the smallest bitwidth they can be legally
1365  /// represented as. The vector equivalents of these values should be truncated
1366  /// to this type.
1368 
1369  /// A type representing the costs for instructions if they were to be
1370  /// scalarized rather than vectorized. The entries are Instruction-Cost
1371  /// pairs.
1373 
1374  /// A set containing all BasicBlocks that are known to be present after
1375  /// vectorization as predicated blocks.
1376  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1377 
1378  /// Records whether it is allowed to have the original scalar loop execute at
1379  /// least once. This may be needed as a fallback loop in case runtime
1380  /// aliasing/dependence checks fail, or to handle the tail/remainder
1381  /// iterations when the trip count is unknown or doesn't divide by the VF,
1382  /// or as a peel-loop to handle gaps in interleave-groups.
1383  /// Under optsize and when the trip count is very small we don't allow any
1384  /// iterations to execute in the scalar loop.
1385  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1386 
1387  /// All blocks of loop are to be masked to fold tail of scalar iterations.
1388  bool FoldTailByMasking = false;
1389 
1390  /// A map holding scalar costs for different vectorization factors. The
1391  /// presence of a cost for an instruction in the mapping indicates that the
1392  /// instruction will be scalarized when vectorizing with the associated
1393  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1394  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1395 
1396  /// Holds the instructions known to be uniform after vectorization.
1397  /// The data is collected per VF.
1399 
1400  /// Holds the instructions known to be scalar after vectorization.
1401  /// The data is collected per VF.
1403 
1404  /// Holds the instructions (address computations) that are forced to be
1405  /// scalarized.
1407 
1408  /// Returns the expected difference in cost from scalarizing the expression
1409  /// feeding a predicated instruction \p PredInst. The instructions to
1410  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1411  /// non-negative return value implies the expression will be scalarized.
1412  /// Currently, only single-use chains are considered for scalarization.
1413  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1414  unsigned VF);
1415 
1416  /// Collect the instructions that are uniform after vectorization. An
1417  /// instruction is uniform if we represent it with a single scalar value in
1418  /// the vectorized loop corresponding to each vector iteration. Examples of
1419  /// uniform instructions include pointer operands of consecutive or
1420  /// interleaved memory accesses. Note that although uniformity implies an
1421  /// instruction will be scalar, the reverse is not true. In general, a
1422  /// scalarized instruction will be represented by VF scalar values in the
1423  /// vectorized loop, each corresponding to an iteration of the original
1424  /// scalar loop.
1425  void collectLoopUniforms(unsigned VF);
1426 
1427  /// Collect the instructions that are scalar after vectorization. An
1428  /// instruction is scalar if it is known to be uniform or will be scalarized
1429  /// during vectorization. Non-uniform scalarized instructions will be
1430  /// represented by VF values in the vectorized loop, each corresponding to an
1431  /// iteration of the original scalar loop.
1432  void collectLoopScalars(unsigned VF);
1433 
1434  /// Keeps cost model vectorization decision and cost for instructions.
1435  /// Right now it is used for memory instructions only.
1437  std::pair<InstWidening, unsigned>>;
1438 
1439  DecisionList WideningDecisions;
1440 
1441  /// Returns true if \p V is expected to be vectorized and it needs to be
1442  /// extracted.
1443  bool needsExtract(Value *V, unsigned VF) const {
1444  Instruction *I = dyn_cast<Instruction>(V);
1445  if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1446  return false;
1447 
1448  // Assume we can vectorize V (and hence we need extraction) if the
1449  // scalars are not computed yet. This can happen, because it is called
1450  // via getScalarizationOverhead from setCostBasedWideningDecision, before
1451  // the scalars are collected. That should be a safe assumption in most
1452  // cases, because we check if the operands have vectorizable types
1453  // beforehand in LoopVectorizationLegality.
1454  return Scalars.find(VF) == Scalars.end() ||
1455  !isScalarAfterVectorization(I, VF);
1456  };
1457 
1458  /// Returns a range containing only operands needing to be extracted.
1459  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1460  unsigned VF) {
1461  return SmallVector<Value *, 4>(make_filter_range(
1462  Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1463  }
1464 
1465 public:
1466  /// The loop that we evaluate.
1467  Loop *TheLoop;
1468 
1469  /// Predicated scalar evolution analysis.
1470  PredicatedScalarEvolution &PSE;
1471 
1472  /// Loop Info analysis.
1473  LoopInfo *LI;
1474 
1475  /// Vectorization legality.
1476  LoopVectorizationLegality *Legal;
1477 
1478  /// Vector target information.
1479  const TargetTransformInfo &TTI;
1480 
1481  /// Target Library Info.
1482  const TargetLibraryInfo *TLI;
1483 
1484  /// Demanded bits analysis.
1485  DemandedBits *DB;
1486 
1487  /// Assumption cache.
1488  AssumptionCache *AC;
1489 
1490  /// Interface to emit optimization remarks.
1491  OptimizationRemarkEmitter *ORE;
1492 
1493  const Function *TheFunction;
1494 
1495  /// Loop Vectorize Hint.
1496  const LoopVectorizeHints *Hints;
1497 
1498  /// The interleave access information contains groups of interleaved accesses
1499  /// with the same stride and close to each other.
1500  InterleavedAccessInfo &InterleaveInfo;
1501 
1502  /// Values to ignore in the cost model.
1503  SmallPtrSet<const Value *, 16> ValuesToIgnore;
1504 
1505  /// Values to ignore in the cost model when VF > 1.
1506  SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1507 };
1508 
1509 } // end namespace llvm
1510 
1511 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1512 // vectorization. The loop needs to be annotated with #pragma omp simd
1513 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1514 // vector length information is not provided, vectorization is not considered
1515 // explicit. Interleave hints are not allowed either. These limitations will be
1516 // relaxed in the future.
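// For example, an outer loop annotated at the source level roughly as follows
// (illustrative sketch only) would be considered for explicit vectorization:
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (i = 0; i < N; ++i)     // explicitly annotated outer loop
//     for (j = 0; j < M; ++j)   // inner loop
//       A[i][j] += B[i][j];
//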
1517 // Please note that we are currently forced to abuse the pragma 'clang
1518 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1519 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1520 // provides *explicit vectorization hints* (LV can bypass legal checks and
1521 // assume that vectorization is legal). However, both hints are implemented
1522 // using the same metadata (llvm.loop.vectorize, processed by
1523 // LoopVectorizeHints). This will be fixed in the future when the native IR
1524 // representation for pragma 'omp simd' is introduced.
1525 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1526  OptimizationRemarkEmitter *ORE) {
1527  assert(!OuterLp->empty() && "This is not an outer loop");
1528  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1529 
1530  // Only outer loops with an explicit vectorization hint are supported.
1531  // Unannotated outer loops are ignored.
1532  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1533  return false;
1534 
1535  Function *Fn = OuterLp->getHeader()->getParent();
1536  if (!Hints.allowVectorization(Fn, OuterLp,
1537  true /*VectorizeOnlyWhenForced*/)) {
1538  LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1539  return false;
1540  }
1541 
1542  if (Hints.getInterleave() > 1) {
1543  // TODO: Interleave support is future work.
1544  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1545  "outer loops.\n");
1546  Hints.emitRemarkWithHints();
1547  return false;
1548  }
1549 
1550  return true;
1551 }
1552 
1553 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1554  OptimizationRemarkEmitter *ORE,
1555  SmallVectorImpl<Loop *> &V) {
1556  // Collect inner loops and outer loops without irreducible control flow. For
1557  // now, only collect outer loops that have explicit vectorization hints. If we
1558  // are stress testing the VPlan H-CFG construction, we collect the outermost
1559  // loop of every loop nest.
1560  if (L.empty() || VPlanBuildStressTest ||
1561  (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1562  LoopBlocksRPO RPOT(&L);
1563  RPOT.perform(LI);
1564  if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1565  V.push_back(&L);
1566  // TODO: Collect inner loops inside marked outer loops in case
1567  // vectorization fails for the outer loop. Do not invoke
1568  // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1569  // already known to be reducible. We can use an inherited attribute for
1570  // that.
1571  return;
1572  }
1573  }
1574  for (Loop *InnerL : L)
1575  collectSupportedLoops(*InnerL, LI, ORE, V);
1576 }
1577 
1578 namespace {
1579 
1580 /// The LoopVectorize Pass.
1581 struct LoopVectorize : public FunctionPass {
1582  /// Pass identification, replacement for typeid
1583  static char ID;
1584 
1585  LoopVectorizePass Impl;
1586 
1587  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1588  bool VectorizeOnlyWhenForced = false)
1589  : FunctionPass(ID) {
1590  Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1591  Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1592  initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1593  }
1594 
1595  bool runOnFunction(Function &F) override {
1596  if (skipFunction(F))
1597  return false;
1598 
1599  auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1600  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1601  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1602  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1603  auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1604  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1605  auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1606  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1607  auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1608  auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1609  auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1610  auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1611  auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1612 
1613  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1614  [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1615 
1616  return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1617  GetLAA, *ORE, PSI);
1618  }
1619 
1620  void getAnalysisUsage(AnalysisUsage &AU) const override {
1631 
1632  // We currently do not preserve loopinfo/dominator analyses with outer loop
1633  // vectorization. Until this is addressed, mark these analyses as preserved
1634  // only for non-VPlan-native path.
1635  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1636  if (!EnableVPlanNativePath) {
1637  AU.addPreserved<LoopInfoWrapperPass>();
1638  AU.addPreserved<DominatorTreeWrapperPass>();
1639  }
1640 
1644  }
1645 };
1646 
1647 } // end anonymous namespace
1648 
1649 //===----------------------------------------------------------------------===//
1650 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1651 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1652 //===----------------------------------------------------------------------===//
1653 
1654 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1655  // We need to place the broadcast of invariant variables outside the loop,
1656  // but only if it's proven safe to do so. Else, broadcast will be inside
1657  // vector loop body.
1658  Instruction *Instr = dyn_cast<Instruction>(V);
1659  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1660  (!Instr ||
1661  DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1662  // Place the code for broadcasting invariant variables in the new preheader.
1663  IRBuilder<>::InsertPointGuard Guard(Builder);
1664  if (SafeToHoist)
1665  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1666 
1667  // Broadcast the scalar into all locations in the vector.
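 // Conceptually, for VF = 4 and an i32 scalar %x, the splat below expands to
 // IR along these lines (illustrative sketch; exact value names differ):
 //   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
 //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
 //                                    <4 x i32> undef, <4 x i32> zeroinitializer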
1668  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1669 
1670  return Shuf;
1671 }
1672 
1673 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1674  const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1675  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1676  "Expected either an induction phi-node or a truncate of it!");
1677  Value *Start = II.getStartValue();
1678 
1679  // Construct the initial value of the vector IV in the vector loop preheader
1680  auto CurrIP = Builder.saveIP();
1682  if (isa<TruncInst>(EntryVal)) {
1683  assert(Start->getType()->isIntegerTy() &&
1684  "Truncation requires an integer type");
1685  auto *TruncType = cast<IntegerType>(EntryVal->getType());
1686  Step = Builder.CreateTrunc(Step, TruncType);
1687  Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1688  }
1689  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1690  Value *SteppedStart =
1691  getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1692 
1693  // We create vector phi nodes for both integer and floating-point induction
1694  // variables. Here, we determine the kind of arithmetic we will perform.
1695  Instruction::BinaryOps AddOp;
1696  Instruction::BinaryOps MulOp;
1697  if (Step->getType()->isIntegerTy()) {
1698  AddOp = Instruction::Add;
1699  MulOp = Instruction::Mul;
1700  } else {
1701  AddOp = II.getInductionOpcode();
1702  MulOp = Instruction::FMul;
1703  }
1704 
1705  // Multiply the vectorization factor by the step using integer or
1706  // floating-point arithmetic as appropriate.
1707  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1708  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1709 
1710  // Create a vector splat to use in the induction update.
1711  //
1712  // FIXME: If the step is non-constant, we create the vector splat with
1713  // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1714  // handle a constant vector splat.
1715  Value *SplatVF = isa<Constant>(Mul)
1716  ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1717  : Builder.CreateVectorSplat(VF, Mul);
1718  Builder.restoreIP(CurrIP);
1719 
1720  // We may need to add the step a number of times, depending on the unroll
1721  // factor. The last of those goes into the PHI.
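 // For example, with VF = 4, UF = 2 and an integer step of 1, part 0 holds
 // <i, i+1, i+2, i+3> and part 1 holds the same vector plus the splat
 // <4, 4, 4, 4>; the value produced by the final step.add feeds the PHI on the
 // backedge (illustrative sketch).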
1722  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1724  VecInd->setDebugLoc(EntryVal->getDebugLoc());
1725  Instruction *LastInduction = VecInd;
1726  for (unsigned Part = 0; Part < UF; ++Part) {
1727  VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1728 
1729  if (isa<TruncInst>(EntryVal))
1730  addMetadata(LastInduction, EntryVal);
1731  recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1732 
1733  LastInduction = cast<Instruction>(addFastMathFlag(
1734  Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1735  LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1736  }
1737 
1738  // Move the last step to the end of the latch block. This ensures consistent
1739  // placement of all induction updates.
1740  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1741  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1742  auto *ICmp = cast<Instruction>(Br->getCondition());
1743  LastInduction->moveBefore(ICmp);
1744  LastInduction->setName("vec.ind.next");
1745 
1746  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1747  VecInd->addIncoming(LastInduction, LoopVectorLatch);
1748 }
1749 
1750 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1751  return Cost->isScalarAfterVectorization(I, VF) ||
1752  Cost->isProfitableToScalarize(I, VF);
1753  }
1754 
1755 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1756  if (shouldScalarizeInstruction(IV))
1757  return true;
1758  auto isScalarInst = [&](User *U) -> bool {
1759  auto *I = cast<Instruction>(U);
1760  return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1761  };
1762  return llvm::any_of(IV->users(), isScalarInst);
1763 }
1764 
1765 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1766  const InductionDescriptor &ID, const Instruction *EntryVal,
1767  Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1768  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1769  "Expected either an induction phi-node or a truncate of it!");
1770 
1771  // This induction variable is not the phi from the original loop but the
1772  // newly-created IV based on the proof that the casted Phi is equal to the
1773  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
1774  // reuses the same InductionDescriptor that the original IV uses, but we
1775  // don't have to do any recording in this case - that is done when the
1776  // original IV is processed.
1777  if (isa<TruncInst>(EntryVal))
1778  return;
1779 
1780  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1781  if (Casts.empty())
1782  return;
1783  // Only the first Cast instruction in the Casts vector is of interest.
1784  // The rest of the Casts (if exist) have no uses outside the
1785  // induction update chain itself.
1786  Instruction *CastInst = *Casts.begin();
1787  if (Lane < UINT_MAX)
1788  VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1789  else
1790  VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1791 }
1792 
1793 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1794  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1795  "Primary induction variable must have an integer type");
1796 
1797  auto II = Legal->getInductionVars()->find(IV);
1798  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1799 
1800  auto ID = II->second;
1801  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1802 
1803  // The scalar value to broadcast. This will be derived from the canonical
1804  // induction variable.
1805  Value *ScalarIV = nullptr;
1806 
1807  // The value from the original loop to which we are mapping the new induction
1808  // variable.
1809  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1810 
1811  // True if we have vectorized the induction variable.
1812  auto VectorizedIV = false;
1813 
1814  // Determine if we want a scalar version of the induction variable. This is
1815  // true if the induction variable itself is not widened, or if it has at
1816  // least one user in the loop that is not widened.
1817  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1818 
1819  // Generate code for the induction step. Note that induction steps are
1820  // required to be loop-invariant
1821  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1822  "Induction step should be loop invariant");
1823  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1824  Value *Step = nullptr;
1825  if (PSE.getSE()->isSCEVable(IV->getType())) {
1826  SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1827  Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1828  &*Builder.GetInsertPoint());
1829  } else {
1830  Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1831  }
1832 
1833  // Try to create a new independent vector induction variable. If we can't
1834  // create the phi node, we will splat the scalar induction variable in each
1835  // loop iteration.
1836  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1837  createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1838  VectorizedIV = true;
1839  }
1840 
1841  // If we haven't yet vectorized the induction variable, or if we will create
1842  // a scalar one, we need to define the scalar induction variable and step
1843  // values. If we were given a truncation type, truncate the canonical
1844  // induction variable and step. Otherwise, derive these values from the
1845  // induction descriptor.
1846  if (!VectorizedIV || NeedsScalarIV) {
1847  ScalarIV = Induction;
1848  if (IV != OldInduction) {
1849  ScalarIV = IV->getType()->isIntegerTy()
1850  ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1851  : Builder.CreateCast(Instruction::SIToFP, Induction,
1852  IV->getType());
1853  ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1854  ScalarIV->setName("offset.idx");
1855  }
1856  if (Trunc) {
1857  auto *TruncType = cast<IntegerType>(Trunc->getType());
1858  assert(Step->getType()->isIntegerTy() &&
1859  "Truncation requires an integer step");
1860  ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1861  Step = Builder.CreateTrunc(Step, TruncType);
1862  }
1863  }
1864 
1865  // If we haven't yet vectorized the induction variable, splat the scalar
1866  // induction variable, and build the necessary step vectors.
1867  // TODO: Don't do it unless the vectorized IV is really required.
1868  if (!VectorizedIV) {
1869  Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1870  for (unsigned Part = 0; Part < UF; ++Part) {
1871  Value *EntryPart =
1872  getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1873  VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1874  if (Trunc)
1875  addMetadata(EntryPart, Trunc);
1876  recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1877  }
1878  }
1879 
1880  // If an induction variable is only used for counting loop iterations or
1881  // calculating addresses, it doesn't need to be widened. Create scalar steps
1882  // that can be used by instructions we will later scalarize. Note that the
1883  // addition of the scalar steps will not increase the number of instructions
1884  // in the loop in the common case prior to InstCombine. We will be trading
1885  // one vector extract for each scalar step.
1886  if (NeedsScalarIV)
1887  buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1888 }
1889 
1890 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1891  Instruction::BinaryOps BinOp) {
1892  // Create and check the types.
1893  assert(Val->getType()->isVectorTy() && "Must be a vector");
1894  int VLen = Val->getType()->getVectorNumElements();
1895 
1896  Type *STy = Val->getType()->getScalarType();
1897  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1898  "Induction Step must be an integer or FP");
1899  assert(Step->getType() == STy && "Step has wrong type");
1900 
1901  SmallVector<Constant *, 8> Indices;
1902 
1903  if (STy->isIntegerTy()) {
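 // For example, with VLen = 4, StartIdx = 2 and Step = s, this computes
 // Val + <2, 3, 4, 5> * s lane-wise (illustrative sketch).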
1904  // Create a vector of consecutive numbers from zero to VF.
1905  for (int i = 0; i < VLen; ++i)
1906  Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1907 
1908  // Add the consecutive indices to the vector value.
1909  Constant *Cv = ConstantVector::get(Indices);
1910  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1911  Step = Builder.CreateVectorSplat(VLen, Step);
1912  assert(Step->getType() == Val->getType() && "Invalid step vec");
1913  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1914  // which can be found from the original scalar operations.
1915  Step = Builder.CreateMul(Cv, Step);
1916  return Builder.CreateAdd(Val, Step, "induction");
1917  }
1918 
1919  // Floating point induction.
1920  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1921  "Binary Opcode should be specified for FP induction");
1922  // Create a vector of consecutive numbers from zero to VF.
1923  for (int i = 0; i < VLen; ++i)
1924  Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1925 
1926  // Add the consecutive indices to the vector value.
1927  Constant *Cv = ConstantVector::get(Indices);
1928 
1929  Step = Builder.CreateVectorSplat(VLen, Step);
1930 
1931  // Floating point operations had to be 'fast' to enable the induction.
1932  FastMathFlags Flags;
1933  Flags.setFast();
1934 
1935  Value *MulOp = Builder.CreateFMul(Cv, Step);
1936  if (isa<Instruction>(MulOp))
1937  // Have to check, MulOp may be a constant
1938  cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1939 
1940  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1941  if (isa<Instruction>(BOp))
1942  cast<Instruction>(BOp)->setFastMathFlags(Flags);
1943  return BOp;
1944 }
1945 
1946 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1947  Instruction *EntryVal,
1948  const InductionDescriptor &ID) {
1949  // We shouldn't have to build scalar steps if we aren't vectorizing.
1950  assert(VF > 1 && "VF should be greater than one");
1951 
1952  // Get the value type and ensure it and the step have the same integer type.
1953  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1954  assert(ScalarIVTy == Step->getType() &&
1955  "Val and Step should have the same type");
1956 
1957  // We build scalar steps for both integer and floating-point induction
1958  // variables. Here, we determine the kind of arithmetic we will perform.
1959  Instruction::BinaryOps AddOp;
1960  Instruction::BinaryOps MulOp;
1961  if (ScalarIVTy->isIntegerTy()) {
1962  AddOp = Instruction::Add;
1963  MulOp = Instruction::Mul;
1964  } else {
1965  AddOp = ID.getInductionOpcode();
1966  MulOp = Instruction::FMul;
1967  }
1968 
1969  // Determine the number of scalars we need to generate for each unroll
1970  // iteration. If EntryVal is uniform, we only need to generate the first
1971  // lane. Otherwise, we generate all VF values.
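  // For example, with VF = 4 and UF = 2, a non-uniform EntryVal gets the eight
  // scalar values ScalarIV + {0, 1, ..., 7} * Step, while a uniform EntryVal
  // only gets ScalarIV + 0 * Step and ScalarIV + 4 * Step, i.e. lane zero of
  // each unroll part (illustrative sketch).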
1972  unsigned Lanes =
1973  Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1974  : VF;
1975  // Compute the scalar steps and save the results in VectorLoopValueMap.
1976  for (unsigned Part = 0; Part < UF; ++Part) {
1977  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1978  auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1979  auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1980  auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1981  VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1982  recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1983  }
1984  }
1985 }
1986 
1987 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1988  assert(V != Induction && "The new induction variable should not be used.");
1989  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1990  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1991 
1992  // If we have a stride that is replaced by one, do it here. Defer this for
1993  // the VPlan-native path until we start running Legal checks in that path.
1994  if (!EnableVPlanNativePath && Legal->hasStride(V))
1995  V = ConstantInt::get(V->getType(), 1);
1996 
1997  // If we have a vector mapped to this value, return it.
1998  if (VectorLoopValueMap.hasVectorValue(V, Part))
1999  return VectorLoopValueMap.getVectorValue(V, Part);
2000 
2001  // If the value has not been vectorized, check if it has been scalarized
2002  // instead. If it has been scalarized, and we actually need the value in
2003  // vector form, we will construct the vector values on demand.
2004  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2005  Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2006 
2007  // If we've scalarized a value, that value should be an instruction.
2008  auto *I = cast<Instruction>(V);
2009 
2010  // If we aren't vectorizing, we can just copy the scalar map values over to
2011  // the vector map.
2012  if (VF == 1) {
2013  VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2014  return ScalarValue;
2015  }
2016 
2017  // Get the last scalar instruction we generated for V and Part. If the value
2018  // is known to be uniform after vectorization, this corresponds to lane zero
2019  // of the Part unroll iteration. Otherwise, the last instruction is the one
2020  // we created for the last vector lane of the Part unroll iteration.
2021  unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2022  auto *LastInst = cast<Instruction>(
2023  VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2024 
2025  // Set the insert point after the last scalarized instruction. This ensures
2026  // the insertelement sequence will directly follow the scalar definitions.
2027  auto OldIP = Builder.saveIP();
2028  auto NewIP = std::next(BasicBlock::iterator(LastInst));
2029  Builder.SetInsertPoint(&*NewIP);
2030 
2031  // However, if we are vectorizing, we need to construct the vector values.
2032  // If the value is known to be uniform after vectorization, we can just
2033  // broadcast the scalar value corresponding to lane zero for each unroll
2034  // iteration. Otherwise, we construct the vector values using insertelement
2035  // instructions. Since the resulting vectors are stored in
2036  // VectorLoopValueMap, we will only generate the insertelements once.
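  // For example, with VF = 4 the packing path conceptually emits (illustrative
  // sketch; exact value names differ):
  //   %p0 = insertelement <4 x i32> undef, i32 %s0, i32 0
  //   %p1 = insertelement <4 x i32> %p0, i32 %s1, i32 1
  //   %p2 = insertelement <4 x i32> %p1, i32 %s2, i32 2
  //   %p3 = insertelement <4 x i32> %p2, i32 %s3, i32 3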
2037  Value *VectorValue = nullptr;
2038  if (Cost->isUniformAfterVectorization(I, VF)) {
2039  VectorValue = getBroadcastInstrs(ScalarValue);
2040  VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2041  } else {
2042  // Initialize packing with insertelements to start from undef.
2043  Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2044  VectorLoopValueMap.setVectorValue(V, Part, Undef);
2045  for (unsigned Lane = 0; Lane < VF; ++Lane)
2046  packScalarIntoVectorValue(V, {Part, Lane});
2047  VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2048  }
2049  Builder.restoreIP(OldIP);
2050  return VectorValue;
2051  }
2052 
2053  // If this scalar is unknown, assume that it is a constant or that it is
2054  // loop invariant. Broadcast V and save the value for future uses.
2055  Value *B = getBroadcastInstrs(V);
2056  VectorLoopValueMap.setVectorValue(V, Part, B);
2057  return B;
2058 }
2059 
2060 Value *
2061 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2062  const VPIteration &Instance) {
2063  // If the value is not an instruction contained in the loop, it should
2064  // already be scalar.
2065  if (OrigLoop->isLoopInvariant(V))
2066  return V;
2067 
2068  assert(Instance.Lane > 0
2069  ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2070  : true && "Uniform values only have lane zero");
2071 
2072  // If the value from the original loop has not been vectorized, it is
2073  // represented by UF x VF scalar values in the new loop. Return the requested
2074  // scalar value.
2075  if (VectorLoopValueMap.hasScalarValue(V, Instance))
2076  return VectorLoopValueMap.getScalarValue(V, Instance);
2077 
2078  // If the value has not been scalarized, get its entry in VectorLoopValueMap
2079  // for the given unroll part. If this entry is not a vector type (i.e., the
2080  // vectorization factor is one), there is no need to generate an
2081  // extractelement instruction.
2082  auto *U = getOrCreateVectorValue(V, Instance.Part);
2083  if (!U->getType()->isVectorTy()) {
2084  assert(VF == 1 && "Value not scalarized has non-vector type");
2085  return U;
2086  }
2087 
2088  // Otherwise, the value from the original loop has been vectorized and is
2089  // represented by UF vector values. Extract and return the requested scalar
2090  // value from the appropriate vector lane.
2091  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2092 }
2093 
2094 void InnerLoopVectorizer::packScalarIntoVectorValue(
2095  Value *V, const VPIteration &Instance) {
2096  assert(V != Induction && "The new induction variable should not be used.");
2097  assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2098  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2099 
2100  Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2101  Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2102  VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2103  Builder.getInt32(Instance.Lane));
2104  VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2105 }
2106 
2107 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2108  assert(Vec->getType()->isVectorTy() && "Invalid type");
2109  SmallVector<Constant *, 8> ShuffleMask;
2110  for (unsigned i = 0; i < VF; ++i)
2111  ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2112 
2113  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2114  ConstantVector::get(ShuffleMask),
2115  "reverse");
2116 }
2117 
2118 // Return whether we allow using masked interleave-groups (for dealing with
2119 // strided loads/stores that reside in predicated blocks, or for dealing
2120 // with gaps).
2121 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2122  // If an override option has been passed in for interleaved accesses, use it.
2123  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2124  return EnableMaskedInterleavedMemAccesses;
2125 
2126  return TTI.enableMaskedInterleavedAccessVectorization();
2127  }
2128 
2129 // Try to vectorize the interleave group that \p Instr belongs to.
2130 //
2131 // E.g. Translate following interleaved load group (factor = 3):
2132 // for (i = 0; i < N; i+=3) {
2133 // R = Pic[i]; // Member of index 0
2134 // G = Pic[i+1]; // Member of index 1
2135 // B = Pic[i+2]; // Member of index 2
2136 // ... // do something to R, G, B
2137 // }
2138 // To:
2139 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2140 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
2141 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
2142 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
2143 //
2144 // Or translate following interleaved store group (factor = 3):
2145 // for (i = 0; i < N; i+=3) {
2146 // ... do something to R, G, B
2147 // Pic[i] = R; // Member of index 0
2148 // Pic[i+1] = G; // Member of index 1
2149 // Pic[i+2] = B; // Member of index 2
2150 // }
2151 // To:
2152 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2153 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2154 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2155 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2156 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2157 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2158  VectorParts *BlockInMask) {
2159  const InterleaveGroup<Instruction> *Group =
2160  Cost->getInterleavedAccessGroup(Instr);
2161  assert(Group && "Fail to get an interleaved access group.");
2162 
2163  // Skip if current instruction is not the insert position.
2164  if (Instr != Group->getInsertPos())
2165  return;
2166 
2167  const DataLayout &DL = Instr->getModule()->getDataLayout();
2168  Value *Ptr = getLoadStorePointerOperand(Instr);
2169 
2170  // Prepare for the vector type of the interleaved load/store.
2171  Type *ScalarTy = getMemInstValueType(Instr);
2172  unsigned InterleaveFactor = Group->getFactor();
2173  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2174  Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2175 
2176  // Prepare for the new pointers.
2178  SmallVector<Value *, 2> NewPtrs;
2179  unsigned Index = Group->getIndex(Instr);
2180 
2181  VectorParts Mask;
2182  bool IsMaskForCondRequired = BlockInMask;
2183  if (IsMaskForCondRequired) {
2184  Mask = *BlockInMask;
2185  // TODO: extend the masked interleaved-group support to reversed access.
2186  assert(!Group->isReverse() && "Reversed masked interleave-group "
2187  "not supported.");
2188  }
2189 
2190  // If the group is reverse, adjust the index to refer to the last vector lane
2191  // instead of the first. We adjust the index from the first vector lane,
2192  // rather than directly getting the pointer for lane VF - 1, because the
2193  // pointer operand of the interleaved access is supposed to be uniform. For
2194  // uniform instructions, we're only required to generate a value for the
2195  // first vector lane in each unroll iteration.
2196  if (Group->isReverse())
2197  Index += (VF - 1) * Group->getFactor();
2198 
2199  bool InBounds = false;
2200  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2201  InBounds = gep->isInBounds();
2202 
2203  for (unsigned Part = 0; Part < UF; Part++) {
2204  Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2205 
2206  // Note that the current instruction could be a member at any index. We need
2207  // to adjust the address to the member of index 0.
2208  //
2209  // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2210  // b = A[i]; // Member of index 0
2211  // Current pointer is pointed to A[i+1], adjust it to A[i].
2212  //
2213  // E.g. A[i+1] = a; // Member of index 1
2214  // A[i] = b; // Member of index 0
2215  // A[i+2] = c; // Member of index 2 (Current instruction)
2216  // Current pointer is pointed to A[i+2], adjust it to A[i].
2217  NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2218  if (InBounds)
2219  cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2220 
2221  // Cast to the vector pointer type.
2222  NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2223  }
2224 
2225  setDebugLocFromInst(Builder, Instr);
2226  Value *UndefVec = UndefValue::get(VecTy);
2227 
2228  Value *MaskForGaps = nullptr;
2229  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2230  MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2231  assert(MaskForGaps && "Mask for Gaps is required but it is null");
2232  }
2233 
2234  // Vectorize the interleaved load group.
2235  if (isa<LoadInst>(Instr)) {
2236  // For each unroll part, create a wide load for the group.
2237  SmallVector<Value *, 2> NewLoads;
2238  for (unsigned Part = 0; Part < UF; Part++) {
2239  Instruction *NewLoad;
2240  if (IsMaskForCondRequired || MaskForGaps) {
2242  "masked interleaved groups are not allowed.");
2243  Value *GroupMask = MaskForGaps;
2244  if (IsMaskForCondRequired) {
2245  auto *Undefs = UndefValue::get(Mask[Part]->getType());
2246  auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2247  Value *ShuffledMask = Builder.CreateShuffleVector(
2248  Mask[Part], Undefs, RepMask, "interleaved.mask");
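 // For example, with VF = 4 and InterleaveFactor = 2, a block mask
 // <m0, m1, m2, m3> is replicated to <m0, m0, m1, m1, m2, m2, m3, m3> so that
 // both members of each tuple share their lane's predicate (illustrative
 // sketch).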
2249  GroupMask = MaskForGaps
2250  ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2251  MaskForGaps)
2252  : ShuffledMask;
2253  }
2254  NewLoad =
2255  Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2256  GroupMask, UndefVec, "wide.masked.vec");
2257  }
2258  else
2259  NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2260  Group->getAlignment(), "wide.vec");
2261  Group->addMetadata(NewLoad);
2262  NewLoads.push_back(NewLoad);
2263  }
2264 
2265  // For each member in the group, shuffle out the appropriate data from the
2266  // wide loads.
2267  for (unsigned I = 0; I < InterleaveFactor; ++I) {
2268  Instruction *Member = Group->getMember(I);
2269 
2270  // Skip the gaps in the group.
2271  if (!Member)
2272  continue;
2273 
2274  Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2275  for (unsigned Part = 0; Part < UF; Part++) {
2276  Value *StridedVec = Builder.CreateShuffleVector(
2277  NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2278 
2279  // If this member has a different type, cast the result type.
2280  if (Member->getType() != ScalarTy) {
2281  VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2282  StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2283  }
2284 
2285  if (Group->isReverse())
2286  StridedVec = reverseVector(StridedVec);
2287 
2288  VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2289  }
2290  }
2291  return;
2292  }
2293 
2294  // The sub vector type for current instruction.
2295  VectorType *SubVT = VectorType::get(ScalarTy, VF);
2296 
2297  // Vectorize the interleaved store group.
2298  for (unsigned Part = 0; Part < UF; Part++) {
2299  // Collect the stored vector from each member.
2300  SmallVector<Value *, 4> StoredVecs;
2301  for (unsigned i = 0; i < InterleaveFactor; i++) {
2302  // Interleaved store group doesn't allow a gap, so each index has a member
2303  Instruction *Member = Group->getMember(i);
2304  assert(Member && "Fail to get a member from an interleaved store group");
2305 
2306  Value *StoredVec = getOrCreateVectorValue(
2307  cast<StoreInst>(Member)->getValueOperand(), Part);
2308  if (Group->isReverse())
2309  StoredVec = reverseVector(StoredVec);
2310 
2311  // If this member has a different type, cast it to a unified type.
2312 
2313  if (StoredVec->getType() != SubVT)
2314  StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2315 
2316  StoredVecs.push_back(StoredVec);
2317  }
2318 
2319  // Concatenate all vectors into a wide vector.
2320  Value *WideVec = concatenateVectors(Builder, StoredVecs);
2321 
2322  // Interleave the elements in the wide vector.
2323  Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2324  Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2325  "interleaved.vec");
2326 
2327  Instruction *NewStoreInstr;
2328  if (IsMaskForCondRequired) {
2329  auto *Undefs = UndefValue::get(Mask[Part]->getType());
2330  auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2331  Value *ShuffledMask = Builder.CreateShuffleVector(
2332  Mask[Part], Undefs, RepMask, "interleaved.mask");
2333  NewStoreInstr = Builder.CreateMaskedStore(
2334  IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2335  }
2336  else
2337  NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2338  Group->getAlignment());
2339 
2340  Group->addMetadata(NewStoreInstr);
2341  }
2342 }
2343 
2344 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2345  VectorParts *BlockInMask) {
2346  // Attempt to issue a wide load.
2347  LoadInst *LI = dyn_cast<LoadInst>(Instr);
2348  StoreInst *SI = dyn_cast<StoreInst>(Instr);
2349 
2350  assert((LI || SI) && "Invalid Load/Store instruction");
2351 
2352  LoopVectorizationCostModel::InstWidening Decision =
2353  Cost->getWideningDecision(Instr, VF);
2354  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2355  "CM decision should be taken at this point");
2356  if (Decision == LoopVectorizationCostModel::CM_Interleave)
2357  return vectorizeInterleaveGroup(Instr);
2358 
2359  Type *ScalarDataTy = getMemInstValueType(Instr);
2360  Type *DataTy = VectorType::get(ScalarDataTy, VF);
2361  Value *Ptr = getLoadStorePointerOperand(Instr);
2362  // An alignment of 0 means target ABI alignment. We need to use the scalar's
2363  // target ABI alignment in such a case.
2364  const DataLayout &DL = Instr->getModule()->getDataLayout();
2365  const Align Alignment =
2366  DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2367  unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2368 
2369  // Determine if the pointer operand of the access is either consecutive or
2370  // reverse consecutive.
2371  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2372  bool ConsecutiveStride =
2373  Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2374  bool CreateGatherScatter =
2375  (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2376 
2377  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2378  // gather/scatter. Otherwise Decision should have been to Scalarize.
2379  assert((ConsecutiveStride || CreateGatherScatter) &&
2380  "The instruction should be scalarized");
2381 
2382  // Handle consecutive loads/stores.
2383  if (ConsecutiveStride)
2384  Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2385 
2386  VectorParts Mask;
2387  bool isMaskRequired = BlockInMask;
2388  if (isMaskRequired)
2389  Mask = *BlockInMask;
2390 
2391  bool InBounds = false;
2392  if (auto *gep = dyn_cast<GetElementPtrInst>(
2393  getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2394  InBounds = gep->isInBounds();
2395 
2396  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2397  // Calculate the pointer for the specific unroll-part.
2398  GetElementPtrInst *PartPtr = nullptr;
2399 
2400  if (Reverse) {
2401  // If the address is consecutive but reversed, then the
2402  // wide store needs to start at the last vector element.
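      // For example, with VF = 4 and Part = 1 the two GEPs below compute
      // Ptr - 4 and then Ptr - 4 + (1 - 4) = Ptr - 7, so the wide access
      // covers elements [Ptr - 7, Ptr - 4] in reverse order (illustrative
      // sketch).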
2403  PartPtr = cast<GetElementPtrInst>(
2404  Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2405  PartPtr->setIsInBounds(InBounds);
2406  PartPtr = cast<GetElementPtrInst>(
2407  Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2408  PartPtr->setIsInBounds(InBounds);
2409  if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2410  Mask[Part] = reverseVector(Mask[Part]);
2411  } else {
2412  PartPtr = cast<GetElementPtrInst>(
2413  Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2414  PartPtr->setIsInBounds(InBounds);
2415  }
2416 
2417  return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2418  };
2419 
2420  // Handle Stores:
2421  if (SI) {
2423 
2424  for (unsigned Part = 0; Part < UF; ++Part) {
2425  Instruction *NewSI = nullptr;
2426  Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2427  if (CreateGatherScatter) {
2428  Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2429  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2430  NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
2431  Alignment.value(), MaskPart);
2432  } else {
2433  if (Reverse) {
2434  // If we store to reverse consecutive memory locations, then we need
2435  // to reverse the order of elements in the stored value.
2436  StoredVal = reverseVector(StoredVal);
2437  // We don't want to update the value in the map as it might be used in
2438  // another expression. So don't call resetVectorValue(StoredVal).
2439  }
2440  auto *VecPtr = CreateVecPtr(Part, Ptr);
2441  if (isMaskRequired)
2442  NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr,
2443  Alignment.value(), Mask[Part]);
2444  else
2445  NewSI =
2446  Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value());
2447  }
2448  addMetadata(NewSI, SI);
2449  }
2450  return;
2451  }
2452 
2453  // Handle loads.
2454  assert(LI && "Must have a load instruction");
2456  for (unsigned Part = 0; Part < UF; ++Part) {
2457  Value *NewLI;
2458  if (CreateGatherScatter) {
2459  Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2460  Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2461  NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
2462  nullptr, "wide.masked.gather");
2463  addMetadata(NewLI, LI);
2464  } else {
2465  auto *VecPtr = CreateVecPtr(Part, Ptr);
2466  if (isMaskRequired)
2467  NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part],
2468  UndefValue::get(DataTy),
2469  "wide.masked.load");
2470  else
2471  NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
2472  "wide.load");
2473 
2474  // Add metadata to the load, but setVectorValue to the reverse shuffle.
2475  addMetadata(NewLI, LI);
2476  if (Reverse)
2477  NewLI = reverseVector(NewLI);
2478  }
2479  VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2480  }
2481 }
2482 
2483 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2484  const VPIteration &Instance,
2485  bool IfPredicateInstr) {
2486  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2487 
2488  setDebugLocFromInst(Builder, Instr);
2489 
2490  // Does this instruction return a value?
2491  bool IsVoidRetTy = Instr->getType()->isVoidTy();
2492 
2493  Instruction *Cloned = Instr->clone();
2494  if (!IsVoidRetTy)
2495  Cloned->setName(Instr->getName() + ".cloned");
2496 
2497  // Replace the operands of the cloned instructions with their scalar
2498  // equivalents in the new loop.
2499  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2500  auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2501  Cloned->setOperand(op, NewOp);
2502  }
2503  addNewMetadata(Cloned, Instr);
2504 
2505  // Place the cloned scalar in the new loop.
2506  Builder.Insert(Cloned);
2507 
2508  // Add the cloned scalar to the scalar map entry.
2509  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2510 
2511  // If we just cloned a new assumption, add it the assumption cache.
2512  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2513  if (II->getIntrinsicID() == Intrinsic::assume)
2514  AC->registerAssumption(II);
2515 
2516  // End if-block.
2517  if (IfPredicateInstr)
2518  PredicatedInstructions.push_back(Cloned);
2519 }
2520 
2521 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2522  Value *End, Value *Step,
2523  Instruction *DL) {
2524  BasicBlock *Header = L->getHeader();
2525  BasicBlock *Latch = L->getLoopLatch();
2526  // As we're just creating this loop, it's possible no latch exists
2527  // yet. If so, use the header as this will be a single block loop.
2528  if (!Latch)
2529  Latch = Header;
2530 
2533  setDebugLocFromInst(Builder, OldInst);
2534  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2535 
2537  setDebugLocFromInst(Builder, OldInst);
2538 
2539  // Create i+1 and fill the PHINode.
2540  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2541  Induction->addIncoming(Start, L->getLoopPreheader());
2542  Induction->addIncoming(Next, Latch);
2543  // Create the compare.
2544  Value *ICmp = Builder.CreateICmpEQ(Next, End);
2545  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2546 
2547  // Now we have two terminators. Remove the old one from the block.
2548  Latch->getTerminator()->eraseFromParent();
2549 
2550  return Induction;
2551 }
2552 
2553 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2554  if (TripCount)
2555  return TripCount;
2556 
2557  assert(L && "Create Trip Count for null loop.");
2559  // Find the loop boundaries.
2560  ScalarEvolution *SE = PSE.getSE();
2561  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2562  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2563  "Invalid loop count");
2564 
2565  Type *IdxTy = Legal->getWidestInductionType();
2566  assert(IdxTy && "No type for induction");
2567 
2568  // The exit count might have the type of i64 while the phi is i32. This can
2569  // happen if we have an induction variable that is sign extended before the
2570  // compare. The only way that we get a backedge taken count is that the
2571  // induction variable was signed and as such will not overflow. In such a case
2572  // truncation is legal.
2573  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2574  IdxTy->getPrimitiveSizeInBits())
2575  BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2576  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2577 
2578  // Get the total trip count from the count by adding 1.
2579  const SCEV *ExitCount = SE->getAddExpr(
2580  BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2581 
2582  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2583 
2584  // Expand the trip count and place the new instructions in the preheader.
2585  // Notice that the pre-header does not change, only the loop body.
2586  SCEVExpander Exp(*SE, DL, "induction");
2587 
2588  // Count holds the overall loop count (N).
2589  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2591 
2592  if (TripCount->getType()->isPointerTy())
2593  TripCount =
2594  CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2596 
2597  return TripCount;
2598 }
2599 
2600 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2601  if (VectorTripCount)
2602  return VectorTripCount;
2603 
2604  Value *TC = getOrCreateTripCount(L);
2606 
2607  Type *Ty = TC->getType();
2608  Constant *Step = ConstantInt::get(Ty, VF * UF);
2609 
2610  // If the tail is to be folded by masking, round the number of iterations N
2611  // up to a multiple of Step instead of rounding down. This is done by first
2612  // adding Step-1 and then rounding down. Note that it's ok if this addition
2613  // overflows: the vector induction variable will eventually wrap to zero given
2614  // that it starts at zero and its Step is a power of two; the loop will then
2615  // exit, with the last early-exit vector comparison also producing all-true.
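  // For example, with a trip count of 10 and VF * UF = 4, TC becomes
  // 10 + 3 = 13 and the computation below yields a vector trip count of 12,
  // i.e. three masked vector iterations, with the last two lanes disabled in
  // the final iteration (illustrative sketch).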
2616  if (Cost->foldTailByMasking()) {
2617  assert(isPowerOf2_32(VF * UF) &&
2618  "VF*UF must be a power of 2 when folding tail by masking");
2619  TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2620  }
2621 
2622  // Now we need to generate the expression for the part of the loop that the
2623  // vectorized body will execute. This is equal to N - (N % Step) if scalar
2624  // iterations are not required for correctness, or N - Step, otherwise. Step
2625  // is equal to the vectorization factor (number of SIMD elements) times the
2626  // unroll factor (number of SIMD instructions).
2627  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2628 
2629  // If there is a non-reversed interleaved group that may speculatively access
2630  // memory out-of-bounds, we need to ensure that there will be at least one
2631  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2632  // the trip count, we set the remainder to be equal to the step. If the step
2633  // does not evenly divide the trip count, no adjustment is necessary since
2634  // there will already be scalar iterations. Note that the minimum iterations
2635  // check ensures that N >= Step.
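  // For example, with a trip count of 12 and Step = 4, the remainder would be
  // 0; it is bumped to 4 so that the vector trip count becomes 8 and the last
  // four iterations run in the scalar epilogue (illustrative sketch).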
2636  if (VF > 1 && Cost->requiresScalarEpilogue()) {
2637  auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2638  R = Builder.CreateSelect(IsZero, Step, R);
2639  }
2640 
2641  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2642 
2643  return VectorTripCount;
2644 }
2645 
2646 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2647  const DataLayout &DL) {
2648  // Verify that V is a vector type with same number of elements as DstVTy.
2649  unsigned VF = DstVTy->getNumElements();
2650  VectorType *SrcVecTy = cast<VectorType>(V->getType());
2651  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2652  Type *SrcElemTy = SrcVecTy->getElementType();
2653  Type *DstElemTy = DstVTy->getElementType();
2654  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2655  "Vector elements must have same size");
2656 
2657  // Do a direct cast if element types are castable.
2658  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2659  return Builder.CreateBitOrPointerCast(V, DstVTy);
2660  }
2661  // V cannot be directly casted to desired vector type.
2662  // May happen when V is a floating point vector but DstVTy is a vector of
2663  // pointers or vice-versa. Handle this using a two-step bitcast using an
2664  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
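 // For example, casting <4 x float> to <4 x i8*> on a target with 32-bit
 // pointers goes through <4 x i32>: a bitcast from float to i32 followed by an
 // inttoptr to i8* (illustrative sketch).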
2665  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2666  "Only one type should be a pointer type");
2667  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2668  "Only one type should be a floating point type");
2669  Type *IntTy =
2670  IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2671  VectorType *VecIntTy = VectorType::get(IntTy, VF);
2672  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2673  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2674 }
2675 
2676 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2677  BasicBlock *Bypass) {
2678  Value *Count = getOrCreateTripCount(L);
2679  BasicBlock *BB = L->getLoopPreheader();
2681 
2682  // Generate code to check if the loop's trip count is less than VF * UF, or
2683  // equal to it in case a scalar epilogue is required; this implies that the
2684  // vector trip count is zero. This check also covers the case where adding one
2685  // to the backedge-taken count overflowed leading to an incorrect trip count
2686  // of zero. In this case we will also jump to the scalar loop.
2687  auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2688  : ICmpInst::ICMP_ULT;
2689 
2690  // If tail is to be folded, vector loop takes care of all iterations.
2691  Value *CheckMinIters = Builder.getFalse();
2692  if (!Cost->foldTailByMasking())
2693  CheckMinIters = Builder.CreateICmp(
2694  P, Count, ConstantInt::get(Count->getType(), VF * UF),
2695  "min.iters.check");
2696 
2697  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2698  // Update dominator tree immediately if the generated block is a
2699  // LoopBypassBlock because SCEV expansions to generate loop bypass
2700  // checks may query it before the current function is finished.
2701  DT->addNewBlock(NewBB, BB);
2702  if (L->getParentLoop())
2703  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2704  ReplaceInstWithInst(BB->getTerminator(),
2705  BranchInst::Create(Bypass, NewBB, CheckMinIters));
2706  LoopBypassBlocks.push_back(BB);
2707 }
2708 
2709 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2710  BasicBlock *BB = L->getLoopPreheader();
2711 
2712  // Generate the code to check the SCEV assumptions that we made.
2713  // We want the new basic block to start at the first instruction in a
2714  // sequence of instructions that form a check.
2715  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2716  "scev.check");
2717  Value *SCEVCheck =
2718  Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2719 
2720  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2721  if (C->isZero())
2722  return;
2723 
2724  assert(!BB->getParent()->hasOptSize() &&
2725  "Cannot SCEV check stride or overflow when optimizing for size");
2726 
2727  // Create a new block containing the stride check.
2728  BB->setName("vector.scevcheck");
2729  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2730  // Update dominator tree immediately if the generated block is a
2731  // LoopBypassBlock because SCEV expansions to generate loop bypass
2732  // checks may query it before the current function is finished.
2733  DT->addNewBlock(NewBB, BB);
2734  if (L->getParentLoop())
2735  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2736  ReplaceInstWithInst(BB->getTerminator(),
2737  BranchInst::Create(Bypass, NewBB, SCEVCheck));
2738  LoopBypassBlocks.push_back(BB);
2739  AddedSafetyChecks = true;
2740 }
2741 
2742 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2743  // VPlan-native path does not do any analysis for runtime checks currently.
2744  if (EnableVPlanNativePath)
2745  return;
2746 
2747  BasicBlock *BB = L->getLoopPreheader();
2748 
2749  // Generate the code that checks in runtime if arrays overlap. We put the
2750  // checks into a separate block to make the more common case of few elements
2751  // faster.
2752  Instruction *FirstCheckInst;
2753  Instruction *MemRuntimeCheck;
2754  std::tie(FirstCheckInst, MemRuntimeCheck) =
2755  Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2756  if (!MemRuntimeCheck)
2757  return;
2758 
2759  if (BB->getParent()->hasOptSize()) {
2761  "Cannot emit memory checks when optimizing for size, unless forced "
2762  "to vectorize.");
2763  ORE->emit([&]() {
2764  return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2765  L->getStartLoc(), L->getHeader())
2766  << "Code-size may be reduced by not forcing "
2767  "vectorization, or by source-code modifications "
2768  "eliminating the need for runtime checks "
2769  "(e.g., adding 'restrict').";
2770  });
2771  }
2772 
2773  // Create a new block containing the memory check.
2774  BB->setName("vector.memcheck");
2775  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2776  // Update dominator tree immediately if the generated block is a
2777  // LoopBypassBlock because SCEV expansions to generate loop bypass
2778  // checks may query it before the current function is finished.
2779  DT->addNewBlock(NewBB, BB);
2780  if (L->getParentLoop())
2781  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2782  ReplaceInstWithInst(BB->getTerminator(),
2783  BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2784  LoopBypassBlocks.push_back(BB);
2785  AddedSafetyChecks = true;
2786 
2787  // We currently don't use LoopVersioning for the actual loop cloning but we
2788  // still use it to add the noalias metadata.
2789  LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2790  PSE.getSE());
2791  LVer->prepareNoAliasMetadata();
2792 }
2793 
2794 Value *InnerLoopVectorizer::emitTransformedIndex(
2795  IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2796  const InductionDescriptor &ID) const {
2797 
2798  SCEVExpander Exp(*SE, DL, "induction");
2799  auto Step = ID.getStep();
2800  auto StartValue = ID.getStartValue();
2801  assert(Index->getType() == Step->getType() &&
2802  "Index type does not match StepValue type");
2803 
2804  // Note: the IR at this point is broken. We cannot use SE to create any new
2805  // SCEV and then expand it, hoping that SCEV's simplification will give us
2806  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2807  // lead to various SCEV crashes. So all we can do is to use builder and rely
2808  // on InstCombine for future simplifications. Here we handle some trivial
2809  // cases only.
2810  auto CreateAdd = [&B](Value *X, Value *Y) {
2811  assert(X->getType() == Y->getType() && "Types don't match!");
2812  if (auto *CX = dyn_cast<ConstantInt>(X))
2813  if (CX->isZero())
2814  return Y;
2815  if (auto *CY = dyn_cast<ConstantInt>(Y))
2816  if (CY->isZero())
2817  return X;
2818  return B.CreateAdd(X, Y);
2819  };
2820 
2821  auto CreateMul = [&B](Value *X, Value *Y) {
2822  assert(X->getType() == Y->getType() && "Types don't match!");
2823  if (auto *CX = dyn_cast<ConstantInt>(X))
2824  if (CX->isOne())
2825  return Y;
2826  if (auto *CY = dyn_cast<ConstantInt>(Y))
2827  if (CY->isOne())
2828  return X;
2829  return B.CreateMul(X, Y);
2830  };
2831 
2832  switch (ID.getKind()) {
2833  case InductionDescriptor::IK_IntInduction: {
2834  assert(Index->getType() == StartValue->getType() &&
2835  "Index type does not match StartValue type");
2836  if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2837  return B.CreateSub(StartValue, Index);
2838  auto *Offset = CreateMul(
2839  Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2840  return CreateAdd(StartValue, Offset);
2841  }
2842  case InductionDescriptor::IK_PtrInduction: {
2843  assert(isa<SCEVConstant>(Step) &&
2844  "Expected constant step for pointer induction");
2845  return B.CreateGEP(
2846  StartValue->getType()->getPointerElementType(), StartValue,
2847  CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2848  &*B.GetInsertPoint())));
2849  }
2850  case InductionDescriptor::IK_FpInduction: {
2851  assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2852  auto InductionBinOp = ID.getInductionBinOp();
2853  assert(InductionBinOp &&
2854  (InductionBinOp->getOpcode() == Instruction::FAdd ||
2855  InductionBinOp->getOpcode() == Instruction::FSub) &&
2856  "Original bin op should be defined for FP induction");
2857 
2858  Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2859 
2860  // Floating point operations had to be 'fast' to enable the induction.
2861  FastMathFlags Flags;
2862  Flags.setFast();
2863 
2864  Value *MulExp = B.CreateFMul(StepValue, Index);
2865  if (isa<Instruction>(MulExp))
2866  // We have to check, the MulExp may be a constant.
2867  cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2868 
2869  Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2870  "induction");
2871  if (isa<Instruction>(BOp))
2872  cast<Instruction>(BOp)->setFastMathFlags(Flags);
2873 
2874  return BOp;
2875  }
2876  case InductionDescriptor::IK_NoInduction:
2877  return nullptr;
2878  }
2879  llvm_unreachable("invalid enum");
2880 }
2881 
2882 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2883  /*
2884  In this function we generate a new loop. The new loop will contain
2885  the vectorized instructions while the old loop will continue to run the
2886  scalar remainder.
2887 
2888  [ ] <-- loop iteration number check.
2889  / |
2890  / v
2891  | [ ] <-- vector loop bypass (may consist of multiple blocks).
2892  | / |
2893  | / v
2894  || [ ] <-- vector pre header.
2895  |/ |
2896  | v
2897  | [ ] \
2898  | [ ]_| <-- vector loop.
2899  | |
2900  | v
2901  | -[ ] <--- middle-block.
2902  | / |
2903  | / v
2904  -|- >[ ] <--- new preheader.
2905  | |
2906  | v
2907  | [ ] \
2908  | [ ]_| <-- old scalar loop to handle remainder.
2909  \ |
2910  \ v
2911  >[ ] <-- exit block.
2912  ...
2913  */
2914 
2915  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2916  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2917  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2918  MDNode *OrigLoopID = OrigLoop->getLoopID();
2919  assert(VectorPH && "Invalid loop structure");
2920  assert(ExitBlock && "Must have an exit block");
2921 
2922  // Some loops have a single integer induction variable, while others
2923  // don't. One example is C++ iterators, which often have multiple pointer
2924  // induction variables. The code below also supports the case where there
2925  // is no single induction variable.
2926  //
2927  // We try hard to obtain an induction variable from the original loop.
2928  // However, if we don't find one that:
2929  // - is an integer,
2930  // - counts from zero, stepping by one, and
2931  // - is the size of the widest induction variable type,
2932  // then we create a new one.
2934  Type *IdxTy = Legal->getWidestInductionType();
2935 
2936  // Split the single block loop into the two loop structure described above.
2937  BasicBlock *VecBody =
2938  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2939  BasicBlock *MiddleBlock =
2940  VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2941  BasicBlock *ScalarPH =
2942  MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2943 
2944  // Create and register the new vector loop.
2945  Loop *Lp = LI->AllocateLoop();
2946  Loop *ParentLoop = OrigLoop->getParentLoop();
2947 
2948  // Insert the new loop into the loop nest and register the new basic blocks
2949  // before calling any utilities such as SCEV that require valid LoopInfo.
2950  if (ParentLoop) {
2951  ParentLoop->addChildLoop(Lp);
2952  ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2953  ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2954  } else {
2955  LI->addTopLevelLoop(Lp);
2956  }
2957  Lp->addBasicBlockToLoop(VecBody, *LI);
2958 
2959  // Find the loop boundaries.
2960  Value *Count = getOrCreateTripCount(Lp);
2961 
2962  Value *StartIdx = ConstantInt::get(IdxTy, 0);
2963 
2964  // Now, compare the new count to zero. If it is zero, skip the vector loop
2965  // and jump to the scalar loop. This check also covers the case where the
2966  // backedge-taken count is uint##_max: adding one to it will overflow,
2967  // leading to an incorrect trip count of zero. In this (rare) case we will
2968  // also jump to the scalar loop.
2969  emitMinimumIterationCountCheck(Lp, ScalarPH);
2970 
2971  // Generate the code to check any assumptions that we've made for SCEV
2972  // expressions.
2973  emitSCEVChecks(Lp, ScalarPH);
2974 
2975  // Generate the code that checks in runtime if arrays overlap. We put the
2976  // checks into a separate block to make the more common case of few elements
2977  // faster.
2978  emitMemRuntimeChecks(Lp, ScalarPH);
2979 
2980  // Generate the induction variable.
2981  // The loop step is equal to the vectorization factor (num of SIMD elements)
2982  // times the unroll factor (num of SIMD instructions).
2983  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2984  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2985  Induction =
2986  createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2988 
2989  // We are going to resume the execution of the scalar loop.
2990  // Go over all of the induction variables that we found and fix the
2991  // PHIs that are left in the scalar version of the loop.
2992  // The starting values of PHI nodes depend on the counter of the last
2993  // iteration in the vectorized loop.
2994  // If we come from a bypass edge then we need to start from the original
2995  // start value.
2996 
2997  // This variable saves the new starting index for the scalar loop. It is used
2998  // to test if there are any tail iterations left once the vector loop has
2999  // completed.
3001  for (auto &InductionEntry : *List) {
3002  PHINode *OrigPhi = InductionEntry.first;
3003  InductionDescriptor II = InductionEntry.second;
3004 
3005  // Create phi nodes to merge from the backedge-taken check block.
3006  PHINode *BCResumeVal = PHINode::Create(
3007  OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
3008  // Copy original phi DL over to the new one.
3009  BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3010  Value *&EndValue = IVEndValues[OrigPhi];
3011  if (OrigPhi == OldInduction) {
3012  // We know what the end value is.
3013  EndValue = CountRoundDown;
3014  } else {
3016  Type *StepType = II.getStep()->getType();
3017  Instruction::CastOps CastOp =
3018  CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3019  Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3020  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3021  EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3022  EndValue->setName("ind.end");
3023  }
3024 
3025  // The new PHI merges the original incoming value, in case of a bypass,
3026  // or the value at the end of the vectorized loop.
3027  BCResumeVal->addIncoming(EndValue, MiddleBlock);
3028 
3029  // Fix the scalar body counter (PHI node).
3030  // The old induction's phi node in the scalar body needs the truncated
3031  // value.
3032  for (BasicBlock *BB : LoopBypassBlocks)
3033  BCResumeVal->addIncoming(II.getStartValue(), BB);
3034  OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
3035  }
3036 
3037  // We need the OrigLoop (scalar loop part) latch terminator to help
3038  // produce correct debug info for the middle block BB instructions.
3039  // The legality check stage guarantees that the loop will have a single
3040  // latch.
3041  assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3042  "Scalar loop latch terminator isn't a branch");
3043  BranchInst *ScalarLatchBr =
3044  cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3045 
3046  // Add a check in the middle block to see if we have completed
3047  // all of the iterations in the first vector loop.
3048  // If (N - N%VF) == N, then we *don't* need to run the remainder.
3049  // If tail is to be folded, we know we don't need to run the remainder.
3050  Value *CmpN = Builder.getTrue();
3051  if (!Cost->foldTailByMasking()) {
3052  CmpN =
3053  CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3054  CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3055 
3056  // Here we use the same DebugLoc as the scalar loop latch branch instead
3057  // of the corresponding compare because they may have ended up with
3058  // different line numbers and we want to avoid awkward line stepping while
3059  // debugging. E.g., the compare may have a line number inside the loop.
3060  cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3061  }
3062 
3063  BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
3064  BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3065  ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
3066 
3067  // Get ready to start creating new instructions into the vectorized body.
3069 
3070  // Save the state.
3072  LoopScalarPreHeader = ScalarPH;
3073  LoopMiddleBlock = MiddleBlock;
3074  LoopExitBlock = ExitBlock;
3075  LoopVectorBody = VecBody;
3076  LoopScalarBody = OldBasicBlock;
3077 
3078  Optional<MDNode *> VectorizedLoopID =
3080  LLVMLoopVectorizeFollowupVectorized});
3081  if (VectorizedLoopID.hasValue()) {
3082  Lp->setLoopID(VectorizedLoopID.getValue());
3083 
3084  // Do not setAlreadyVectorized if loop attributes have been defined
3085  // explicitly.
3086  return LoopVectorPreHeader;
3087  }
3088 
3089  // Keep all loop hints from the original loop on the vector loop (we'll
3090  // replace the vectorizer-specific hints below).
3091  if (MDNode *LID = OrigLoop->getLoopID())
3092  Lp->setLoopID(LID);
3093 
3094  LoopVectorizeHints Hints(Lp, true, *ORE);
3095  Hints.setAlreadyVectorized();
3096 
3097  return LoopVectorPreHeader;
3098 }
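// Conceptual C-style picture of the skeleton built above (illustration only;
// the SCEV and memory runtime checks are elided and VF * UF is assumed to be
// 4, with `n` the trip count):
//
//   size_t vec_n = n - n % 4;                 // getOrCreateVectorTripCount
//   size_t i = 0;
//   if (n >= 4) {                             // minimum iteration count check
//     for (; i < vec_n; i += 4) { /* vector.body */ }
//     if (vec_n == n) goto done;              // cmp.n in the middle block
//   }
//   for (; i < n; ++i) { /* old scalar loop, now the remainder */ }
// done: ;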
3099 
3100 // Fix up external users of the induction variable. At this point, we are
3101 // in LCSSA form, with all external PHIs that use the IV having one input value,
3102 // coming from the remainder loop. We need those PHIs to also have a correct
3103 // value for the IV when arriving directly from the middle block.
3105  const InductionDescriptor &II,
3106  Value *CountRoundDown, Value *EndValue,
3107  BasicBlock *MiddleBlock) {
3108  // There are two kinds of external IV usages - those that use the value
3109  // computed in the last iteration (the value that feeds into the phi from
3110  // the loop latch) and those that use the penultimate value (the phi itself).
3111  // We allow both, but they obviously have different values.
3112 
3113  assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3114 
3115  DenseMap<Value *, Value *> MissingVals;
3116 
3117  // An external user of the last iteration's value should see the value that
3118  // the remainder loop uses to initialize its own IV.
3120  for (User *U : PostInc->users()) {
3121  Instruction *UI = cast<Instruction>(U);
3122  if (!OrigLoop->contains(UI)) {
3123  assert(isa<PHINode>(UI) && "Expected LCSSA form");
3124  MissingVals[UI] = EndValue;
3125  }
3126  }
3127 
3128  // An external user of the penultimate value needs to see EndValue - Step.
3129  // The simplest way to get this is to recompute it from the constituent SCEVs,
3130  // that is, Start + (Step * (CRD - 1)).
3131  for (User *U : OrigPhi->users()) {
3132  auto *UI = cast<Instruction>(U);
3133  if (!OrigLoop->contains(UI)) {
3134  const DataLayout &DL =
3136  assert(isa<PHINode>(UI) && "Expected LCSSA form");
3137 
3138  IRBuilder<> B(MiddleBlock->getTerminator());
3139  Value *CountMinusOne = B.CreateSub(
3140  CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3141  Value *CMO =
3142  !II.getStep()->getType()->isIntegerTy()
3143  ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3144  II.getStep()->getType())
3145  : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3146  CMO->setName("cast.cmo");
3147  Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3148  Escape->setName("ind.escape");
3149  MissingVals[UI] = Escape;
3150  }
3151  }
3152 
3153  for (auto &I : MissingVals) {
3154  PHINode *PHI = cast<PHINode>(I.first);
3155  // One corner case we have to handle is two IVs "chasing" each other,
3156  // that is, %IV2 = phi [...], [ %IV1, %latch ].
3157  // In this case, if IV1 has an external use, we need to avoid adding both
3158  // "last value of IV1" and "penultimate value of IV2". So, verify that we
3159  // don't already have an incoming value for the middle block.
3160  if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3161  PHI->addIncoming(I.second, MiddleBlock);
3162  }
3163 }
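// Worked example (hypothetical IV): with Start = 0, Step = 2 and a vector trip
// count CountRoundDown = 8, an outside user of the post-increment (latch)
// value sees EndValue = 16, which also seeds the remainder loop's IV, while an
// outside user of the phi itself sees the penultimate value
// Start + Step * (CountRoundDown - 1) = 14, materialized above as ind.escape.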
3164 
3165 namespace {
3166 
3167 struct CSEDenseMapInfo {
3168  static bool canHandle(const Instruction *I) {
3169  return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3170  isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3171  }
3172 
3173  static inline Instruction *getEmptyKey() {
3175  }
3176 
3177  static inline Instruction *getTombstoneKey() {
3179  }
3180 
3181  static unsigned getHashValue(const Instruction *I) {
3182  assert(canHandle(I) && "Unknown instruction!");
3184  I->value_op_end()));
3185  }
3186 
3187  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3188  if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3189  LHS == getTombstoneKey() || RHS == getTombstoneKey())
3190  return LHS == RHS;
3191  return LHS->isIdenticalTo(RHS);
3192  }
3193 };
3194 
3195 } // end anonymous namespace
3196 
3197 /// Perform CSE of induction variable instructions.
3198 static void cse(BasicBlock *BB) {
3199  // Perform simple CSE.
3201  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3202  Instruction *In = &*I++;
3203 
3204  if (!CSEDenseMapInfo::canHandle(In))
3205  continue;
3206 
3207  // Check if we can replace this instruction with any of the
3208  // visited instructions.
3209  if (Instruction *V = CSEMap.lookup(In)) {
3210  In->replaceAllUsesWith(V);
3211  In->eraseFromParent();
3212  continue;
3213  }
3214 
3215  CSEMap[In] = In;
3216  }
3217 }
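// For example (shorthand IR, illustration only), given two identical extracts
// in the block:
//   %e0 = extractelement <4 x i32> %v, i32 0
//   %e1 = extractelement <4 x i32> %v, i32 0
//   %sum = add i32 %e0, %e1
// cse() keeps %e0, rewrites %sum to use %e0 twice, and erases %e1.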
3218 
3220  unsigned VF,
3221  bool &NeedToScalarize) {
3222  Function *F = CI->getCalledFunction();
3223  StringRef FnName = CI->getCalledFunction()->getName();
3224  Type *ScalarRetTy = CI->getType();
3225  SmallVector<Type *, 4> Tys, ScalarTys;
3226  for (auto &ArgOp : CI->arg_operands())
3227  ScalarTys.push_back(ArgOp->getType());
3228 
3229  // Estimate cost of scalarized vector call. The source operands are assumed
3230  // to be vectors, so we need to extract individual elements from there,
3231  // execute VF scalar calls, and then gather the result into the vector return
3232  // value.
3233  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3234  if (VF == 1)
3235  return ScalarCallCost;
3236 
3237  // Compute corresponding vector type for return value and arguments.
3238  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3239  for (Type *ScalarTy : ScalarTys)
3240  Tys.push_back(ToVectorTy(ScalarTy, VF));
3241 
3242  // Compute costs of unpacking argument values for the scalar calls and
3243  // packing the return values to a vector.
3244  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3245 
3246  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3247 
3248  // If we can't emit a vector call for this function, then the currently found
3249  // cost is the cost we need to return.
3250  NeedToScalarize = true;
3251  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3252  return Cost;
3253 
3254  // If the corresponding vector cost is cheaper, return its cost.
3255  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3256  if (VectorCallCost < Cost) {
3257  NeedToScalarize = false;
3258  return VectorCallCost;
3259  }
3260  return Cost;
3261 }
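// Worked example (all costs hypothetical): with VF = 4, a scalar call cost of
// 10 and a scalarization overhead of 6, the scalarized estimate is
// 10 * 4 + 6 = 46. If the TLI knows a vectorized variant whose call cost is
// 20, NeedToScalarize is cleared and 20 is returned; otherwise 46 is returned
// with NeedToScalarize left set.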
3262 
3264  unsigned VF) {
3266  assert(ID && "Expected intrinsic call!");
3267 
3268  FastMathFlags FMF;
3269  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3270  FMF = FPMO->getFastMathFlags();
3271 
3273  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3274 }
3275 
3277  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3278  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3279  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3280 }
3282  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3283  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3284  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3285 }
3286 
3288  // For every instruction `I` in MinBWs, truncate the operands, create a
3289  // truncated version of `I` and reextend its result. InstCombine runs
3290  // later and will remove any ext/trunc pairs.
3291  SmallPtrSet<Value *, 4> Erased;
3292  for (const auto &KV : Cost->getMinimalBitwidths()) {
3293  // If the value wasn't vectorized, we must maintain the original scalar
3294  // type. The absence of the value from VectorLoopValueMap indicates that it
3295  // wasn't vectorized.
3296  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3297  continue;
3298  for (unsigned Part = 0; Part < UF; ++Part) {
3299  Value *I = getOrCreateVectorValue(KV.first, Part);
3300  if (Erased.find(I) != Erased.end() || I->use_empty() ||
3301  !isa<Instruction>(I))
3302  continue;
3303  Type *OriginalTy = I->getType();
3304  Type *ScalarTruncatedTy =
3305  IntegerType::get(OriginalTy->getContext(), KV.second);
3306  Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3307  OriginalTy->getVectorNumElements());
3308  if (TruncatedTy == OriginalTy)
3309  continue;
3310 
3311  IRBuilder<> B(cast<Instruction>(I));
3312  auto ShrinkOperand = [&](Value *V) -> Value * {
3313  if (auto *ZI = dyn_cast<ZExtInst>(V))
3314  if (ZI->getSrcTy() == TruncatedTy)
3315  return ZI->getOperand(0);
3316  return B.CreateZExtOrTrunc(V, TruncatedTy);
3317  };
3318 
3319  // The actual instruction modification depends on the instruction type,
3320  // unfortunately.
3321  Value *NewI = nullptr;
3322  if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3323  NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3324  ShrinkOperand(BO->getOperand(1)));
3325 
3326  // Any wrapping introduced by shrinking this operation shouldn't be
3327  // considered undefined behavior. So, we can't unconditionally copy
3328  // arithmetic wrapping flags to NewI.
3329  cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3330  } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3331  NewI =
3332  B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3333  ShrinkOperand(CI->getOperand(1)));
3334  } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3335  NewI = B.CreateSelect(SI->getCondition(),
3336  ShrinkOperand(SI->getTrueValue()),
3337  ShrinkOperand(SI->getFalseValue()));
3338  } else if (auto *CI = dyn_cast<CastInst>(I)) {
3339  switch (CI->getOpcode()) {
3340  default:
3341  llvm_unreachable("Unhandled cast!");
3342  case Instruction::Trunc:
3343  NewI = ShrinkOperand(CI->getOperand(0));
3344  break;
3345  case Instruction::SExt:
3346  NewI = B.CreateSExtOrTrunc(
3347  CI->getOperand(0),
3348  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3349  break;
3350  case Instruction::ZExt:
3351  NewI = B.CreateZExtOrTrunc(
3352  CI->getOperand(0),
3353  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3354  break;
3355  }
3356  } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3357  auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3358  auto *O0 = B.CreateZExtOrTrunc(
3359  SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3360  auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3361  auto *O1 = B.CreateZExtOrTrunc(
3362  SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3363 
3364  NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3365  } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3366  // Don't do anything with the operands, just extend the result.
3367  continue;
3368  } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3369  auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3370  auto *O0 = B.CreateZExtOrTrunc(
3371  IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3372  auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3373  NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3374  } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3375  auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3376  auto *O0 = B.CreateZExtOrTrunc(
3377  EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3378  NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3379  } else {
3380  // If we don't know what to do, be conservative and don't do anything.
3381  continue;
3382  }
3383 
3384  // Lastly, extend the result.
3385  NewI->takeName(cast<Instruction>(I));
3386  Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3387  I->replaceAllUsesWith(Res);
3388  cast<Instruction>(I)->eraseFromParent();
3389  Erased.insert(I);
3390  VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3391  }
3392  }
3393 
3394  // We'll have created a bunch of ZExts that are now parentless. Clean up.
3395  for (const auto &KV : Cost->getMinimalBitwidths()) {
3396  // If the value wasn't vectorized, we must maintain the original scalar
3397  // type. The absence of the value from VectorLoopValueMap indicates that it
3398  // wasn't vectorized.
3399  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3400  continue;
3401  for (unsigned Part = 0; Part < UF; ++Part) {
3402  Value *I = getOrCreateVectorValue(KV.first, Part);
3403  ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3404  if (Inst && Inst->use_empty()) {
3405  Value *NewI = Inst->getOperand(0);
3406  Inst->eraseFromParent();
3407  VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3408  }
3409  }
3410  }
3411 }
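// Illustration (shorthand IR, VF = 4, assuming the cost model proved 8 bits
// suffice for the value): a widened
//   %a = add <4 x i32> %x, %y
// is rewritten as
//   %xt = trunc <4 x i32> %x to <4 x i8>
//   %yt = trunc <4 x i32> %y to <4 x i8>
//   %at = add <4 x i8> %xt, %yt
//   %az = zext <4 x i8> %at to <4 x i32>
// and the later InstCombine run cleans up redundant ext/trunc pairs.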
3412 
3414  // Insert truncates and extends for any truncated instructions as hints to
3415  // InstCombine.
3416  if (VF > 1)
3418 
3419  // Fix widened non-induction PHIs by setting up the PHI operands.
3420  if (OrigPHIsToFix.size()) {
3422  "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3424  }
3425 
3426  // At this point every instruction in the original loop is widened to a
3427  // vector form. Now we need to fix the recurrences in the loop. These PHI
3428  // nodes are currently empty because we did not want to introduce cycles.
3429  // This is the second stage of vectorizing recurrences.
3431 
3432  // Update the dominator tree.
3433  //
3434  // FIXME: After creating the structure of the new loop, the dominator tree is
3435  // no longer up-to-date, and it remains that way until we update it
3436  // here. An out-of-date dominator tree is problematic for SCEV,
3437  // because SCEVExpander uses it to guide code generation. The
3438  // vectorizer uses SCEVExpander in several places. Instead, we should
3439  // keep the dominator tree up-to-date as we go.
3440  updateAnalysis();
3441 
3442  // Fix-up external users of the induction variables.
3443  for (auto &Entry : *Legal->getInductionVars())
3444  fixupIVUsers(Entry.first, Entry.second,
3447 
3448  fixLCSSAPHIs();
3450  sinkScalarOperands(&*PI);
3451 
3452  // Remove redundant induction instructions.
3454 }
3455 
3457  // In order to support recurrences we need to be able to vectorize Phi nodes.
3458  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3459  // stage #2: We now need to fix the recurrences by adding incoming edges to
3460  // the currently empty PHI nodes. At this point every instruction in the
3461  // original loop is widened to a vector form so we can use them to construct
3462  // the incoming edges.
3463  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3464  // Handle first-order recurrences and reductions that need to be fixed.
3465  if (Legal->isFirstOrderRecurrence(&Phi))
3467  else if (Legal->isReductionVariable(&Phi))
3468  fixReduction(&Phi);
3469  }
3470 }
3471 
3473  // This is the second phase of vectorizing first-order recurrences. An
3474  // overview of the transformation is described below. Suppose we have the
3475  // following loop.
3476  //
3477  // for (int i = 0; i < n; ++i)
3478  // b[i] = a[i] - a[i - 1];
3479  //
3480  // There is a first-order recurrence on "a". For this loop, the shorthand
3481  // scalar IR looks like:
3482  //
3483  // scalar.ph:
3484  // s_init = a[-1]
3485  // br scalar.body
3486  //
3487  // scalar.body:
3488  // i = phi [0, scalar.ph], [i+1, scalar.body]
3489  // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3490  // s2 = a[i]
3491  // b[i] = s2 - s1
3492  // br cond, scalar.body, ...
3493  //
3494  // In this example, s1 is a recurrence because its value depends on the
3495  // previous iteration. In the first phase of vectorization, we created a
3496  // temporary value for s1. We now complete the vectorization and produce the
3497  // shorthand vector IR shown below (for VF = 4, UF = 1).
3498  //
3499  // vector.ph:
3500  // v_init = vector(..., ..., ..., a[-1])
3501  // br vector.body
3502  //
3503  // vector.body
3504  // i = phi [0, vector.ph], [i+4, vector.body]
3505  // v1 = phi [v_init, vector.ph], [v2, vector.body]
3506  // v2 = a[i, i+1, i+2, i+3];
3507  // v3 = vector(v1(3), v2(0, 1, 2))
3508  // b[i, i+1, i+2, i+3] = v2 - v3
3509  // br cond, vector.body, middle.block
3510  //
3511  // middle.block:
3512  // x = v2(3)
3513  // br scalar.ph
3514  //
3515  // scalar.ph:
3516  // s_init = phi [x, middle.block], [a[-1], otherwise]
3517  // br scalar.body
3518  //
3519  // After the vector loop completes execution, we extract the next value of
3520  // the recurrence (x) to use as the initial value in the scalar loop.
3521 
3522  // Get the original loop preheader and single loop latch.
3523  auto *Preheader = OrigLoop->getLoopPreheader();
3524  auto *Latch = OrigLoop->getLoopLatch();
3525 
3526  // Get the initial and previous values of the scalar recurrence.
3527  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3528  auto *Previous = Phi->getIncomingValueForBlock(Latch);
3529 
3530  // Create a vector from the initial value.
3531  auto *VectorInit = ScalarInit;
3532  if (VF > 1) {
3534  VectorInit = Builder.CreateInsertElement(
3535  UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3536  Builder.getInt32(VF - 1), "vector.recur.init");
3537  }
3538 
3539  // We constructed a temporary phi node in the first phase of vectorization.
3540  // This phi node will eventually be deleted.
3542  cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3543 
3544  // Create a phi node for the new recurrence. The current value will either be
3545  // the initial value inserted into a vector or loop-varying vector value.
3546  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3547  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3548 
3549  // Get the vectorized previous value of the last part UF - 1. It appears last
3550  // among all unrolled iterations, due to the order of their construction.
3551  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3552 
3553  // Set the insertion point after the previous value if it is an instruction.
3554  // Note that the previous value may have been constant-folded so it is not
3555  // guaranteed to be an instruction in the vector loop. Also, if the previous
3556  // value is a phi node, we should insert after all the phi nodes to avoid
3557  // breaking basic block verification.
3558  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3559  isa<PHINode>(PreviousLastPart))
3561  else
3563  &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3564 
3565  // We will construct a vector for the recurrence by combining the values for
3566  // the current and previous iterations. This is the required shuffle mask.
3567  SmallVector<Constant *, 8> ShuffleMask(VF);
3568  ShuffleMask[0] = Builder.getInt32(VF - 1);
3569  for (unsigned I = 1; I < VF; ++I)
3570  ShuffleMask[I] = Builder.getInt32(I + VF - 1);
3571 
3572  // The vector from which to take the initial value for the current iteration
3573  // (actual or unrolled). Initially, this is the vector phi node.
3574  Value *Incoming = VecPhi;
3575 
3576  // Shuffle the current and previous vector and update the vector parts.
3577  for (unsigned Part = 0; Part < UF; ++Part) {
3578  Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3579  Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3580  auto *Shuffle =
3581  VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3582  ConstantVector::get(ShuffleMask))
3583  : Incoming;
3584  PhiPart->replaceAllUsesWith(Shuffle);
3585  cast<Instruction>(PhiPart)->eraseFromParent();
3586  VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3587  Incoming = PreviousPart;
3588  }
3589 
3590  // Fix the latch value of the new recurrence in the vector loop.
3591  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3592 
3593  // Extract the last vector element in the middle block. This will be the
3594  // initial value for the recurrence when jumping to the scalar loop.
3595  auto *ExtractForScalar = Incoming;
3596  if (VF > 1) {
3598  ExtractForScalar = Builder.CreateExtractElement(
3599  ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3600  }
3601  // Extract the second-to-last element in the middle block if the
3602  // Phi is used outside the loop. We need to extract the phi itself
3603  // and not the last element (the phi update in the current iteration). This
3604  // will be the value when jumping to the exit block from the LoopMiddleBlock,
3605  // when the scalar loop is not run at all.
3606  Value *ExtractForPhiUsedOutsideLoop = nullptr;
3607  if (VF > 1)
3608  ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3609  Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3610  // When the loop is unrolled without vectorizing, initialize
3611  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
3612  // value of `Incoming`. This is analogous to the vectorized case above:
3613  // extracting the second-to-last element when VF > 1.
3614  else if (UF > 1)
3615  ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3616 
3617  // Fix the initial value of the original recurrence in the scalar loop.
3619  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3620  for (auto *BB : predecessors(LoopScalarPreHeader)) {
3621  auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3622  Start->addIncoming(Incoming, BB);
3623  }
3624 
3626  Phi->setName("scalar.recur");
3627 
3628  // Finally, fix users of the recurrence outside the loop. The users will need
3629  // either the last value of the scalar recurrence or the last value of the
3630  // vector recurrence we extracted in the middle block. Since the loop is in
3631  // LCSSA form, we just need to find all the phi nodes for the original scalar
3632  // recurrence in the exit block, and then add an edge for the middle block.
3633  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3634  if (LCSSAPhi.getIncomingValue(0) == Phi) {
3635  LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3636  }
3637  }
3638 }
3639 
3641  Constant *Zero = Builder.getInt32(0);
3642 
3643  // Get its reduction variable descriptor.
3645  "Unable to find the reduction variable");
3646  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3647 
3649  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3650  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3652  RdxDesc.getMinMaxRecurrenceKind();
3653  setDebugLocFromInst(Builder, ReductionStartValue);
3654 
3655  // We need to generate a reduction vector from the incoming scalar.
3656  // To do so, we need to generate the 'identity' vector and override
3657  // one of the elements with the incoming scalar reduction. We need
3658  // to do it in the vector-loop preheader.
3660 
3661  // This is the vector-clone of the value that leaves the loop.
3662  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3663 
3664  // Find the reduction identity variable. Zero for addition and for or/xor;
3665  // one for multiplication; -1 (all ones) for and.
3666  Value *Identity;
3667  Value *VectorStart;
3670  // MinMax reductions have the start value as their identity.
3671  if (VF == 1) {
3672  VectorStart = Identity = ReductionStartValue;
3673  } else {
3674  VectorStart = Identity =
3675  Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3676  }
3677  } else {
3678  // Handle other reduction kinds:
3680  RK, VecTy->getScalarType());
3681  if (VF == 1) {
3682  Identity = Iden;
3683  // This vector is the Identity vector where the first element is the
3684  // incoming scalar reduction.
3685  VectorStart = ReductionStartValue;
3686  } else {
3687  Identity = ConstantVector::getSplat(VF, Iden);
3688 
3689  // This vector is the Identity vector where the first element is the
3690  // incoming scalar reduction.
3691  VectorStart =
3692  Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3693  }
3694  }
3695 
3696  // Fix the vector-loop phi.
3697 
3698  // Reductions do not have to start at zero. They can start with
3699  // any loop invariant values.
3700  BasicBlock *Latch = OrigLoop->getLoopLatch();
3701  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3702  for (unsigned Part = 0; Part < UF; ++Part) {
3703  Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3704  Value *Val = getOrCreateVectorValue(LoopVal, Part);
3705  // Make sure to add the reduction start value only to the
3706  // first unroll part.
3707  Value *StartVal = (Part == 0) ? VectorStart : Identity;
3708  cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3709  cast<PHINode>(VecRdxPhi)
3710  ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3711  }
3712 
3713  // Before each round, move the insertion point right between
3714  // the PHIs and the values we are going to write.
3715  // This allows us to write both PHINodes and the extractelement
3716  // instructions.
3718 
3719  setDebugLocFromInst(Builder, LoopExitInst);
3720 
3721  // If tail is folded by masking, the vector value to leave the loop should be
3722  // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3723  // instead of the former.
3724  if (Cost->foldTailByMasking()) {
3725  for (unsigned Part = 0; Part < UF; ++Part) {
3726  Value *VecLoopExitInst =
3727  VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3728  Value *Sel = nullptr;
3729  for (User *U : VecLoopExitInst->users()) {
3730  if (isa<SelectInst>(U)) {
3731  assert(!Sel && "Reduction exit feeding two selects");
3732  Sel = U;
3733  } else
3734  assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3735  }
3736  assert(Sel && "Reduction exit feeds no select");
3737  VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3738  }
3739  }
3740 
3741  // If the vector reduction can be performed in a smaller type, we truncate
3742  // then extend the loop exit value to enable InstCombine to evaluate the
3743  // entire expression in the smaller type.
3744  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3745  Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3748  VectorParts RdxParts(UF);
3749  for (unsigned Part = 0; Part < UF; ++Part) {
3750  RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3751  Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3752  Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3753  : Builder.CreateZExt(Trunc, VecTy);
3754  for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3755  UI != RdxParts[Part]->user_end();)
3756  if (*UI != Trunc) {
3757  (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3758  RdxParts[Part] = Extnd;
3759  } else {
3760  ++UI;
3761  }
3762  }
3764  for (unsigned Part = 0; Part < UF; ++Part) {
3765  RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3766  VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3767  }
3768  }
3769 
3770  // Reduce all of the unrolled parts into a single vector.
3771  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3773 
3774  // The middle block terminator has already been assigned a DebugLoc here (the
3775  // OrigLoop's single latch terminator). We want the whole middle block to
3776  // appear to execute on this line because: (a) it is all compiler generated,
3777  // (b) these instructions are always executed after evaluating the latch
3778  // conditional branch, and (c) other passes may add new predecessors which
3779  // terminate on this line. This is the easiest way to ensure we don't
3780  // accidentally cause an extra step back into the loop while debugging.
3782  for (unsigned Part = 1; Part < UF; ++Part) {
3783  Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3784  if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3785  // Floating point operations had to be 'fast' to enable the reduction.
3786  ReducedPartRdx = addFastMathFlag(
3788  ReducedPartRdx, "bin.rdx"),
3789  RdxDesc.getFastMathFlags());
3790  else
3791  ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3792  RdxPart);
3793  }
3794 
3795  if (VF > 1) {
3796  bool NoNaN = Legal->hasFunNoNaNAttr();
3797  ReducedPartRdx =
3798  createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3799  // If the reduction can be performed in a smaller type, we need to extend
3800  // the reduction to the wider type before we branch to the original loop.
3801  if (Phi->getType() != RdxDesc.getRecurrenceType())
3802  ReducedPartRdx =
3803  RdxDesc.isSigned()
3804  ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3805  : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3806  }
3807 
3808  // Create a phi node that merges control-flow from the backedge-taken check
3809  // block and the middle block.
3810  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3812  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3813  BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3814  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3815 
3816  // Now, we need to fix the users of the reduction variable
3817  // inside and outside of the scalar remainder loop.
3818  // We know that the loop is in LCSSA form. We need to update the
3819  // PHI nodes in the exit blocks.
3820  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3821  // All PHINodes need to have a single entry edge, or two if
3822  // we already fixed them.
3823  assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3824 
3825  // We found a reduction value exit-PHI. Update it with the
3826  // incoming bypass edge.
3827  if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3828  LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3829  } // end of the LCSSA phi scan.
3830 
3831  // Fix the scalar loop reduction variable with the incoming reduction sum
3832  // from the vector body and from the backedge value.
3833  int IncomingEdgeBlockIdx =
3835  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3836  // Pick the other block.
3837  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3838  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3839  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3840 }
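// Worked example (hypothetical integer add reduction, VF = 4, UF = 2, start
// value 7): the part-0 vector phi enters the loop as <7, 0, 0, 0> and part 1
// as the identity <0, 0, 0, 0>. In the middle block the two unrolled parts are
// added into one vector (bin.rdx), a horizontal reduction collapses it to a
// scalar, and bc.merge.rdx forwards either that scalar or the original start
// value (on a bypass edge) to the remainder loop's reduction phi.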
3841 
3843  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3844  if (LCSSAPhi.getNumIncomingValues() == 1) {
3845  auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3846  // Non-instruction incoming values will have only one value.
3847  unsigned LastLane = 0;
3848  if (isa<Instruction>(IncomingValue))
3849  LastLane = Cost->isUniformAfterVectorization(
3850  cast<Instruction>(IncomingValue), VF)
3851  ? 0
3852  : VF - 1;
3853  // Can be a loop invariant incoming value or the last scalar value to be
3854  // extracted from the vectorized loop.
3856  Value *lastIncomingValue =
3857  getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3858  LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3859  }
3860  }
3861 }
3862 
3864  // The basic block and loop containing the predicated instruction.
3865  auto *PredBB = PredInst->getParent();
3866  auto *VectorLoop = LI->getLoopFor(PredBB);
3867 
3868  // Initialize a worklist with the operands of the predicated instruction.
3869  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3870 
3871  // Holds instructions that we need to analyze again. An instruction may be
3872  // reanalyzed if we don't yet know if we can sink it or not.
3873  SmallVector<Instruction *, 8> InstsToReanalyze;
3874 
3875  // Returns true if a given use occurs in the predicated block. Phi nodes use
3876  // their operands in their corresponding predecessor blocks.
3877  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3878  auto *I = cast<Instruction>(U.getUser());
3879  BasicBlock *BB = I->getParent();
3880  if (auto *Phi = dyn_cast<PHINode>(I))
3881  BB = Phi->getIncomingBlock(
3882  PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3883  return BB == PredBB;
3884  };
3885 
3886  // Iteratively sink the scalarized operands of the predicated instruction
3887  // into the block we created for it. When an instruction is sunk, its
3888  // operands are then added to the worklist. The algorithm ends when one pass
3889  // through the worklist doesn't sink a single instruction.
3890  bool Changed;
3891  do {
3892  // Add the instructions that need to be reanalyzed to the worklist, and
3893  // reset the changed indicator.
3894  Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3895  InstsToReanalyze.clear();
3896  Changed = false;
3897 
3898  while (!Worklist.empty()) {
3899  auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3900 
3901  // We can't sink an instruction if it is a phi node, is already in the
3902  // predicated block, is not in the loop, or may have side effects.
3903  if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3904  !VectorLoop->contains(I) || I->mayHaveSideEffects())
3905  continue;
3906 
3907  // It's legal to sink the instruction if all its uses occur in the
3908  // predicated block. Otherwise, there's nothing to do yet, and we may
3909  // need to reanalyze the instruction.
3910  if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3911  InstsToReanalyze.push_back(I);
3912  continue;
3913  }
3914 
3915  // Move the instruction to the beginning of the predicated block, and add
3916  // its operands to the worklist.
3917  I->moveBefore(&*PredBB->getFirstInsertionPt());
3918  Worklist.insert(I->op_begin(), I->op_end());
3919 
3920  // The sinking may have enabled other instructions to be sunk, so we will
3921  // need to iterate.
3922  Changed = true;
3923  }
3924  } while (Changed);
3925 }
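// Example (illustration only): if the predicated block created for a
// scalarized store contains
//   store i32 %t, i32* %p
// and %t = add i32 %a, %b has no uses outside that block, the add is moved to
// the top of the predicated block; %a and %b are then reconsidered in case
// their defining instructions can be sunk as well.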
3926 
3928  for (PHINode *OrigPhi : OrigPHIsToFix) {
3929  PHINode *NewPhi =
3930  cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3931  unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3932 
3933  SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3934  predecessors(OrigPhi->getParent()));
3935  SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3936  predecessors(NewPhi->getParent()));
3937  assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3938  "Scalar and Vector BB should have the same number of predecessors");
3939 
3940  // The insertion point in Builder may be invalidated by the time we get
3941  // here. Force the Builder insertion point to something valid so that we do
3942  // not run into issues during insertion point restore in
3943  // getOrCreateVectorValue calls below.
3944  Builder.SetInsertPoint(NewPhi);
3945 
3946  // The predecessor order is preserved and we can rely on mapping between
3947  // scalar and vector block predecessors.
3948  for (unsigned i = 0; i < NumIncomingValues; ++i) {
3949  BasicBlock *NewPredBB = VectorBBPredecessors[i];
3950 
3951  // When looking up the new scalar/vector values to fix up, use incoming
3952  // values from original phi.
3953  Value *ScIncV =
3954  OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3955 
3956  // Scalar incoming value may need a broadcast
3957  Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3958  NewPhi->addIncoming(NewIncV, NewPredBB);
3959  }
3960  }
3961 }
3962 
3964  unsigned VF) {
3965  PHINode *P = cast<PHINode>(PN);
3966  if (EnableVPlanNativePath) {
3967  // Currently we enter here in the VPlan-native path for non-induction
3968  // PHIs where all control flow is uniform. We simply widen these PHIs.
3969  // Create a vector phi with no operands - the vector phi operands will be
3970  // set at the end of vector code generation.
3971  Type *VecTy =
3972  (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3973  Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3974  VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3975  OrigPHIsToFix.push_back(P);
3976 
3977  return;
3978  }
3979 
3980  assert(PN->getParent() == OrigLoop->getHeader() &&
3981  "Non-header phis should have been handled elsewhere");
3982 
3983  // In order to support recurrences we need to be able to vectorize Phi nodes.
3984  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3985  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3986  // this value when we vectorize all of the instructions that use the PHI.
3988  for (unsigned Part = 0; Part < UF; ++Part) {
3989  // This is phase one of vectorizing PHIs.
3990  Type *VecTy =
3991  (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3992  Value *EntryPart = PHINode::Create(
3993  VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3994  VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3995  }
3996  return;
3997  }
3998 
4000 
4001  // This PHINode must be an induction variable.
4002  // Make sure that we know about it.
4003  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
4004 
4006  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4007 
4008  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4009  // which can be found from the original scalar operations.
4010  switch (II.getKind()) {
4012  llvm_unreachable("Unknown induction");
4015  llvm_unreachable("Integer/fp induction is handled elsewhere.");
4017  // Handle the pointer induction variable case.
4018  assert(P->getType()->isPointerTy() && "Unexpected type.");
4019  // This is the normalized GEP that starts counting at zero.
4020  Value *PtrInd = Induction;
4021  PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4022  // Determine the number of scalars we need to generate for each unroll
4023  // iteration. If the instruction is uniform, we only need to generate the
4024  // first lane. Otherwise, we generate all VF values.
4025  unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4026  // These are the scalar results. Notice that we don't generate vector GEPs
4027  // because scalar GEPs result in better code.
4028  for (unsigned Part = 0; Part < UF; ++Part) {
4029  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4030  Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4031  Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4032  Value *SclrGep =
4033  emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4034  SclrGep->setName("next.gep");
4035  VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4036  }
4037  }
4038  return;
4039  }
4040  }
4041 }
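// Illustration for the pointer-induction case (VF = 4, UF = 2, a non-uniform
// pointer IV assumed, so all VF lanes are generated): lane L of part P gets a
// scalar
//   next.gep = emitTransformedIndex(PtrInd + P * 4 + L)
// i.e. one scalar GEP per generated lane, which tends to produce better code
// than a single vector-of-pointers GEP.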
4042 
4043 /// A helper function for checking whether an integer division-related
4044 /// instruction may divide by zero (in which case it must be predicated if
4045 /// executed conditionally in the scalar code).
4046 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4047 /// Non-zero divisors that are not compile-time constants will not be
4048 /// converted into multiplication, so we will still end up scalarizing
4049 /// the division, but can do so without predication.
4051  assert((I.getOpcode() == Instruction::UDiv ||
4052  I.getOpcode() == Instruction::SDiv ||
4053  I.getOpcode() == Instruction::URem ||
4054  I.getOpcode() == Instruction::SRem) &&
4055  "Unexpected instruction");
4056  Value *Divisor = I.getOperand(1);
4057  auto *CInt = dyn_cast<ConstantInt>(Divisor);
4058  return !CInt || CInt->isZero();
4059 }
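// For instance, `udiv i32 %x, %n` with a loop-varying %n returns true (the
// divisor is not a known non-zero constant, so the division must be predicated
// if it only executes conditionally in the scalar code), while `udiv i32 %x, 7`
// returns false and can be scalarized without predication.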
4060 
4062  switch (I.getOpcode()) {
4063  case Instruction::Br:
4064  case Instruction::PHI:
4065  llvm_unreachable("This instruction is handled by a different recipe.");
4066  case Instruction::GetElementPtr: {
4067  // Construct a vector GEP by widening the operands of the scalar GEP as
4068  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4069  // results in a vector of pointers when at least one operand of the GEP
4070  // is vector-typed. Thus, to keep the representation compact, we only use
4071  // vector-typed operands for loop-varying values.
4072  auto *GEP = cast<GetElementPtrInst>(&I);
4073 
4074  if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
4075  // If we are vectorizing, but the GEP has only loop-invariant operands,
4076  // the GEP we build (by only using vector-typed operands for
4077  // loop-varying values) would be a scalar pointer. Thus, to ensure we
4078  // produce a vector of pointers, we need to either arbitrarily pick an
4079  // operand to broadcast, or broadcast a clone of the original GEP.
4080  // Here, we broadcast a clone of the original.
4081  //
4082  // TODO: If at some point we decide to scalarize instructions having
4083  // loop-invariant operands, this special case will no longer be
4084  // required. We would add the scalarization decision to
4085  // collectLoopScalars() and teach getVectorValue() to broadcast
4086  // the lane-zero scalar value.
4087  auto *Clone = Builder.Insert(GEP->clone());
4088  for (unsigned Part = 0; Part < UF; ++Part) {
4089  Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4090  VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
4091  addMetadata(EntryPart, GEP);
4092  }
4093  } else {
4094  // If the GEP has at least one loop-varying operand, we are sure to
4095  // produce a vector of pointers. But if we are only unrolling, we want
4096  // to produce a scalar GEP for each unroll part. Thus, the GEP we
4097  // produce with the code below will be scalar (if VF == 1) or vector
4098  // (otherwise). Note that for the unroll-only case, we still maintain
4099  // values in the vector mapping with initVector, as we do for other
4100  // instructions.
4101  for (unsigned Part = 0; Part < UF; ++Part) {
4102  // The pointer operand of the new GEP. If it's loop-invariant, we
4103  // won't broadcast it.
4104  auto *Ptr =
4105  OrigLoop->isLoopInvariant(GEP->getPointerOperand())
4106  ? GEP->getPointerOperand()
4107  : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4108 
4109  // Collect all the indices for the new GEP. If any index is
4110  // loop-invariant, we won't broadcast it.
4111  SmallVector<Value *, 4> Indices;
4112  for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
4113  if (OrigLoop->isLoopInvariant(U.get()))
4114  Indices.push_back(U.get());
4115  else
4116  Indices.push_back(getOrCreateVectorValue(U.get(), Part));
4117  }
4118 
4119  // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4120  // but it should be a vector, otherwise.
4121  auto *NewGEP =
4122  GEP->isInBounds()
4123  ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4124  Indices)
4125  : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4126  assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4127  "NewGEP is not a pointer vector");
4128  VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
4129  addMetadata(NewGEP, GEP);
4130  }
4131  }
4132 
4133  break;
4134  }
4135  case Instruction::UDiv:
4136  case Instruction::SDiv:
4137  case Instruction::SRem:
4138  case Instruction::URem:
4139  case Instruction::Add:
4140  case Instruction::FAdd:
4141  case Instruction::Sub:
4142  case Instruction::FSub:
4143  case Instruction::FNeg:
4144  case Instruction::Mul:
4145  case Instruction::FMul:
4146  case Instruction::FDiv:
4147  case Instruction::FRem:
4148  case Instruction::Shl:
4149  case Instruction::LShr:
4150  case Instruction::AShr:
4151  case Instruction::And:
4152  case Instruction::Or:
4153  case Instruction::Xor: {
4154  // Just widen unops and binops.
4156 
4157  for (unsigned Part = 0; Part < UF; ++Part) {
4159  for (Value *Op : I.operands())
4160  Ops.push_back(getOrCreateVectorValue(Op, Part));
4161 
4162  Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4163 
4164  if (auto *VecOp = dyn_cast<Instruction>(V))
4165  VecOp->copyIRFlags(&I);
4166 
4167  // Use this vector value for all users of the original instruction.
4168  VectorLoopValueMap.setVectorValue(&I, Part, V);
4169  addMetadata(V, &I);
4170  }
4171 
4172  break;
4173  }
4174  case Instruction::Select: {
4175  // Widen selects.
4176  // If the selector is loop invariant we can create a select
4177  // instruction with a scalar condition. Otherwise, use vector-select.
4178  auto *SE = PSE.getSE();
4179  bool InvariantCond =
4182 
4183  // The condition can be loop invariant but still defined inside the
4184  // loop. This means that we can't just use the original 'cond' value.
4185  // We have to take the 'vectorized' value and pick the first lane.
4186  // Instcombine will make this a no-op.
4187 
4188  auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4189 
4190  for (unsigned Part = 0; Part < UF; ++Part) {
4191  Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4192  Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4193  Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4194  Value *Sel =
4195  Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4196  VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4197  addMetadata(Sel, &I);
4198  }
4199 
4200  break;
4201  }
4202 
4203  case Instruction::ICmp:
4204  case Instruction::FCmp: {
4205  // Widen compares. Generate vector compares.
4206  bool FCmp = (I.getOpcode() == Instruction::FCmp);
4207  auto *Cmp = cast<CmpInst>(&I);
4209  for (unsigned Part = 0; Part < UF; ++Part) {
4210  Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4211  Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4212  Value *C = nullptr;
4213  if (FCmp) {
4214  // Propagate fast math flags.
4216  Builder.setFastMathFlags(Cmp->getFastMathFlags());
4217  C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4218  } else {
4219  C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4220  }
4221  VectorLoopValueMap.setVectorValue(&I, Part, C);
4222  addMetadata(C, &I);
4223  }
4224 
4225  break;
4226  }
4227 
4228  case Instruction::ZExt:
4229  case Instruction::SExt:
4230  case Instruction::FPToUI:
4231  case Instruction::FPToSI:
4232  case Instruction::FPExt:
4233  case Instruction::PtrToInt:
4234  case Instruction::IntToPtr:
4235  case Instruction::SIToFP:
4236  case Instruction::UIToFP:
4237  case Instruction::Trunc:
4238  case Instruction::FPTrunc:
4239  case Instruction::BitCast: {
4240  auto *CI = cast<CastInst>(&I);
4242 
4243  /// Vectorize casts.
4244  Type *DestTy =
4245  (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4246 
4247  for (unsigned Part = 0; Part < UF; ++Part) {
4248  Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4249  Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4250  VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4251  addMetadata(Cast, &I);
4252  }
4253  break;
4254  }
4255 
4256  case Instruction::Call: {
4257  // Ignore dbg intrinsics.
4258  if (isa<DbgInfoIntrinsic>(I))
4259  break;
4261 
4262  Module *M = I.getParent()->getParent()->getParent();
4263  auto *CI = cast<CallInst>(&I);
4264 
4265  StringRef FnName = CI->getCalledFunction()->getName();
4266  Function *F = CI->getCalledFunction();
4267  Type *RetTy = ToVectorTy(CI->getType(), VF);
4269  for (Value *ArgOperand : CI->arg_operands())
4270  Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4271 
4273 
4274  // This flag shows whether we use an intrinsic or an ordinary call for the
4275  // vectorized version of the instruction, i.e. whether it is beneficial to
4276  // perform the intrinsic call rather than the library call.
4277  bool NeedToScalarize;
4278  unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4279  bool UseVectorIntrinsic =
4280  ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4281  assert((UseVectorIntrinsic || !NeedToScalarize) &&
4282  "Instruction should be scalarized elsewhere.");
4283 
4284  for (unsigned Part = 0; Part < UF; ++Part) {
4286  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4287  Value *Arg = CI->getArgOperand(i);
4288  // Some intrinsics have a scalar argument - don't replace it with a
4289  // vector.
4290  if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4291  Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4292  Args.push_back(Arg);
4293  }
4294 
4295  Function *VectorF;
4296  if (UseVectorIntrinsic) {
4297  // Use vector version of the intrinsic.
4298  Type *TysForDecl[] = {CI->getType()};
4299  if (VF > 1)
4300  TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4301  VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4302  } else {
4303  // Use vector version of the library call.
4304  StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4305  assert(!VFnName.empty() && "Vector function name is empty.");
4306  VectorF = M->getFunction(VFnName);
4307  if (!VectorF) {
4308  // Generate a declaration
4309  FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4310  VectorF =
4311  Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4312  VectorF->copyAttributesFrom(F);
4313  }
4314  }
4315  assert(VectorF && "Can't create vector function.");
4316 
4317  SmallVector<OperandBundleDef, 1> OpBundles;
4318  CI->getOperandBundlesAsDefs(OpBundles);
4319  CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4320 
4321  if (isa<FPMathOperator>(V))
4322  V->copyFastMathFlags(CI);
4323 
4324  VectorLoopValueMap.setVectorValue(&I, Part, V);
4325  addMetadata(V, &I);
4326  }
4327 
4328  break;
4329  }
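  // Illustrative example (hypothetical, value names are placeholders): a
  // scalar call
  //   %r = call float @llvm.sqrt.f32(float %x)
  // may be widened at VF = 4 into the vector intrinsic
  //   %r.vec = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x.vec)
  // or, if the cost model preferred it, into a vector library routine that
  // TLI maps to the scalar function.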
4330 
4331  default:
4332  // This instruction is not vectorized by simple widening.
4333  LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4334  llvm_unreachable("Unhandled instruction!");
4335  } // end of switch.
4336 }
4337 
4338 void InnerLoopVectorizer::updateAnalysis() {
4339  // Forget the original basic block.
4340  PSE.getSE()->forgetLoop(OrigLoop);
4341 
4342  // DT is not kept up-to-date for outer loop vectorization
4343  if (EnableVPlanNativePath)
4344  return;
4345 
4346  // Update the dominator tree information.
4347  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4348  "Entry does not dominate exit.");
4349 
4350  DT->addNewBlock(LoopMiddleBlock,
4351  LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4352  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4353  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4354  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4355  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
4356 }
4357 
4358 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4359  // We should not collect Scalars more than once per VF. Right now, this
4360  // function is called from collectUniformsAndScalars(), which already does
4361  // this check. Collecting Scalars for VF=1 does not make any sense.
4362  assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4363  "This function should not be visited twice for the same VF");
4364 
4365  SmallSetVector<Instruction *, 8> Worklist;
4366 
4367  // These sets are used to seed the analysis with pointers used by memory
4368  // accesses that will remain scalar.
4369  SmallSetVector<Instruction *, 8> ScalarPtrs;
4370  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4371 
4372  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4373  // The pointer operands of loads and stores will be scalar as long as the
4374  // memory access is not a gather or scatter operation. The value operand of a
4375  // store will remain scalar if the store is scalarized.
4376  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4377  InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4378  assert(WideningDecision != CM_Unknown &&
4379  "Widening decision should be ready at this moment");
4380  if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4381  if (Ptr == Store->getValueOperand())
4382  return WideningDecision == CM_Scalarize;
4383  assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4384  "Ptr is neither a value nor a pointer operand");
4385  return WideningDecision != CM_GatherScatter;
4386  };
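  // Illustrative example (hypothetical, value names are placeholders): for a
  // store the cost model decided to scalarize (CM_Scalarize), e.g.
  //   store i32 %val, i32* %gep
  // both %val and %gep are scalar uses, whereas a CM_GatherScatter access
  // needs a vector of pointers, so its pointer operand is not a scalar use.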
4387 
4388  // A helper that returns true if the given value is a bitcast or
4389  // getelementptr instruction contained in the loop.
4390  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4391  return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4392  isa<GetElementPtrInst>(V)) &&
4393  !TheLoop->isLoopInvariant(V);
4394  };
4395 
4396  // A helper that evaluates a memory access's use of a pointer. If the use
4397  // will be a scalar use, and the pointer is only used by memory accesses, we
4398  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4399  // PossibleNonScalarPtrs.
4400  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4401  // We only care about bitcast and getelementptr instructions contained in
4402  // the loop.
4403  if (!isLoopVaryingBitCastOrGEP(Ptr))
4404  return;
4405 
4406  // If the pointer has already been identified as scalar (e.g., if it was
4407  // also identified as uniform), there's nothing to do.
4408  auto *I = cast<Instruction>(Ptr);
4409  if (Worklist.count(I))
4410  return;
4411 
4412  // If the use of the pointer will be a scalar use, and all users of the
4413  // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4414  // place the pointer in PossibleNonScalarPtrs.
4415  if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4416  return isa<LoadInst>(U) || isa<StoreInst>(U);
4417  }))
4418  ScalarPtrs.insert(I);
4419  else
4420  PossibleNonScalarPtrs.insert(I);
4421  };
4422 
4423  // We seed the scalars analysis with three classes of instructions: (1)
4424  // instructions marked uniform-after-vectorization, (2) bitcast and
4425  // getelementptr instructions used by memory accesses requiring a scalar use,
4426  // and (3) pointer induction variables and their update instructions (we
4427  // currently only scalarize these).
4428  //
4429  // (1) Add to the worklist all instructions that have been identified as
4430  // uniform-after-vectorization.
4431  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4432 
4433  // (2) Add to the worklist all bitcast and getelementptr instructions used by
4434  // memory accesses requiring a scalar use. The pointer operands of loads and
4435  // stores will be scalar as long as the memory access is not a gather or
4436  // scatter operation. The value operand of a store will remain scalar if the
4437  // store is scalarized.
4438  for (auto *BB : TheLoop->blocks())
4439  for (auto &I : *BB) {
4440  if (auto *Load = dyn_cast<LoadInst>(&I)) {
4441  evaluatePtrUse(Load, Load->getPointerOperand());
4442  } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4443  evaluatePtrUse(Store, Store->getPointerOperand());
4444  evaluatePtrUse(Store, Store->getValueOperand());
4445  }
4446  }
4447  for (auto *I : ScalarPtrs)
4448  if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4449  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4450  Worklist.insert(I);
4451  }
4452 
4453  // (3) Add to the worklist all pointer induction variables and their update
4454  // instructions.
4455  //
4456  // TODO: Once we are able to vectorize pointer induction variables we should
4457  // no longer insert them into the worklist here.
4458  auto *Latch = TheLoop->getLoopLatch();
4459  for (auto &Induction : *Legal->getInductionVars()) {
4460  auto *Ind = Induction.first;
4461  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4462  if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4463  continue;
4464  Worklist.insert(Ind);
4465  Worklist.insert(IndUpdate);
4466  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4467  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4468  << "\n");
4469  }
4470 
4471  // Insert the forced scalars.
4472  // FIXME: Currently widenPHIInstruction() often creates a dead vector
4473  // induction variable when the PHI user is scalarized.
4474  auto ForcedScalar = ForcedScalars.find(VF);
4475  if (ForcedScalar != ForcedScalars.end())
4476  for (auto *I : ForcedScalar->second)
4477  Worklist.insert(I);
4478 
4479  // Expand the worklist by looking through any bitcasts and getelementptr
4480  // instructions we've already identified as scalar. This is similar to the
4481  // expansion step in collectLoopUniforms(); however, here we're only
4482  // expanding to include additional bitcasts and getelementptr instructions.
4483  unsigned Idx = 0;
4484  while (Idx != Worklist.size()) {
4485  Instruction *Dst = Worklist[Idx++];
4486  if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4487  continue;
4488  auto *Src = cast<Instruction>(Dst->getOperand(0));
4489  if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4490  auto *J = cast<Instruction>(U);
4491  return !TheLoop->contains(J) || Worklist.count(J) ||
4492  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4493  isScalarUse(J, Src));
4494  })) {
4495  Worklist.insert(Src);
4496  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4497  }
4498  }
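  // Illustrative example (hypothetical): if a getelementptr is only used by a
  // load whose widening decision is CM_Scalarize, and by instructions already
  // in the worklist, the getelementptr itself is added as well, so the whole
  // address computation chain of a scalarized access stays scalar.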
4499 
4500  // An induction variable will remain scalar if all users of the induction
4501  // variable and induction variable update remain scalar.
4502  for (auto &Induction : *Legal->getInductionVars()) {
4503  auto *Ind = Induction.first;
4504  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4505 
4506  // We already considered pointer induction variables, so there's no reason
4507  // to look at their users again.
4508  //
4509  // TODO: Once we are able to vectorize pointer induction variables we
4510  // should no longer skip over them here.
4511  if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4512  continue;
4513 
4514  // Determine if all users of the induction variable are scalar after
4515  // vectorization.
4516  auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4517  auto *I = cast<Instruction>(U);
4518  return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4519  });
4520  if (!ScalarInd)
4521  continue;
4522 
4523  // Determine if all users of the induction variable update instruction are
4524  // scalar after vectorization.
4525  auto ScalarIndUpdate =
4526  llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4527  auto *I = cast<Instruction>(U);
4528  return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4529  });
4530  if (!ScalarIndUpdate)
4531  continue;
4532 
4533  // The induction variable and its update instruction will remain scalar.
4534  Worklist.insert(Ind);
4535  Worklist.insert(IndUpdate);
4536  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4537  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4538  << "\n");
4539  }
4540 
4541  Scalars[VF].insert(Worklist.begin(), Worklist.end());
4542 }
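// Illustrative example (hypothetical): in a loop such as
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 1;
// the induction variable i, its increment, and the getelementptrs feeding the
// consecutive load and store typically end up in Scalars[VF], since each
// unroll part only needs the scalar pointer of its first lane for the wide
// memory accesses.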
4543 
4544 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4545  if (!blockNeedsPredication(I->getParent()))
4546  return false;
4547  switch(I->getOpcode()) {
4548  default:
4549  break;
4550  case Instruction::Load:
4551  case Instruction::Store: {
4552  if (!Legal->isMaskRequired(I))
4553  return false;
4554  auto *Ptr = getLoadStorePointerOperand(I);
4555  auto *Ty = getMemInstValueType(I);
4556  // We have already decided how to vectorize this instruction, get that
4557  // result.
4558  if (VF > 1) {
4559  InstWidening WideningDecision = getWideningDecision(I, VF);
4560  assert(WideningDecision != CM_Unknown &&
4561  "Widening decision should be ready at this moment");
4562  return WideningDecision == CM_Scalarize;
4563  }
4564  const MaybeAlign Alignment = getLoadStoreAlignment(I);
4565  return isa<LoadInst>(I) ?
4566  !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty))
4567  : !(isLegalMaskedStore(Ty, Ptr, Alignment) || isLegalMaskedScatter(Ty));
4568  }
4569  case Instruction::UDiv:
4570  case Instruction::SDiv:
4571  case Instruction::SRem:
4572  case Instruction::URem:
4573  return mayDivideByZero(*I);
4574  }
4575  return false;
4576 }
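// Illustrative example (hypothetical): in
//   for (int i = 0; i < n; ++i)
//     if (c[i])
//       a[i] = 0;
// the store to a[i] executes conditionally. If the target has no legal masked
// store (or scatter) for the type, the store must be scalarized and
// predicated; likewise a udiv/sdiv/urem/srem that may divide by zero.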
4577 
4578 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4579  unsigned VF) {
4580  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4581  assert(getWideningDecision(I, VF) == CM_Unknown &&
4582  "Decision should not be set yet.");
4583  auto *Group = getInterleavedAccessGroup(I);
4584  assert(Group && "Must have a group.");
4585 
4586  // If the instruction's allocated size doesn't equal its type size, it
4587  // requires padding and will be scalarized.
4588  auto &DL = I->getModule()->getDataLayout();
4589  auto *ScalarTy = getMemInstValueType(I);
4590  if (hasIrregularType(ScalarTy, DL, VF))
4591  return false;
4592 
4593  // Check if masking is required.
4594  // A Group may need masking for one of two reasons: it resides in a block that
4595  // needs predication, or it was decided to use masking to deal with gaps.
4596  bool PredicatedAccessRequiresMasking =
4597  Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4598  bool AccessWithGapsRequiresMasking =
4599  Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4600  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4601  return true;
4602 
4603  // If masked interleaving is required, we expect that the user/target had
4604  // enabled it, because otherwise it either wouldn't have been created or
4605  // it should have been invalidated by the CostModel.
4606  assert(useMaskedInterleavedAccesses(TTI) &&
4607  "Masked interleave-groups for predicated accesses are not enabled.");
4608 
4609  auto *Ty = getMemInstValueType(I);
4610  const MaybeAlign Alignment = getLoadStoreAlignment(I);
4611  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4612  : TTI.isLegalMaskedStore(Ty, Alignment);
4613 }
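// Illustrative example (hypothetical): for
//   struct S { int x, y; };
//   for (int i = 0; i < n; ++i) sum += s[i].x;
// the loads of s[i].x form an interleave group with a gap (y is never read).
// Such a group either keeps a scalar epilogue or, when that is not allowed,
// must be emitted as a masked wide load, which is what the checks above verify.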
4614 
4615 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4616  unsigned VF) {
4617  // Get and ensure we have a valid memory instruction.
4618  LoadInst *LI = dyn_cast<LoadInst>(I);
4619  StoreInst *SI = dyn_cast<StoreInst>(I);
4620  assert((LI || SI) && "Invalid memory instruction");
4621 
4622  auto *Ptr = getLoadStorePointerOperand(I);
4623 
4624  // First of all, in order to be widened the pointer must be consecutive.
4625  if (!Legal->isConsecutivePtr(Ptr))
4626  return false;
4627 
4628  // If the instruction is a store located in a predicated block, it will be
4629  // scalarized.
4630  if (isScalarWithPredication(I))
4631  return false;
4632 
4633  // If the instruction's allocated size doesn't equal its type size, it
4634  // requires padding and will be scalarized.
4635  auto &DL = I->getModule()->getDataLayout();
4636  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4637  if (hasIrregularType(ScalarTy, DL, VF))
4638  return false;
4639 
4640  return true;
4641 }
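// Illustrative example (hypothetical): an access like a[i] is consecutive and
// can be widened into one wide load or store per unroll part, whereas a[b[i]]
// is not consecutive and is handled as a gather/scatter or scalarized instead.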
4642 
4643 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4644  // We should not collect Uniforms more than once per VF. Right now,
4645  // this function is called from collectUniformsAndScalars(), which
4646  // already does this check. Collecting Uniforms for VF=1 does not make any
4647  // sense.
4648 
4649  assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4650  "This function should not be visited twice for the same VF");
4651 
4652  // Visit the list of Uniforms. If we do not find any uniform value, we will
4653  // not analyze it again; Uniforms.count(VF) will still return 1.
4654  Uniforms[VF].clear();
4655 
4656  // We now know that the loop is vectorizable!
4657  // Collect instructions inside the loop that will remain uniform after
4658  // vectorization.
4659 
4660  // Global values, params and instructions outside of current loop are out of
4661  // scope.
4662  auto isOutOfScope = [&](Value *V) -> bool {
4663  Instruction *I = dyn_cast<Instruction>(V);
4664  return (!I || !TheLoop->contains(I));
4665  };
4666 
4667  SetVector<Instruction *> Worklist;
4668  BasicBlock *Latch = TheLoop->getLoopLatch();
4669 
4670  // Start with the conditional branch. If the branch condition is an
4671  // instruction contained in the loop that is only used by the branch, it is
4672  // uniform.
4673  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4674  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4675  Worklist.insert(Cmp);
4676  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4677  }
4678 
4679  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4680  // are pointers that are treated like consecutive pointers during
4681  // vectorization. The pointer operands of interleaved accesses are an
4682  // example.
4683  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4684 
4685  // Holds pointer operands of instructions that are possibly non-uniform.
4686  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4687 
4688  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4689  InstWidening WideningDecision = getWideningDecision(I, VF);
4690  assert(WideningDecision != CM_Unknown &&
4691  "Widening decision should be ready at this moment");
4692 
4693  return (WideningDecision == CM_Widen ||
4694  WideningDecision == CM_Widen_Reverse ||
4695  WideningDecision == CM_Interleave);
4696  };
4697  // Iterate over the instructions in the loop, and collect all
4698  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4699  // that a consecutive-like pointer operand will be scalarized, we collect it
4700  // in PossibleNonUniformPtrs instead. We use two sets here because a single
4701  // getelementptr instruction can be used by both vectorized and scalarized
4702  // memory instructions. For example, if a loop loads and stores from the same
4703  // location, but the store is conditional, the store will be scalarized, and
4704  // the getelementptr won't remain uniform.
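  // Illustrative example (hypothetical):
  //   for (int i = 0; i < n; ++i) {
  //     int t = a[i];        // widened load, pointer used uniformly
  //     if (t) a[i] = 0;     // conditional store, may be scalarized
  //   }
  // The same getelementptr feeds both accesses, so it is placed in
  // PossibleNonUniformPtrs and will not be treated as uniform.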
4705  for (auto *BB : TheLoop->blocks())
4706  for (auto &I : *BB) {
4707  // If there's no pointer operand, there's nothing to do.
4708  auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4709  if (!Ptr)
4710  continue;
4711 
4712  // True if all users of Ptr are memory accesses that have Ptr as their
4713  // pointer operand.
4714  auto UsersAreMemAccesses =
4715  llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4716  return getLoadStorePointerOperand(U) == Ptr;
4717  });
4718 
4719  // Ensure the memory instruction will not be scalarized or used by
4720  // gather/scatter, making its pointer operand non-uniform. If the pointer
4721  // operand is used by any instruction other than a memory access, we
4722  // conservatively assume the pointer operand may be non-uniform.
4723  if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4724  PossibleNonUniformPtrs.insert(Ptr);
4725 
4726  // If the memory instruction will be vectorized and its pointer operand
4727  // is consecutive-like, or interleaving - the pointer operand should
4728  // remain uniform.
4729  else
4730  ConsecutiveLikePtrs.insert(Ptr);
4731  }
4732 
4733  // Add to the Worklist all consecutive and consecutive-like pointers that
4734  // aren't also identified as possibly non-uniform.
4735  for (auto *V : ConsecutiveLikePtrs)
4736  if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4737  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4738  Worklist.insert(V);
4739  }
4740 
4741  // Expand Worklist in topological order: whenever a new instruction is
4742  // added, its users should already be inside Worklist. This ensures that a
4743  // uniform instruction is only used by other uniform instructions.
4744  unsigned idx = 0;
4745  while (idx != Worklist.size()) {
4746  Instruction *I = Worklist[idx++];
4747 
4748  for (auto OV : I->operand_values()) {
4749  // isOutOfScope operands cannot be uniform instructions.
4750  if (isOutOfScope(OV))
4751  continue;
4752  // First-order recurrence PHIs should typically be considered
4753  // non-uniform.
4754  auto *OP = dyn_cast<PHINode>(OV);
4755  if (OP && Legal->isFirstOrderRecurrence(OP))
4756  continue;
4757  // If all the users of the operand are uniform, then add the
4758  // operand into the uniform worklist.
4759  auto *OI = cast<Instruction>(OV);
4760  if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4761  auto *J = cast<Instruction>(U);
4762  return Worklist.count(J) ||
4763  (OI == getLoadStorePointerOperand(J) &&
4764  isUniformDecision(J, VF));
4765  })) {
4766  Worklist.insert(OI);
4767  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4768  }
4769  }
4770  }
4771 
4772  // Returns true if Ptr is the pointer operand of a memory access instruction
4773  // I, and I is known to not require scalarization.
4774  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4775  return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4776  };
4777 
4778  // For an instruction to be added into Worklist above, all its users inside
4779  // the loop should also be in Worklist. However, this condition cannot be
4780  // true for phi nodes that form a cyclic dependence. We must process phi
4781  // nodes separately. An induction variable will remain uniform if all users
4782  // of the induction variable and induction variable update remain uniform.
4783  // The code below handles both pointer and non-pointer induction variables.
4784  for (auto &Induction : *Legal->getInductionVars()) {
4785  auto *Ind = Induction.first;
4786  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4787 
4788  // Determine if all users of the induction variable are uniform after
4789  // vectorization.
4790  auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4791  auto *I = cast<Instruction>(U);
4792  return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4793  isVectorizedMemAccessUse(I, Ind);
4794  });
4795  if (!UniformInd)
4796  continue;
4797 
4798  // Determine if all users of the induction variable update instruction are
4799  // uniform after vectorization.
4800  auto UniformIndUpdate =
4801  llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4802  auto *I = cast<Instruction>(U);
4803  return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4804  isVectorizedMemAccessUse(I, IndUpdate);
4805  });
4806  if (!UniformIndUpdate)
4807  continue;
4808 
4809  // The induction variable and its update instruction will remain uniform.
4810  Worklist.insert(Ind);
4811  Worklist.insert(IndUpdate);
4812  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4813  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4814  << "\n");
4815  }
4816 
4817  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4818 }
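// Illustrative example (hypothetical): in
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i];
// the induction variable i and its increment are used only by the pointer
// computations of widened consecutive accesses and by the latch compare, so
// both remain uniform: each unroll part derives its lanes from one scalar i.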
4819 
4820 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4821  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4822 
4823  if (Legal->getRuntimePointerChecking()->Need) {
4824  reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4825  "runtime pointer checks needed. Enable vectorization of this "
4826  "loop with '#pragma clang loop vectorize(enable)' when "
4827  "compiling with -Os/-Oz",
4828  "CantVersionLoopWithOptForSize", ORE, TheLoop);
4829  return true;
4830  }
4831 
4832  if (!PSE.getUnionPredicate().getPredicates().empty()) {
4833  reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4834  "runtime SCEV checks needed. Enable vectorization of this "
4835  "loop with '#pragma clang loop vectorize(enable)' when "
4836  "compiling with -Os/-Oz",
4837  "CantVersionLoopWithOptForSize", ORE, TheLoop);
4838  return true;
4839  }
4840 
4841  // FIXME: Avoid specializing for stride==1 instead of bailing out.
4842  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4843  reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4844  "runtime stride == 1 checks needed. Enable vectorization of "
4845  "this loop with '#pragma clang loop vectorize(enable)' when "
4846  "compiling with -Os/-Oz",
4847  "CantVersionLoopWithOptForSize", ORE, TheLoop);
4848  return true;
4849  }
4850 
4851  return false;
4852 }
4853 
4854 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4855  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4856  // TODO: It may be useful to do this, since the check is still likely to be
4857  // dynamically uniform if the target can skip it.
4858  reportVectorizationFailure(
4859  "Not inserting runtime ptr check for divergent target",
4860  "runtime pointer checks needed. Not enabled for divergent target",
4861  "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4862  return None;
4863  }
4864 
4865  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4866  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4867  if (TC == 1) {
4868  reportVectorizationFailure("Single iteration (non) loop",
4869  "loop trip count is one, irrelevant for vectorization",
4870  "SingleIterationLoop", ORE, TheLoop);
4871  return None;
4872  }
4873 
4874  switch (ScalarEpilogueStatus) {
4875  case CM_ScalarEpilogueAllowed:
4876  return computeFeasibleMaxVF(TC);
4877  case CM_ScalarEpilogueNotNeededUsePredicate:
4878  LLVM_DEBUG(
4879  dbgs() << "LV: vector predicate hint/switch found.\n"
4880  << "LV: Not allowing scalar epilogue, creating predicated "
4881  << "vector loop.\n");
4882  break;
4883  case CM_ScalarEpilogueNotAllowedLowTripLoop:
4884  // fallthrough as a special case of OptForSize
4885  case CM_ScalarEpilogueNotAllowedOptSize:
4886  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4887  LLVM_DEBUG(
4888  dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4889  else
4890  LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4891  << "count.\n");
4892 
4893  // Bail if runtime checks are required, which are not good when optimising
4894  // for size.
4895  if (runtimeChecksRequired())
4896  return None;
4897  break;
4898  }
4899 
4900  // Now try to fold the loop tail by masking.
4901 
4902  // Invalidate interleave groups that require an epilogue if we can't mask
4903  // the interleave-group.
4904  if (!useMaskedInterleavedAccesses(TTI))
4905  InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4906 
4907  unsigned MaxVF = computeFeasibleMaxVF(TC);
4908  if (TC > 0 && TC % MaxVF == 0) {
4909  // Accept MaxVF if we do not have a tail.
4910  LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4911  return MaxVF;
4912  }
4913 
4914  // If we don't know the precise trip count, or if the trip count that we
4915  // found modulo the vectorization factor is not zero, try to fold the tail
4916  // by masking.
4917  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4918  if (Legal->prepareToFoldTailByMasking()) {
4919  FoldTailByMasking = true;
4920  return MaxVF;
4921  }
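  // Illustrative example (hypothetical): with TC = 100 and MaxVF = 8 there
  // would be 100 % 8 = 4 leftover iterations. When the tail is folded by
  // masking, the vector body runs ceil(100 / 8) = 13 iterations and the last
  // iteration masks off the 4 lanes beyond the trip count instead of running
  // a scalar epilogue.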
4922 
4923  if (TC == 0) {
4924  reportVectorizationFailure(
4925  "Unable to calculate the loop count due to complex control flow",
4926  "unable to calculate the loop count due to complex control flow",
4927  "UnknownLoopCountComplexCFG", ORE, TheLoop);
4928  return None;
4929  }
4930 
4931  reportVectorizationFailure(
4932  "Cannot optimize for size and vectorize at the same time.",
4933  "cannot optimize for size and vectorize at the same time. "
4934  "Enable vectorization of this loop with '#pragma clang loop "
4935  "vectorize(enable)' when compiling with -Os/-Oz",
4936  "NoTailLoopWithOptForSize", ORE, TheLoop);
4937  return None;
4938 }