LLVM  4.0.0
LoopVectorize.cpp
1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
11 // and generates target-independent LLVM-IR.
12 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
13 // of instructions in order to estimate the profitability of vectorization.
14 //
15 // The loop vectorizer combines consecutive loop iterations into a single
16 // 'wide' iteration. After this transformation the index is incremented
17 // by the SIMD vector width, and not by one (see the sketch after this header).
18 //
19 // This pass has four parts:
20 // 1. The main loop pass that drives the different parts.
21 // 2. LoopVectorizationLegality - A unit that checks for the legality
22 // of the vectorization.
23 // 3. InnerLoopVectorizer - A unit that performs the actual
24 // widening of instructions.
25 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
26 // of vectorization. It decides on the optimal vector width, which
27 // can be one, if vectorization is not profitable.
28 //
29 //===----------------------------------------------------------------------===//
30 //
31 // The reduction-variable vectorization is based on the paper:
32 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
33 //
34 // Variable uniformity checks are inspired by:
35 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
36 //
37 // The interleaved access vectorization is based on the paper:
38 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
39 // Data for SIMD
40 //
41 // Other ideas/concepts are from:
42 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
43 //
44 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
45 // Vectorizing Compilers.
46 //
47 //===----------------------------------------------------------------------===//
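//
// A minimal sketch of the transformation described above, in illustrative
// C-like pseudocode (the pass itself rewrites LLVM IR). With VF = 4 and no
// interleaving, a loop such as
//
//   for (int i = 0; i < n; ++i)
//     C[i] = A[i] + B[i];
//
// is conceptually turned into a wide main loop plus a scalar epilogue:
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4)           // one 'wide' iteration covers 4 scalar ones
//     C[i..i+3] = A[i..i+3] + B[i..i+3]; // done with <4 x ...> vector instructions
//   for (; i < n; ++i)                   // epilogue for the remaining iterations
//     C[i] = A[i] + B[i];
//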
48 
50 #include "llvm/ADT/DenseMap.h"
51 #include "llvm/ADT/Hashing.h"
52 #include "llvm/ADT/MapVector.h"
53 #include "llvm/ADT/SCCIterator.h"
54 #include "llvm/ADT/SetVector.h"
55 #include "llvm/ADT/SmallPtrSet.h"
56 #include "llvm/ADT/SmallSet.h"
57 #include "llvm/ADT/SmallVector.h"
58 #include "llvm/ADT/Statistic.h"
59 #include "llvm/ADT/StringExtras.h"
62 #include "llvm/Analysis/LoopInfo.h"
64 #include "llvm/Analysis/LoopPass.h"
69 #include "llvm/IR/Constants.h"
70 #include "llvm/IR/DataLayout.h"
71 #include "llvm/IR/DebugInfo.h"
72 #include "llvm/IR/DerivedTypes.h"
73 #include "llvm/IR/DiagnosticInfo.h"
74 #include "llvm/IR/Dominators.h"
75 #include "llvm/IR/Function.h"
76 #include "llvm/IR/IRBuilder.h"
77 #include "llvm/IR/Instructions.h"
78 #include "llvm/IR/IntrinsicInst.h"
79 #include "llvm/IR/LLVMContext.h"
80 #include "llvm/IR/Module.h"
81 #include "llvm/IR/PatternMatch.h"
82 #include "llvm/IR/Type.h"
83 #include "llvm/IR/User.h"
84 #include "llvm/IR/Value.h"
85 #include "llvm/IR/ValueHandle.h"
86 #include "llvm/IR/Verifier.h"
87 #include "llvm/Pass.h"
90 #include "llvm/Support/Debug.h"
92 #include "llvm/Transforms/Scalar.h"
98 #include <algorithm>
99 #include <map>
100 #include <tuple>
101 
102 using namespace llvm;
103 using namespace llvm::PatternMatch;
104 
105 #define LV_NAME "loop-vectorize"
106 #define DEBUG_TYPE LV_NAME
107 
108 STATISTIC(LoopsVectorized, "Number of loops vectorized");
109 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
110 
111 static cl::opt<bool>
112  EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
113  cl::desc("Enable if-conversion during vectorization."));
114 
115 /// We don't vectorize loops with a known constant trip count below this number.
116 static cl::opt<unsigned> TinyTripCountVectorThreshold(
117  "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
118  cl::desc("Don't vectorize loops with a constant "
119  "trip count that is smaller than this "
120  "value."));
121 
122 static cl::opt<bool> MaximizeBandwidth(
123  "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
124  cl::desc("Maximize bandwidth when selecting vectorization factor which "
125  "will be determined by the smallest type in loop."));
126 
127 static cl::opt<bool> EnableInterleavedMemAccesses(
128  "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
129  cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
130 
131 /// Maximum factor for an interleaved memory access.
132 static cl::opt<unsigned> MaxInterleaveGroupFactor(
133  "max-interleave-group-factor", cl::Hidden,
134  cl::desc("Maximum factor for an interleaved access group (default = 8)"),
135  cl::init(8));
136 
137 /// We don't interleave loops with a known constant trip count below this
138 /// number.
139 static const unsigned TinyTripCountInterleaveThreshold = 128;
140 
141 static cl::opt<unsigned> ForceTargetNumScalarRegs(
142  "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
143  cl::desc("A flag that overrides the target's number of scalar registers."));
144 
145 static cl::opt<unsigned> ForceTargetNumVectorRegs(
146  "force-target-num-vector-regs", cl::init(0), cl::Hidden,
147  cl::desc("A flag that overrides the target's number of vector registers."));
148 
149 /// Maximum vectorization interleave count.
150 static const unsigned MaxInterleaveFactor = 16;
151 
152 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
153  "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
154  cl::desc("A flag that overrides the target's max interleave factor for "
155  "scalar loops."));
156 
157 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
158  "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
159  cl::desc("A flag that overrides the target's max interleave factor for "
160  "vectorized loops."));
161 
162 static cl::opt<unsigned> ForceTargetInstructionCost(
163  "force-target-instruction-cost", cl::init(0), cl::Hidden,
164  cl::desc("A flag that overrides the target's expected cost for "
165  "an instruction to a single constant value. Mostly "
166  "useful for getting consistent testing."));
167 
168 static cl::opt<unsigned> SmallLoopCost(
169  "small-loop-cost", cl::init(20), cl::Hidden,
170  cl::desc(
171  "The cost of a loop that is considered 'small' by the interleaver."));
172 
173 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
174  "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
175  cl::desc("Enable the use of the block frequency analysis to access PGO "
176  "heuristics minimizing code growth in cold regions and being more "
177  "aggressive in hot regions."));
178 
179 // Runtime interleave loops for load/store throughput.
180 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
181  "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
182  cl::desc(
183  "Enable runtime interleaving until load/store ports are saturated"));
184 
185 /// The number of stores in a loop that are allowed to need predication.
186 static cl::opt<unsigned> NumberOfStoresToPredicate(
187  "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
188  cl::desc("Max number of stores to be predicated behind an if."));
189 
190 static cl::opt<bool> EnableIndVarRegisterHeur(
191  "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
192  cl::desc("Count the induction variable only once when interleaving"));
193 
194 static cl::opt<bool> EnableCondStoresVectorization(
195  "enable-cond-stores-vec", cl::init(true), cl::Hidden,
196  cl::desc("Enable if predication of stores during vectorization."));
197 
198 static cl::opt<unsigned> MaxNestedScalarReductionIC(
199  "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
200  cl::desc("The maximum interleave count to use when interleaving a scalar "
201  "reduction in a nested loop."));
202 
203 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
204  "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
205  cl::desc("The maximum allowed number of runtime memory checks with a "
206  "vectorize(enable) pragma."));
207 
208 static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
209  "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
210  cl::desc("The maximum number of SCEV checks allowed."));
211 
212 static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
213  "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
214  cl::desc("The maximum number of SCEV checks allowed with a "
215  "vectorize(enable) pragma"));
216 
217 /// Create an analysis remark that explains why vectorization failed
218 ///
219 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
220 /// RemarkName is the identifier for the remark. If \p I is passed it is an
221 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for
222 /// the location of the remark. \return the remark object that can be
223 /// streamed to.
224 static OptimizationRemarkAnalysis
225 createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
226  Instruction *I = nullptr) {
227  Value *CodeRegion = TheLoop->getHeader();
228  DebugLoc DL = TheLoop->getStartLoc();
229 
230  if (I) {
231  CodeRegion = I->getParent();
232  // If there is no debug location attached to the instruction, revert back to
233  // using the loop's.
234  if (I->getDebugLoc())
235  DL = I->getDebugLoc();
236  }
237 
238  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
239  R << "loop not vectorized: ";
240  return R;
241 }
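// A typical use of the helper above (the remark identifier and message here
// are hypothetical examples): additional detail is streamed into the returned
// remark before it is handed to the OptimizationRemarkEmitter, e.g.
//
//   ORE->emit(createMissedAnalysis(LV_NAME, "CantComputeTripCount", TheLoop)
//             << "could not determine number of loop iterations");
//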
242 
243 namespace {
244 
245 // Forward declarations.
246 class LoopVectorizeHints;
247 class LoopVectorizationLegality;
248 class LoopVectorizationCostModel;
249 class LoopVectorizationRequirements;
250 
251 /// Returns true if the given loop body has a cycle, excluding the loop
252 /// itself.
253 static bool hasCyclesInLoopBody(const Loop &L) {
254  if (!L.empty())
255  return true;
256 
257  for (const auto &SCC :
258  make_range(scc_iterator<Loop, LoopBodyTraits>::begin(L),
259  scc_iterator<Loop, LoopBodyTraits>::end(L))) {
260  if (SCC.size() > 1) {
261  DEBUG(dbgs() << "LVL: Detected a cycle in the loop body:\n");
262  DEBUG(L.dump());
263  return true;
264  }
265  }
266  return false;
267 }
268 
269 /// \brief This modifies LoopAccessReport to initialize the message with the
270 /// loop-vectorizer-specific part.
271 class VectorizationReport : public LoopAccessReport {
272 public:
273  VectorizationReport(Instruction *I = nullptr)
274  : LoopAccessReport("loop not vectorized: ", I) {}
275 
276  /// \brief This allows promotion of the loop-access analysis report into the
277  /// loop-vectorizer report. It modifies the message to add the
278  /// loop-vectorizer-specific part of the message.
279  explicit VectorizationReport(const LoopAccessReport &R)
280  : LoopAccessReport(Twine("loop not vectorized: ") + R.str(),
281  R.getInstr()) {}
282 };
283 
284 /// A helper function for converting Scalar types to vector types.
285 /// If the incoming type is void, we return void. If the VF is 1, we return
286 /// the scalar type.
287 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
288  if (Scalar->isVoidTy() || VF == 1)
289  return Scalar;
290  return VectorType::get(Scalar, VF);
291 }
292 
293 /// A helper function that returns GEP instruction and knows to skip a
294 /// 'bitcast'. The 'bitcast' may be skipped if the source and the destination
295 /// pointee types of the 'bitcast' have the same size.
296 /// For example:
297 /// bitcast double** %var to i64* - can be skipped
298 /// bitcast double** %var to i8* - can not
299 static GetElementPtrInst *getGEPInstruction(Value *Ptr) {
300 
301  if (isa<GetElementPtrInst>(Ptr))
302  return cast<GetElementPtrInst>(Ptr);
303 
304  if (isa<BitCastInst>(Ptr) &&
305  isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) {
306  Type *BitcastTy = Ptr->getType();
307  Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy();
308  if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy))
309  return nullptr;
310  Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType();
311  Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType();
312  const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout();
313  if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty))
314  return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0));
315  }
316  return nullptr;
317 }
318 
319 /// A helper function that returns the pointer operand of a load or store
320 /// instruction.
321 static Value *getPointerOperand(Value *I) {
322  if (auto *LI = dyn_cast<LoadInst>(I))
323  return LI->getPointerOperand();
324  if (auto *SI = dyn_cast<StoreInst>(I))
325  return SI->getPointerOperand();
326  return nullptr;
327 }
328 
329 /// A helper function that returns true if the given type is irregular. The
330 /// type is irregular if its allocated size doesn't equal the store size of an
331 /// element of the corresponding vector type at the given vectorization factor.
332 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
333 
334  // Determine if an array of VF elements of type Ty is "bitcast compatible"
335  // with a <VF x Ty> vector.
336  if (VF > 1) {
337  auto *VectorTy = VectorType::get(Ty, VF);
338  return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
339  }
340 
341  // If the vectorization factor is one, we just check if an array of type Ty
342  // requires padding between elements.
343  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
344 }
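// For example, assuming a typical data layout, Ty = i1 is irregular at VF = 8:
// 8 * alloc-size(i1) = 8 bytes, while store-size(<8 x i1>) = 1 byte, so an
// array of 8 i1 values is not layout-compatible with an <8 x i1> vector.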
345 
346 /// A helper function that returns the reciprocal of the block probability of
347 /// predicated blocks. If we return X, we are assuming the predicated block
348 /// will execute once for every X iterations of the loop header.
349 ///
350 /// TODO: We should use actual block probability here, if available. Currently,
351 /// we always assume predicated blocks have a 50% chance of executing.
352 static unsigned getReciprocalPredBlockProb() { return 2; }
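// Roughly speaking, the cost model divides the cost of a predicated block by
// this value, so returning 2 charges such a block as if it executed on every
// other iteration of the loop.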
353 
354 /// InnerLoopVectorizer vectorizes loops which contain only one basic
355 /// block to a specified vectorization factor (VF).
356 /// This class performs the widening of scalars into vectors, or multiple
357 /// scalars. This class also implements the following features:
358 /// * It inserts an epilogue loop for handling loops that don't have iteration
359 /// counts that are known to be a multiple of the vectorization factor.
360 /// * It handles the code generation for reduction variables.
361 /// * Scalarization (implementation using scalars) of un-vectorizable
362 /// instructions.
363 /// InnerLoopVectorizer does not perform any vectorization-legality
364 /// checks, and relies on the caller to check for the different legality
365 /// aspects. The InnerLoopVectorizer relies on the
366 /// LoopVectorizationLegality class to provide information about the induction
367 /// and reduction variables that the legality analysis found.
368 class InnerLoopVectorizer {
369 public:
370  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
371  LoopInfo *LI, DominatorTree *DT,
372  const TargetLibraryInfo *TLI,
373  const TargetTransformInfo *TTI, AssumptionCache *AC,
374  OptimizationRemarkEmitter *ORE, unsigned VecWidth,
375  unsigned UnrollFactor, LoopVectorizationLegality *LVL,
376  LoopVectorizationCostModel *CM)
377  : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
378  AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
379  Builder(PSE.getSE()->getContext()), Induction(nullptr),
380  OldInduction(nullptr), VectorLoopValueMap(UnrollFactor, VecWidth),
381  TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM),
382  AddedSafetyChecks(false) {}
383 
384  // Perform the actual loop widening (vectorization).
385  void vectorize() {
386  // Create a new empty loop. Unlink the old loop and connect the new one.
387  createEmptyLoop();
388  // Widen each instruction in the old loop to a new one in the new loop.
389  vectorizeLoop();
390  }
391 
392  // Return true if any runtime check is added.
393  bool areSafetyChecksAdded() { return AddedSafetyChecks; }
394 
395  virtual ~InnerLoopVectorizer() {}
396 
397 protected:
398  /// A small list of PHINodes.
399  typedef SmallVector<PHINode *, 4> PhiVector;
400 
401  /// A type for vectorized values in the new loop. Each value from the
402  /// original loop, when vectorized, is represented by UF vector values in the
403  /// new unrolled loop, where UF is the unroll factor.
404  typedef SmallVector<Value *, 2> VectorParts;
405 
406  /// A type for scalarized values in the new loop. Each value from the
407  /// original loop, when scalarized, is represented by UF x VF scalar values
408  /// in the new unrolled loop, where UF is the unroll factor and VF is the
409  /// vectorization factor.
410  typedef SmallVector<SmallVector<Value *, 4>, 2> ScalarParts;
411 
412  // When we if-convert we need to create edge masks. We have to cache values
413  // so that we don't end up with exponential recursion/IR.
414  typedef DenseMap<std::pair<BasicBlock *, BasicBlock *>, VectorParts>
415  EdgeMaskCache;
416 
417  /// Create an empty loop, based on the loop ranges of the old loop.
418  void createEmptyLoop();
419 
420  /// Set up the values of the IVs correctly when exiting the vector loop.
421  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
422  Value *CountRoundDown, Value *EndValue,
423  BasicBlock *MiddleBlock);
424 
425  /// Create a new induction variable inside L.
426  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
427  Value *Step, Instruction *DL);
428  /// Copy and widen the instructions from the old loop.
429  virtual void vectorizeLoop();
430 
431  /// Fix a first-order recurrence. This is the second phase of vectorizing
432  /// this phi node.
433  void fixFirstOrderRecurrence(PHINode *Phi);
434 
435  /// \brief The Loop exit block may have single value PHI nodes where the
436  /// incoming value is 'Undef'. While vectorizing we only handled real values
437  /// that were defined inside the loop. Here we fix the 'undef case'.
438  /// See PR14725.
439  void fixLCSSAPHIs();
440 
441  /// Iteratively sink the scalarized operands of a predicated instruction into
442  /// the block that was created for it.
443  void sinkScalarOperands(Instruction *PredInst);
444 
445  /// Predicate conditional instructions that require predication on their
446  /// respective conditions.
447  void predicateInstructions();
448 
449  /// Collect the instructions from the original loop that would be trivially
450  /// dead in the vectorized loop if generated.
451  void collectTriviallyDeadInstructions();
452 
453  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
454  /// represented as.
455  void truncateToMinimalBitwidths();
456 
457  /// A helper function that computes the predicate of the block BB, assuming
458  /// that the header block of the loop is set to True. It returns the *entry*
459  /// mask for the block BB.
460  VectorParts createBlockInMask(BasicBlock *BB);
461  /// A helper function that computes the predicate of the edge between SRC
462  /// and DST.
463  VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
464 
465  /// A helper function to vectorize a single BB within the innermost loop.
466  void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);
467 
468  /// Vectorize a single PHINode in a block. This method handles the induction
469  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
470  /// arbitrary length vectors.
471  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF,
472  PhiVector *PV);
473 
474  /// Insert the new loop to the loop hierarchy and pass manager
475  /// and update the analysis passes.
476  void updateAnalysis();
477 
478  /// This instruction is un-vectorizable. Implement it as a sequence
479  /// of scalars. If \p IfPredicateInstr is true we need to 'hide' each
480  /// scalarized instruction behind an if block predicated on the control
481  /// dependence of the instruction.
482  virtual void scalarizeInstruction(Instruction *Instr,
483  bool IfPredicateInstr = false);
484 
485  /// Vectorize Load and Store instructions,
486  virtual void vectorizeMemoryInstruction(Instruction *Instr);
487 
488  /// Create a broadcast instruction. This method generates a broadcast
489  /// instruction (shuffle) for loop invariant values and for the induction
490  /// value. If this is the induction variable then we extend it to N, N+1, ...
491  /// this is needed because each iteration in the loop corresponds to a SIMD
492  /// element.
493  virtual Value *getBroadcastInstrs(Value *V);
494 
495  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
496 /// to each vector element of Val. The sequence starts at StartIdx.
497 /// \p Opcode is relevant for FP induction variables.
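 /// For example (a sketch), with VF = 4, StartIdx = 0, and an integer step %s,
 /// a splat input <%x, %x, %x, %x> becomes <%x, %x + %s, %x + 2*%s, %x + 3*%s>;
 /// the FP case combines the elements with \p Opcode instead of integer adds.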
498  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
499  Instruction::BinaryOps Opcode =
500  Instruction::BinaryOpsEnd);
501 
502  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
503  /// variable on which to base the steps, \p Step is the size of the step, and
504  /// \p EntryVal is the value from the original loop that maps to the steps.
505  /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
506  /// can be a truncate instruction).
507  void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal);
508 
509  /// Create a vector induction phi node based on an existing scalar one. This
510  /// currently only works for integer induction variables with a constant
511  /// step. \p EntryVal is the value from the original loop that maps to the
512  /// vector phi node. If \p EntryVal is a truncate instruction, instead of
513  /// widening the original IV, we widen a version of the IV truncated to \p
514  /// EntryVal's type.
515  void createVectorIntInductionPHI(const InductionDescriptor &II,
516  Instruction *EntryVal);
517 
518  /// Widen an integer induction variable \p IV. If \p Trunc is provided, the
519  /// induction variable will first be truncated to the corresponding type.
520  void widenIntInduction(PHINode *IV, TruncInst *Trunc = nullptr);
521 
522  /// Returns true if an instruction \p I should be scalarized instead of
523  /// vectorized for the chosen vectorization factor.
524  bool shouldScalarizeInstruction(Instruction *I) const;
525 
526  /// Returns true if we should generate a scalar version of \p IV.
527  bool needsScalarInduction(Instruction *IV) const;
528 
529  /// Return a constant reference to the VectorParts corresponding to \p V from
530  /// the original loop. If the value has already been vectorized, the
531  /// corresponding vector entry in VectorLoopValueMap is returned. If,
532  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
533  /// new vector values on-demand by inserting the scalar values into vectors
534  /// with an insertelement sequence. If the value has been neither vectorized
535  /// nor scalarized, it must be loop invariant, so we simply broadcast the
536  /// value into vectors.
537  const VectorParts &getVectorValue(Value *V);
538 
539  /// Return a value in the new loop corresponding to \p V from the original
540  /// loop at unroll index \p Part and vector index \p Lane. If the value has
541  /// been vectorized but not scalarized, the necessary extractelement
542  /// instruction will be generated.
543  Value *getScalarValue(Value *V, unsigned Part, unsigned Lane);
544 
545  /// Try to vectorize the interleaved access group that \p Instr belongs to.
546  void vectorizeInterleaveGroup(Instruction *Instr);
547 
548  /// Generate a shuffle sequence that will reverse the vector Vec.
549  virtual Value *reverseVector(Value *Vec);
550 
551  /// Returns (and creates if needed) the original loop trip count.
552  Value *getOrCreateTripCount(Loop *NewLoop);
553 
554  /// Returns (and creates if needed) the trip count of the widened loop.
555  Value *getOrCreateVectorTripCount(Loop *NewLoop);
556 
557  /// Emit a bypass check to see if the trip count would overflow, or we
558  /// wouldn't have enough iterations to execute one vector loop.
559  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
560  /// Emit a bypass check to see if the vector trip count is nonzero.
561  void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass);
562  /// Emit a bypass check to see if all of the SCEV assumptions we've
563  /// had to make are correct.
564  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
565  /// Emit bypass checks to check any memory assumptions we may have made.
566  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
567 
568  /// Add additional metadata to \p To that was not present on \p Orig.
569  ///
570  /// Currently this is used to add the noalias annotations based on the
571  /// inserted memchecks. Use this for instructions that are *cloned* into the
572  /// vector loop.
573  void addNewMetadata(Instruction *To, const Instruction *Orig);
574 
575  /// Add metadata from one instruction to another.
576  ///
577  /// This includes both the original MDs from \p From and additional ones (\see
578  /// addNewMetadata). Use this for *newly created* instructions in the vector
579  /// loop.
580  void addMetadata(Instruction *To, Instruction *From);
581 
582  /// \brief Similar to the previous function but it adds the metadata to a
583  /// vector of instructions.
584  void addMetadata(ArrayRef<Value *> To, Instruction *From);
585 
586  /// This is a helper class for maintaining vectorization state. It's used for
587  /// mapping values from the original loop to their corresponding values in
588  /// the new loop. Two mappings are maintained: one for vectorized values and
589  /// one for scalarized values. Vectorized values are represented with UF
590  /// vector values in the new loop, and scalarized values are represented with
591  /// UF x VF scalar values in the new loop. UF and VF are the unroll and
592  /// vectorization factors, respectively.
593  ///
594  /// Entries can be added to either map with initVector and initScalar, which
595  /// initialize and return a constant reference to the new entry. If a
596  /// non-constant reference to a vector entry is required, getVector can be
597  /// used to retrieve a mutable entry. We currently directly modify the mapped
598  /// values during "fix-up" operations that occur once the first phase of
599  /// widening is complete. These operations include type truncation and the
600  /// second phase of recurrence widening.
601  ///
602  /// Otherwise, entries from either map should be accessed using the
603  /// getVectorValue or getScalarValue functions from InnerLoopVectorizer.
604  /// getVectorValue and getScalarValue coordinate to generate a vector or
605  /// scalar value on-demand if one is not yet available. When vectorizing a
606  /// loop, we visit the definition of an instruction before its uses. When
607  /// visiting the definition, we either vectorize or scalarize the
608  /// instruction, creating an entry for it in the corresponding map. (In some
609  /// cases, such as induction variables, we will create both vector and scalar
610  /// entries.) Then, as we encounter uses of the definition, we derive values
611  /// for each scalar or vector use unless such a value is already available.
612  /// For example, if we scalarize a definition and one of its uses is vector,
613  /// we build the required vector on-demand with an insertelement sequence
614  /// when visiting the use. Otherwise, if the use is scalar, we can use the
615  /// existing scalar definition.
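 /// For example (a sketch, not actual generated code), with UF = 2 and VF = 4,
 /// a scalarized definition is stored as 2 groups of 4 scalar values. If some
 /// later use needs the value as a vector, getVectorValue assembles, for each
 /// unroll part, a 4-lane vector from those scalars with an insertelement
 /// sequence and records it in the vector map so the work is done only once.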
616  struct ValueMap {
617 
618  /// Construct an empty map with the given unroll and vectorization factors.
619  ValueMap(unsigned UnrollFactor, unsigned VecWidth)
620  : UF(UnrollFactor), VF(VecWidth) {
621  // The unroll and vectorization factors are only used in asserts builds
622  // to verify map entries are sized appropriately.
623  (void)UF;
624  (void)VF;
625  }
626 
627  /// \return True if the map has a vector entry for \p Key.
628  bool hasVector(Value *Key) const { return VectorMapStorage.count(Key); }
629 
630  /// \return True if the map has a scalar entry for \p Key.
631  bool hasScalar(Value *Key) const { return ScalarMapStorage.count(Key); }
632 
633  /// \brief Map \p Key to the given VectorParts \p Entry, and return a
634  /// constant reference to the new vector map entry. The given key should
635  /// not already be in the map, and the given VectorParts should be
636  /// correctly sized for the current unroll factor.
637  const VectorParts &initVector(Value *Key, const VectorParts &Entry) {
638  assert(!hasVector(Key) && "Vector entry already initialized");
639  assert(Entry.size() == UF && "VectorParts has wrong dimensions");
640  VectorMapStorage[Key] = Entry;
641  return VectorMapStorage[Key];
642  }
643 
644  /// \brief Map \p Key to the given ScalarParts \p Entry, and return a
645  /// constant reference to the new scalar map entry. The given key should
646  /// not already be in the map, and the given ScalarParts should be
647  /// correctly sized for the current unroll and vectorization factors.
648  const ScalarParts &initScalar(Value *Key, const ScalarParts &Entry) {
649  assert(!hasScalar(Key) && "Scalar entry already initialized");
650  assert(Entry.size() == UF &&
651  all_of(make_range(Entry.begin(), Entry.end()),
652  [&](const SmallVectorImpl<Value *> &Values) -> bool {
653  return Values.size() == VF;
654  }) &&
655  "ScalarParts has wrong dimensions");
656  ScalarMapStorage[Key] = Entry;
657  return ScalarMapStorage[Key];
658  }
659 
660  /// \return A reference to the vector map entry corresponding to \p Key.
661  /// The key should already be in the map. This function should only be used
662  /// when it's necessary to update values that have already been vectorized.
663  /// This is the case for "fix-up" operations including type truncation and
664  /// the second phase of recurrence vectorization. If a non-const reference
665  /// isn't required, getVectorValue should be used instead.
666  VectorParts &getVector(Value *Key) {
667  assert(hasVector(Key) && "Vector entry not initialized");
668  return VectorMapStorage.find(Key)->second;
669  }
670 
671  /// Retrieve an entry from the vector or scalar maps. The preferred way to
672  /// access an existing mapped entry is with getVectorValue or
673  /// getScalarValue from InnerLoopVectorizer. Until those functions can be
674  /// moved inside ValueMap, we have to declare them as friends.
675  friend const VectorParts &InnerLoopVectorizer::getVectorValue(Value *V);
676  friend Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part,
677  unsigned Lane);
678 
679  private:
680  /// The unroll factor. Each entry in the vector map contains UF vector
681  /// values.
682  unsigned UF;
683 
684  /// The vectorization factor. Each entry in the scalar map contains UF x VF
685  /// scalar values.
686  unsigned VF;
687 
688  /// The vector and scalar map storage. We use std::map and not DenseMap
689  /// because insertions to DenseMap invalidate its iterators.
690  std::map<Value *, VectorParts> VectorMapStorage;
691  std::map<Value *, ScalarParts> ScalarMapStorage;
692  };
693 
694  /// The original loop.
695  Loop *OrigLoop;
696  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
697  /// dynamic knowledge to simplify SCEV expressions and converts them to a
698  /// more usable form.
699  PredicatedScalarEvolution &PSE;
700  /// Loop Info.
701  LoopInfo *LI;
702  /// Dominator Tree.
703  DominatorTree *DT;
704  /// Alias Analysis.
705  AliasAnalysis *AA;
706  /// Target Library Info.
707  const TargetLibraryInfo *TLI;
708  /// Target Transform Info.
709  const TargetTransformInfo *TTI;
710  /// Assumption Cache.
711  AssumptionCache *AC;
712  /// Interface to emit optimization remarks.
713  OptimizationRemarkEmitter *ORE;
714 
715  /// \brief LoopVersioning. It's only set up (non-null) if memchecks were
716  /// used.
717  ///
718  /// This is currently only used to add no-alias metadata based on the
719 /// memchecks. The actual versioning is performed manually.
720  std::unique_ptr<LoopVersioning> LVer;
721 
722  /// The vectorization SIMD factor to use. Each vector will have this many
723  /// vector elements.
724  unsigned VF;
725 
726 protected:
727  /// The vectorization unroll factor to use. Each scalar is vectorized to this
728  /// many different vector instructions.
729  unsigned UF;
730 
731  /// The builder that we use
732  IRBuilder<> Builder;
733 
734  // --- Vectorization state ---
735 
736  /// The vector-loop preheader.
737  BasicBlock *LoopVectorPreHeader;
738  /// The scalar-loop preheader.
739  BasicBlock *LoopScalarPreHeader;
740  /// Middle Block between the vector and the scalar.
741  BasicBlock *LoopMiddleBlock;
742  /// The ExitBlock of the scalar loop.
743  BasicBlock *LoopExitBlock;
744  /// The vector loop body.
745  BasicBlock *LoopVectorBody;
746  /// The scalar loop body.
747  BasicBlock *LoopScalarBody;
748  /// A list of all bypass blocks. The first block is the entry of the loop.
749  SmallVector<BasicBlock *, 4> LoopBypassBlocks;
750 
751  /// The new Induction variable which was added to the new block.
752  PHINode *Induction;
753  /// The induction variable of the old basic block.
754  PHINode *OldInduction;
755 
756  /// Maps values from the original loop to their corresponding values in the
757  /// vectorized loop. A key value can map to either vector values, scalar
758  /// values or both kinds of values, depending on whether the key was
759  /// vectorized and scalarized.
760  ValueMap VectorLoopValueMap;
761 
762  /// Store instructions that should be predicated, as a pair
763  /// <StoreInst, Predicate>
764  SmallVector<std::pair<Instruction *, Value *>, 4> PredicatedInstructions;
765  EdgeMaskCache MaskCache;
766  /// Trip count of the original loop.
767  Value *TripCount;
768  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
769  Value *VectorTripCount;
770 
771  /// The legality analysis.
772  LoopVectorizationLegality *Legal;
773 
774  /// The profitability analysis.
775  LoopVectorizationCostModel *Cost;
776 
777  // Record whether runtime checks are added.
778  bool AddedSafetyChecks;
779 
780  // Holds instructions from the original loop whose counterparts in the
781  // vectorized loop would be trivially dead if generated. For example,
782  // original induction update instructions can become dead because we
783  // separately emit induction "steps" when generating code for the new loop.
784  // Similarly, we create a new latch condition when setting up the structure
785  // of the new loop, so the old one can become dead.
786  SmallPtrSet<Instruction *, 4> DeadInstructions;
787 
788  // Holds the end values for each induction variable. We save the end values
789  // so we can later fix-up the external users of the induction variables.
790  DenseMap<PHINode *, Value *> IVEndValues;
791 };
792 
793 class InnerLoopUnroller : public InnerLoopVectorizer {
794 public:
795  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
796  LoopInfo *LI, DominatorTree *DT,
797  const TargetLibraryInfo *TLI,
798  const TargetTransformInfo *TTI, AssumptionCache *AC,
799  OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
800  LoopVectorizationLegality *LVL,
801  LoopVectorizationCostModel *CM)
802  : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
803  UnrollFactor, LVL, CM) {}
804 
805 private:
806  void scalarizeInstruction(Instruction *Instr,
807  bool IfPredicateInstr = false) override;
808  void vectorizeMemoryInstruction(Instruction *Instr) override;
809  Value *getBroadcastInstrs(Value *V) override;
810  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
811  Instruction::BinaryOps Opcode =
812  Instruction::BinaryOpsEnd) override;
813  Value *reverseVector(Value *Vec) override;
814 };
815 
816 /// \brief Look for a meaningful debug location on the instruction or its
817 /// operands.
818 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
819  if (!I)
820  return I;
821 
822  DebugLoc Empty;
823  if (I->getDebugLoc() != Empty)
824  return I;
825 
826  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
827  if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
828  if (OpInst->getDebugLoc() != Empty)
829  return OpInst;
830  }
831 
832  return I;
833 }
834 
835 /// \brief Set the debug location in the builder using the debug location in the
836 /// instruction.
837 static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
838  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr))
839  B.SetCurrentDebugLocation(Inst->getDebugLoc());
840  else
841  B.SetCurrentDebugLocation(DebugLoc());
842 }
843 
844 #ifndef NDEBUG
845 /// \return string containing a file name and a line # for the given loop.
846 static std::string getDebugLocString(const Loop *L) {
847  std::string Result;
848  if (L) {
849  raw_string_ostream OS(Result);
850  if (const DebugLoc LoopDbgLoc = L->getStartLoc())
851  LoopDbgLoc.print(OS);
852  else
853  // Just print the module name.
854  OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
855  OS.flush();
856  }
857  return Result;
858 }
859 #endif
860 
861 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
862  const Instruction *Orig) {
863  // If the loop was versioned with memchecks, add the corresponding no-alias
864  // metadata.
865  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
866  LVer->annotateInstWithNoAlias(To, Orig);
867 }
868 
869 void InnerLoopVectorizer::addMetadata(Instruction *To,
870  Instruction *From) {
871  propagateMetadata(To, From);
872  addNewMetadata(To, From);
873 }
874 
875 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
876  Instruction *From) {
877  for (Value *V : To) {
878  if (Instruction *I = dyn_cast<Instruction>(V))
879  addMetadata(I, From);
880  }
881 }
882 
883 /// \brief The group of interleaved loads/stores sharing the same stride and
884 /// close to each other.
885 ///
886 /// Each member in this group has an index starting from 0, and the largest
887 /// index should be less than the interleave factor, which is equal to the absolute
888 /// value of the access's stride.
889 ///
890 /// E.g. An interleaved load group of factor 4:
891 /// for (unsigned i = 0; i < 1024; i+=4) {
892 /// a = A[i]; // Member of index 0
893 /// b = A[i+1]; // Member of index 1
894 /// d = A[i+3]; // Member of index 3
895 /// ...
896 /// }
897 ///
898 /// An interleaved store group of factor 4:
899 /// for (unsigned i = 0; i < 1024; i+=4) {
900 /// ...
901 /// A[i] = a; // Member of index 0
902 /// A[i+1] = b; // Member of index 1
903 /// A[i+2] = c; // Member of index 2
904 /// A[i+3] = d; // Member of index 3
905 /// }
906 ///
907 /// Note: the interleaved load group could have gaps (missing members), but
908 /// the interleaved store group doesn't allow gaps.
909 class InterleaveGroup {
910 public:
911  InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
912  : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) {
913  assert(Align && "The alignment should be non-zero");
914 
915  Factor = std::abs(Stride);
916  assert(Factor > 1 && "Invalid interleave factor");
917 
918  Reverse = Stride < 0;
919  Members[0] = Instr;
920  }
921 
922  bool isReverse() const { return Reverse; }
923  unsigned getFactor() const { return Factor; }
924  unsigned getAlignment() const { return Align; }
925  unsigned getNumMembers() const { return Members.size(); }
926 
927  /// \brief Try to insert a new member \p Instr with index \p Index and
928  /// alignment \p NewAlign. The index is related to the leader and it could be
929  /// negative if it is the new leader.
930  ///
931  /// \returns false if the instruction doesn't belong to the group.
932  bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
933  assert(NewAlign && "The new member's alignment should be non-zero");
934 
935  int Key = Index + SmallestKey;
936 
937  // Skip if there is already a member with the same index.
938  if (Members.count(Key))
939  return false;
940 
941  if (Key > LargestKey) {
942  // The largest index is always less than the interleave factor.
943  if (Index >= static_cast<int>(Factor))
944  return false;
945 
946  LargestKey = Key;
947  } else if (Key < SmallestKey) {
948  // Keep the index range within the interleave factor.
949  if (LargestKey - Key >= static_cast<int>(Factor))
950  return false;
951 
952  SmallestKey = Key;
953  }
954 
955  // It's always safe to select the minimum alignment.
956  Align = std::min(Align, NewAlign);
957  Members[Key] = Instr;
958  return true;
959  }
960 
961  /// \brief Get the member with the given index \p Index
962  ///
963  /// \returns nullptr if the group contains no such member.
964  Instruction *getMember(unsigned Index) const {
965  int Key = SmallestKey + Index;
966  if (!Members.count(Key))
967  return nullptr;
968 
969  return Members.find(Key)->second;
970  }
971 
972  /// \brief Get the index for the given member. Unlike the key in the member
973  /// map, the index starts from 0.
974  unsigned getIndex(Instruction *Instr) const {
975  for (auto I : Members)
976  if (I.second == Instr)
977  return I.first - SmallestKey;
978 
979  llvm_unreachable("InterleaveGroup contains no such member");
980  }
981 
982  Instruction *getInsertPos() const { return InsertPos; }
983  void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
984 
985 private:
986  unsigned Factor; // Interleave Factor.
987  bool Reverse;
988  unsigned Align;
989  DenseMap<int, Instruction *> Members;
990  int SmallestKey;
991  int LargestKey;
992 
993  // To avoid breaking dependences, vectorized instructions of an interleave
994  // group should be inserted at either the first load or the last store in
995  // program order.
996  //
997  // E.g. %even = load i32 // Insert Position
998  // %add = add i32 %even // Use of %even
999  // %odd = load i32
1000  //
1001  // store i32 %even
1002  // %odd = add i32 // Def of %odd
1003  // store i32 %odd // Insert Position
1004  Instruction *InsertPos;
1005 };
1006 
1007 /// \brief Drive the analysis of interleaved memory accesses in the loop.
1008 ///
1009 /// Use this class to analyze interleaved accesses only when we can vectorize
1010 /// a loop. Otherwise it's meaningless to do analysis as the vectorization
1011 /// on interleaved accesses is unsafe.
1012 ///
1013 /// The analysis collects interleave groups and records the relationships
1014 /// between the member and the group in a map.
1015 class InterleavedAccessInfo {
1016 public:
1017  InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
1018  DominatorTree *DT, LoopInfo *LI)
1019  : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(nullptr),
1020  RequiresScalarEpilogue(false) {}
1021 
1022  ~InterleavedAccessInfo() {
1023  SmallSet<InterleaveGroup *, 4> DelSet;
1024  // Avoid releasing a pointer twice.
1025  for (auto &I : InterleaveGroupMap)
1026  DelSet.insert(I.second);
1027  for (auto *Ptr : DelSet)
1028  delete Ptr;
1029  }
1030 
1031  /// \brief Analyze the interleaved accesses and collect them in interleave
1032  /// groups. Substitute symbolic strides using \p Strides.
1033  void analyzeInterleaving(const ValueToValueMap &Strides);
1034 
1035  /// \brief Check if \p Instr belongs to any interleave group.
1036  bool isInterleaved(Instruction *Instr) const {
1037  return InterleaveGroupMap.count(Instr);
1038  }
1039 
1040  /// \brief Return the maximum interleave factor of all interleaved groups.
1041  unsigned getMaxInterleaveFactor() const {
1042  unsigned MaxFactor = 1;
1043  for (auto &Entry : InterleaveGroupMap)
1044  MaxFactor = std::max(MaxFactor, Entry.second->getFactor());
1045  return MaxFactor;
1046  }
1047 
1048  /// \brief Get the interleave group that \p Instr belongs to.
1049  ///
1050  /// \returns nullptr if \p Instr does not belong to any interleave group.
1051  InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
1052  if (InterleaveGroupMap.count(Instr))
1053  return InterleaveGroupMap.find(Instr)->second;
1054  return nullptr;
1055  }
1056 
1057  /// \brief Returns true if an interleaved group that may access memory
1058  /// out-of-bounds requires a scalar epilogue iteration for correctness.
1059  bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
1060 
1061  /// \brief Initialize the LoopAccessInfo used for dependence checking.
1062  void setLAI(const LoopAccessInfo *Info) { LAI = Info; }
1063 
1064 private:
1065  /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
1066  /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
1067  /// The interleaved access analysis can also add new predicates (for example
1068  /// by versioning strides of pointers).
1069  PredicatedScalarEvolution &PSE;
1070  Loop *TheLoop;
1071  DominatorTree *DT;
1072  LoopInfo *LI;
1073  const LoopAccessInfo *LAI;
1074 
1075  /// True if the loop may contain non-reversed interleaved groups with
1076  /// out-of-bounds accesses. We ensure we don't speculatively access memory
1077  /// out-of-bounds by executing at least one scalar epilogue iteration.
1078  bool RequiresScalarEpilogue;
1079 
1080  /// Holds the relationships between the members and the interleave group.
1081  DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
1082 
1083  /// Holds dependences among the memory accesses in the loop. It maps a source
1084  /// access to a set of dependent sink accesses.
1085  DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;
1086 
1087  /// \brief The descriptor for a strided memory access.
1088  struct StrideDescriptor {
1089  StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
1090  unsigned Align)
1091  : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
1092 
1093  StrideDescriptor() = default;
1094 
1095  // The access's stride. It is negative for a reverse access.
1096  int64_t Stride = 0;
1097  const SCEV *Scev = nullptr; // The scalar expression of this access
1098  uint64_t Size = 0; // The size of the memory object.
1099  unsigned Align = 0; // The alignment of this access.
1100  };
1101 
1102  /// \brief A type for holding instructions and their stride descriptors.
1103  typedef std::pair<Instruction *, StrideDescriptor> StrideEntry;
1104 
1105  /// \brief Create a new interleave group with the given instruction \p Instr,
1106  /// stride \p Stride and alignment \p Align.
1107  ///
1108  /// \returns the newly created interleave group.
1109  InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,
1110  unsigned Align) {
1111  assert(!InterleaveGroupMap.count(Instr) &&
1112  "Already in an interleaved access group");
1113  InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align);
1114  return InterleaveGroupMap[Instr];
1115  }
1116 
1117  /// \brief Release the group and remove all the relationships.
1118  void releaseGroup(InterleaveGroup *Group) {
1119  for (unsigned i = 0; i < Group->getFactor(); i++)
1120  if (Instruction *Member = Group->getMember(i))
1121  InterleaveGroupMap.erase(Member);
1122 
1123  delete Group;
1124  }
1125 
1126  /// \brief Collect all the accesses with a constant stride in program order.
1127  void collectConstStrideAccesses(
1128  MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
1129  const ValueToValueMap &Strides);
1130 
1131  /// \brief Returns true if \p Stride is allowed in an interleaved group.
1132  static bool isStrided(int Stride) {
1133  unsigned Factor = std::abs(Stride);
1134  return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
1135  }
1136 
1137  /// \brief Returns true if \p BB is a predicated block.
1138  bool isPredicated(BasicBlock *BB) const {
1139  return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
1140  }
1141 
1142  /// \brief Returns true if LoopAccessInfo can be used for dependence queries.
1143  bool areDependencesValid() const {
1144  return LAI && LAI->getDepChecker().getDependences();
1145  }
1146 
1147  /// \brief Returns true if memory accesses \p A and \p B can be reordered, if
1148  /// necessary, when constructing interleaved groups.
1149  ///
1150 /// \p A must precede \p B in program order. We return false if reordering
1151 /// might be unsafe because \p A and \p B may be dependent.
1152  bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
1153  StrideEntry *B) const {
1154 
1155  // Code motion for interleaved accesses can potentially hoist strided loads
1156  // and sink strided stores. The code below checks the legality of the
1157  // following two conditions:
1158  //
1159  // 1. Potentially moving a strided load (B) before any store (A) that
1160  // precedes B, or
1161  //
1162  // 2. Potentially moving a strided store (A) after any load or store (B)
1163  // that A precedes.
1164  //
1165  // It's legal to reorder A and B if we know there isn't a dependence from A
1166  // to B. Note that this determination is conservative since some
1167  // dependences could potentially be reordered safely.
1168 
1169  // A is potentially the source of a dependence.
1170  auto *Src = A->first;
1171  auto SrcDes = A->second;
1172 
1173  // B is potentially the sink of a dependence.
1174  auto *Sink = B->first;
1175  auto SinkDes = B->second;
1176 
1177  // Code motion for interleaved accesses can't violate WAR dependences.
1178  // Thus, reordering is legal if the source isn't a write.
1179  if (!Src->mayWriteToMemory())
1180  return true;
1181 
1182  // At least one of the accesses must be strided.
1183  if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
1184  return true;
1185 
1186  // If dependence information is not available from LoopAccessInfo,
1187  // conservatively assume the instructions can't be reordered.
1188  if (!areDependencesValid())
1189  return false;
1190 
1191  // If we know there is a dependence from source to sink, assume the
1192  // instructions can't be reordered. Otherwise, reordering is legal.
1193  return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);
1194  }
1195 
1196  /// \brief Collect the dependences from LoopAccessInfo.
1197  ///
1198  /// We process the dependences once during the interleaved access analysis to
1199  /// enable constant-time dependence queries.
1200  void collectDependences() {
1201  if (!areDependencesValid())
1202  return;
1203  auto *Deps = LAI->getDepChecker().getDependences();
1204  for (auto Dep : *Deps)
1205  Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
1206  }
1207 };
1208 
1209 /// Utility class for getting and setting loop vectorizer hints in the form
1210 /// of loop metadata.
1211 /// This class keeps a number of loop annotations locally (as member variables)
1212 /// and can, upon request, write them back as metadata on the loop. It will
1213 /// initially scan the loop for existing metadata, and will update the local
1214 /// values based on information in the loop.
1215 /// We cannot write all values to metadata, as the mere presence of some info,
1216 /// for example 'force', means a decision has been made. So, we need to be
1217 /// careful NOT to add them if the user hasn't specifically asked for them.
/// (An example of the loop metadata these hints map to follows this class.)
1218 class LoopVectorizeHints {
1219  enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE };
1220 
1221  /// Hint - associates name and validation with the hint value.
1222  struct Hint {
1223  const char *Name;
1224  unsigned Value; // This may have to change for non-numeric values.
1225  HintKind Kind;
1226 
1227  Hint(const char *Name, unsigned Value, HintKind Kind)
1228  : Name(Name), Value(Value), Kind(Kind) {}
1229 
1230  bool validate(unsigned Val) {
1231  switch (Kind) {
1232  case HK_WIDTH:
1233  return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
1234  case HK_UNROLL:
1235  return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
1236  case HK_FORCE:
1237  return (Val <= 1);
1238  }
1239  return false;
1240  }
1241  };
1242 
1243  /// Vectorization width.
1244  Hint Width;
1245  /// Vectorization interleave factor.
1246  Hint Interleave;
1247  /// Vectorization forced
1248  Hint Force;
1249 
1250  /// Return the loop metadata prefix.
1251  static StringRef Prefix() { return "llvm.loop."; }
1252 
1253  /// True if there is any unsafe math in the loop.
1254  bool PotentiallyUnsafe;
1255 
1256 public:
1257  enum ForceKind {
1258  FK_Undefined = -1, ///< Not selected.
1259  FK_Disabled = 0, ///< Forcing disabled.
1260  FK_Enabled = 1, ///< Forcing enabled.
1261  };
1262 
1263  LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
1264  OptimizationRemarkEmitter &ORE)
1265  : Width("vectorize.width", VectorizerParams::VectorizationFactor,
1266  HK_WIDTH),
1267  Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
1268  Force("vectorize.enable", FK_Undefined, HK_FORCE),
1269  PotentiallyUnsafe(false), TheLoop(L), ORE(ORE) {
1270  // Populate values with existing loop metadata.
1271  getHintsFromMetadata();
1272 
1273  // force-vector-interleave overrides DisableInterleaving.
1274  if (VectorizerParams::isInterleaveForced())
1275  Interleave.Value = VectorizerParams::VectorizationInterleave;
1276 
1277  DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
1278  << "LV: Interleaving disabled by the pass manager\n");
1279  }
1280 
1281  /// Mark the loop L as already vectorized by setting the width to 1.
1282  void setAlreadyVectorized() {
1283  Width.Value = Interleave.Value = 1;
1284  Hint Hints[] = {Width, Interleave};
1285  writeHintsToMetadata(Hints);
1286  }
1287 
1288  bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
1289  if (getForce() == LoopVectorizeHints::FK_Disabled) {
1290  DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
1291  emitRemarkWithHints();
1292  return false;
1293  }
1294 
1295  if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
1296  DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
1297  emitRemarkWithHints();
1298  return false;
1299  }
1300 
1301  if (getWidth() == 1 && getInterleave() == 1) {
1302  // FIXME: Add a separate metadata to indicate when the loop has already
1303  // been vectorized instead of setting width and count to 1.
1304  DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
1305  // FIXME: Add interleave.disable metadata. This will allow
1306  // vectorize.disable to be used without disabling the pass and errors
1307  // to differentiate between disabled vectorization and a width of 1.
1308  ORE.emit(OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
1309  "AllDisabled", L->getStartLoc(),
1310  L->getHeader())
1311  << "loop not vectorized: vectorization and interleaving are "
1312  "explicitly disabled, or vectorize width and interleave "
1313  "count are both set to 1");
1314  return false;
1315  }
1316 
1317  return true;
1318  }
1319 
1320  /// Dumps all the hint information.
1321  void emitRemarkWithHints() const {
1322  using namespace ore;
1323  if (Force.Value == LoopVectorizeHints::FK_Disabled)
1324  ORE.emit(OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
1325  TheLoop->getStartLoc(),
1326  TheLoop->getHeader())
1327  << "loop not vectorized: vectorization is explicitly disabled");
1328  else {
1329  OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
1330  TheLoop->getStartLoc(), TheLoop->getHeader());
1331  R << "loop not vectorized";
1332  if (Force.Value == LoopVectorizeHints::FK_Enabled) {
1333  R << " (Force=" << NV("Force", true);
1334  if (Width.Value != 0)
1335  R << ", Vector Width=" << NV("VectorWidth", Width.Value);
1336  if (Interleave.Value != 0)
1337  R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
1338  R << ")";
1339  }
1340  ORE.emit(R);
1341  }
1342  }
1343 
1344  unsigned getWidth() const { return Width.Value; }
1345  unsigned getInterleave() const { return Interleave.Value; }
1346  enum ForceKind getForce() const { return (ForceKind)Force.Value; }
1347 
1348  /// \brief If hints are provided that force vectorization, use the AlwaysPrint
1349  /// pass name to force the frontend to print the diagnostic.
1350  const char *vectorizeAnalysisPassName() const {
1351  if (getWidth() == 1)
1352  return LV_NAME;
1353  if (getForce() == LoopVectorizeHints::FK_Disabled)
1354  return LV_NAME;
1355  if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
1356  return LV_NAME;
1357  return OptimizationRemarkAnalysis::AlwaysPrint;
1358  }
1359 
1360  bool allowReordering() const {
1361  // When enabling loop hints are provided, we allow the vectorizer to change
1362  // the order of operations that is given by the scalar loop. This is not
1363  // enabled by default because it can be unsafe or inefficient. For example,
1364  // reordering floating-point operations will change the way round-off
1365  // error accumulates in the loop.
1366  return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
1367  }
1368 
1369  bool isPotentiallyUnsafe() const {
1370  // Avoid FP vectorization if the target is unsure about proper support.
1371  // This may be related to the SIMD unit in the target not handling
1372  // IEEE 754 FP ops properly, or bad single-to-double promotions.
1373  // Otherwise, a sequence of vectorized loops, even without reduction,
1374  // could lead to different end results on the destination vectors.
1375  return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe;
1376  }
1377 
1378  void setPotentiallyUnsafe() { PotentiallyUnsafe = true; }
1379 
1380 private:
1381  /// Find hints specified in the loop metadata and update local values.
1382  void getHintsFromMetadata() {
1383  MDNode *LoopID = TheLoop->getLoopID();
1384  if (!LoopID)
1385  return;
1386 
1387  // First operand should refer to the loop id itself.
1388  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
1389  assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
1390 
1391  for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
1392  const MDString *S = nullptr;
1393  SmallVector<Metadata *, 4> Args;
1394 
1395  // The expected hint is either a MDString or a MDNode with the first
1396  // operand a MDString.
1397  if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
1398  if (!MD || MD->getNumOperands() == 0)
1399  continue;
1400  S = dyn_cast<MDString>(MD->getOperand(0));
1401  for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
1402  Args.push_back(MD->getOperand(i));
1403  } else {
1404  S = dyn_cast<MDString>(LoopID->getOperand(i));
1405  assert(Args.size() == 0 && "too many arguments for MDString");
1406  }
1407 
1408  if (!S)
1409  continue;
1410 
1411  // Check if the hint starts with the loop metadata prefix.
1412  StringRef Name = S->getString();
1413  if (Args.size() == 1)
1414  setHint(Name, Args[0]);
1415  }
1416  }
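// For illustration, the hints parsed above typically arrive as !llvm.loop
// metadata emitted by the frontend, e.g. (operand values assumed):
//   !0 = distinct !{!0, !1, !2}
//   !1 = !{!"llvm.loop.vectorize.width", i32 4}
//   !2 = !{!"llvm.loop.vectorize.enable", i1 true}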
1417 
1418  /// Checks a string hint with one operand and sets the value if valid.
1419  void setHint(StringRef Name, Metadata *Arg) {
1420  if (!Name.startswith(Prefix()))
1421  return;
1422  Name = Name.substr(Prefix().size(), StringRef::npos);
1423 
1424  const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
1425  if (!C)
1426  return;
1427  unsigned Val = C->getZExtValue();
1428 
1429  Hint *Hints[] = {&Width, &Interleave, &Force};
1430  for (auto H : Hints) {
1431  if (Name == H->Name) {
1432  if (H->validate(Val))
1433  H->Value = Val;
1434  else
1435  DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
1436  break;
1437  }
1438  }
1439  }
1440 
1441  /// Create a new hint from name / value pair.
1442  MDNode *createHintMetadata(StringRef Name, unsigned V) const {
1443  LLVMContext &Context = TheLoop->getHeader()->getContext();
1444  Metadata *MDs[] = {MDString::get(Context, Name),
1445  ConstantAsMetadata::get(
1446  ConstantInt::get(Type::getInt32Ty(Context), V))};
1447  return MDNode::get(Context, MDs);
1448  }
1449 
1450  /// Matches metadata with hint name.
1451  bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
1452  MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
1453  if (!Name)
1454  return false;
1455 
1456  for (auto H : HintTypes)
1457  if (Name->getString().endswith(H.Name))
1458  return true;
1459  return false;
1460  }
1461 
1462  /// Sets current hints into loop metadata, keeping other values intact.
1463  void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
1464  if (HintTypes.size() == 0)
1465  return;
1466 
1467  // Reserve the first element for the LoopID (see below).
1468  SmallVector<Metadata *, 4> MDs(1);
1469  // If the loop already has metadata, then ignore the existing operands.
1470  MDNode *LoopID = TheLoop->getLoopID();
1471  if (LoopID) {
1472  for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
1473  MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
1474  // If node in update list, ignore old value.
1475  if (!matchesHintMetadataName(Node, HintTypes))
1476  MDs.push_back(Node);
1477  }
1478  }
1479 
1480  // Now, add the missing hints.
1481  for (auto H : HintTypes)
1482  MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
1483 
1484  // Replace current metadata node with new one.
1485  LLVMContext &Context = TheLoop->getHeader()->getContext();
1486  MDNode *NewLoopID = MDNode::get(Context, MDs);
1487  // Set operand 0 to refer to the loop id itself.
1488  NewLoopID->replaceOperandWith(0, NewLoopID);
1489 
1490  TheLoop->setLoopID(NewLoopID);
1491  }
1492 
1493  /// The loop these hints belong to.
1494  const Loop *TheLoop;
1495 
1496  /// Interface to emit optimization remarks.
1497  OptimizationRemarkEmitter &ORE;
1498 };
1499 
1500 static void emitAnalysisDiag(const Loop *TheLoop,
1501  const LoopVectorizeHints &Hints,
1502  OptimizationRemarkEmitter &ORE,
1503  const LoopAccessReport &Message) {
1504  const char *Name = Hints.vectorizeAnalysisPassName();
1505  LoopAccessReport::emitAnalysis(Message, TheLoop, Name, ORE);
1506 }
1507 
1508 static void emitMissedWarning(Function *F, Loop *L,
1509  const LoopVectorizeHints &LH,
1510  OptimizationRemarkEmitter *ORE) {
1511  LH.emitRemarkWithHints();
1512 
1513  if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
1514  if (LH.getWidth() != 1)
1515  emitLoopVectorizeWarning(
1516  F->getContext(), *F, L->getStartLoc(),
1517  "failed explicitly specified loop vectorization");
1518  else if (LH.getInterleave() != 1)
1519  emitLoopInterleaveWarning(
1520  F->getContext(), *F, L->getStartLoc(),
1521  "failed explicitly specified loop interleaving");
1522  }
1523 }
1524 
1525 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
1526 /// to what vectorization factor.
1527 /// This class does not look at the profitability of vectorization, only the
1528 /// legality. This class has two main kinds of checks:
1529 /// * Memory checks - The code in canVectorizeMemory checks if vectorization
1530 /// will change the order of memory accesses in a way that will change the
1531 /// correctness of the program.
1532 /// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
1533 /// checks for a number of different conditions, such as the availability of a
1534 /// single induction variable, that all types are supported and vectorize-able,
1535 /// etc. This code reflects the capabilities of InnerLoopVectorizer.
1536 /// This class is also used by InnerLoopVectorizer for identifying the
1537 /// induction variable and the different reduction variables.
1538 class LoopVectorizationLegality {
1539 public:
1540  LoopVectorizationLegality(
1541  Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
1542  TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
1543  const TargetTransformInfo *TTI,
1544  std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
1545  OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
1546  LoopVectorizeHints *H)
1547  : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT),
1548  GetLAA(GetLAA), LAI(nullptr), ORE(ORE), InterleaveInfo(PSE, L, DT, LI),
1549  Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
1550  Requirements(R), Hints(H) {}
1551 
1552  /// ReductionList contains the reduction descriptors for all
1553  /// of the reductions that were found in the loop.
1554  typedef DenseMap<PHINode *, RecurrenceDescriptor> ReductionList;
1555 
1556  /// InductionList saves induction variables and maps them to the
1557  /// induction descriptor.
1558  typedef MapVector<PHINode *, InductionDescriptor> InductionList;
1559 
1560  /// RecurrenceSet contains the phi nodes that are recurrences other than
1561  /// inductions and reductions.
1562  typedef SmallPtrSet<const PHINode *, 8> RecurrenceSet;
1563 
1564  /// Returns true if it is legal to vectorize this loop.
1565  /// This does not mean that it is profitable to vectorize this
1566  /// loop, only that it is legal to do so.
1567  bool canVectorize();
1568 
1569  /// Returns the Induction variable.
1570  PHINode *getInduction() { return Induction; }
1571 
1572  /// Returns the reduction variables found in the loop.
1573  ReductionList *getReductionVars() { return &Reductions; }
1574 
1575  /// Returns the induction variables found in the loop.
1576  InductionList *getInductionVars() { return &Inductions; }
1577 
1578  /// Return the first-order recurrences found in the loop.
1579  RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; }
1580 
1581  /// Returns the widest induction type.
1582  Type *getWidestInductionType() { return WidestIndTy; }
1583 
1584  /// Returns True if V is an induction variable in this loop.
1585  bool isInductionVariable(const Value *V);
1586 
1587  /// Returns True if PN is a reduction variable in this loop.
1588  bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }
1589 
1590  /// Returns True if Phi is a first-order recurrence in this loop.
1591  bool isFirstOrderRecurrence(const PHINode *Phi);
1592 
1593  /// Return true if the block BB needs to be predicated in order for the loop
1594  /// to be vectorized.
1595  bool blockNeedsPredication(BasicBlock *BB);
1596 
1597  /// Check if this pointer is consecutive when vectorizing. This happens
1598  /// when the last index of the GEP is the induction variable, or that the
1599  /// pointer itself is an induction variable.
1600  /// This check allows us to vectorize A[idx] into a wide load/store.
1601  /// Returns:
1602  /// 0 - Stride is unknown or non-consecutive.
1603  /// 1 - Address is consecutive.
1604  /// -1 - Address is consecutive, and decreasing.
1605  int isConsecutivePtr(Value *Ptr);
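// For illustration: for a loop with induction variable i, a pointer such as
// &A[i] is consecutive (1), &A[N - i] is consecutive and decreasing (-1),
// and &A[2 * i] is strided and therefore non-consecutive (0).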
1606 
1607  /// Returns true if the value V is uniform within the loop.
1608  bool isUniform(Value *V);
1609 
1610  /// Returns true if \p I is known to be uniform after vectorization.
1611  bool isUniformAfterVectorization(Instruction *I) { return Uniforms.count(I); }
1612 
1613  /// Returns true if \p I is known to be scalar after vectorization.
1614  bool isScalarAfterVectorization(Instruction *I) { return Scalars.count(I); }
1615 
1616  /// Returns the information that we collected about runtime memory check.
1617  const RuntimePointerChecking *getRuntimePointerChecking() const {
1618  return LAI->getRuntimePointerChecking();
1619  }
1620 
1621  const LoopAccessInfo *getLAI() const { return LAI; }
1622 
1623  /// \brief Check if \p Instr belongs to any interleaved access group.
1624  bool isAccessInterleaved(Instruction *Instr) {
1625  return InterleaveInfo.isInterleaved(Instr);
1626  }
1627 
1628  /// \brief Return the maximum interleave factor of all interleaved groups.
1629  unsigned getMaxInterleaveFactor() const {
1630  return InterleaveInfo.getMaxInterleaveFactor();
1631  }
1632 
1633  /// \brief Get the interleaved access group that \p Instr belongs to.
1634  const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
1635  return InterleaveInfo.getInterleaveGroup(Instr);
1636  }
1637 
1638  /// \brief Returns true if an interleaved group requires a scalar iteration
1639  /// to handle accesses with gaps.
1640  bool requiresScalarEpilogue() const {
1641  return InterleaveInfo.requiresScalarEpilogue();
1642  }
1643 
1644  unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
1645 
1646  bool hasStride(Value *V) { return LAI->hasStride(V); }
1647 
1648  /// Returns true if the target machine supports masked store operation
1649  /// for the given \p DataType and kind of access to \p Ptr.
1650  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
1651  return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
1652  }
1653  /// Returns true if the target machine supports masked load operation
1654  /// for the given \p DataType and kind of access to \p Ptr.
1655  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
1656  return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
1657  }
1658  /// Returns true if the target machine supports masked scatter operation
1659  /// for the given \p DataType.
1660  bool isLegalMaskedScatter(Type *DataType) {
1661  return TTI->isLegalMaskedScatter(DataType);
1662  }
1663  /// Returns true if the target machine supports masked gather operation
1664  /// for the given \p DataType.
1665  bool isLegalMaskedGather(Type *DataType) {
1666  return TTI->isLegalMaskedGather(DataType);
1667  }
1668  /// Returns true if the target machine can represent \p V as a masked gather
1669  /// or scatter operation.
1670  bool isLegalGatherOrScatter(Value *V) {
1671  auto *LI = dyn_cast<LoadInst>(V);
1672  auto *SI = dyn_cast<StoreInst>(V);
1673  if (!LI && !SI)
1674  return false;
1675  auto *Ptr = getPointerOperand(V);
1676  auto *Ty = cast<PointerType>(Ptr->getType())->getElementType();
1677  return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1678  }
1679 
1680  /// Returns true if vector representation of the instruction \p I
1681  /// requires mask.
1682  bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
1683  unsigned getNumStores() const { return LAI->getNumStores(); }
1684  unsigned getNumLoads() const { return LAI->getNumLoads(); }
1685  unsigned getNumPredStores() const { return NumPredStores; }
1686 
1687  /// Returns true if \p I is an instruction that will be scalarized with
1688  /// predication. Such instructions include conditional stores and
1689  /// instructions that may divide by zero.
1690  bool isScalarWithPredication(Instruction *I);
1691 
1692  /// Returns true if \p I is a memory instruction that has a consecutive or
1693  /// consecutive-like pointer operand. Consecutive-like pointers are pointers
1694  /// that are treated like consecutive pointers during vectorization. The
1695  /// pointer operands of interleaved accesses are an example.
1696  bool hasConsecutiveLikePtrOperand(Instruction *I);
1697 
1698  /// Returns true if \p I is a memory instruction that must be scalarized
1699  /// during vectorization.
1700  bool memoryInstructionMustBeScalarized(Instruction *I, unsigned VF = 1);
1701 
1702 private:
1703  /// Check if a single basic block loop is vectorizable.
1704  /// At this point we know that this is a loop with a constant trip count
1705  /// and we only need to check individual instructions.
1706  bool canVectorizeInstrs();
1707 
1708  /// When we vectorize loops we may change the order in which
1709  /// we read and write from memory. This method checks if it is
1710  /// legal to vectorize the code, considering only memory constraints.
1711  /// Returns true if the loop is vectorizable.
1712  bool canVectorizeMemory();
1713 
1714  /// Return true if we can vectorize this loop using the IF-conversion
1715  /// transformation.
1716  bool canVectorizeWithIfConvert();
1717 
1718  /// Collect the instructions that are uniform after vectorization. An
1719  /// instruction is uniform if we represent it with a single scalar value in
1720  /// the vectorized loop corresponding to each vector iteration. Examples of
1721  /// uniform instructions include pointer operands of consecutive or
1722  /// interleaved memory accesses. Note that although uniformity implies an
1723  /// instruction will be scalar, the reverse is not true. In general, a
1724  /// scalarized instruction will be represented by VF scalar values in the
1725  /// vectorized loop, each corresponding to an iteration of the original
1726  /// scalar loop.
1727  void collectLoopUniforms();
1728 
1729  /// Collect the instructions that are scalar after vectorization. An
1730  /// instruction is scalar if it is known to be uniform or will be scalarized
1731  /// during vectorization. Non-uniform scalarized instructions will be
1732  /// represented by VF values in the vectorized loop, each corresponding to an
1733  /// iteration of the original scalar loop.
1734  void collectLoopScalars();
1735 
1736  /// Return true if all of the instructions in the block can be speculatively
1737  /// executed. \p SafePtrs is a list of addresses that are known to be legal
1738  /// and we know that we can read from them without segfault.
1739  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
1740 
1741  /// Updates the vectorization state by adding \p Phi to the inductions list.
1742  /// This can set \p Phi as the main induction of the loop if \p Phi is a
1743  /// better choice for the main induction than the existing one.
1744  void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
1745  SmallPtrSetImpl<Value *> &AllowedExit);
1746 
1747  /// Report an analysis message to assist the user in diagnosing loops that are
1748  /// not vectorized. These are handled as LoopAccessReport rather than
1749  /// VectorizationReport because the << operator of VectorizationReport returns
1750  /// LoopAccessReport.
1751  void emitAnalysis(const LoopAccessReport &Message) const {
1752  emitAnalysisDiag(TheLoop, *Hints, *ORE, Message);
1753  }
1754 
1755  /// Create an analysis remark that explains why vectorization failed
1756  ///
1757  /// \p RemarkName is the identifier for the remark. If \p I is passed it is
1758  /// an instruction that prevents vectorization. Otherwise the loop is used
1759  /// for the location of the remark. \return the remark object that can be
1760  /// streamed to.
1761  OptimizationRemarkAnalysis
1762  createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const {
1763  return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
1764  RemarkName, TheLoop, I);
1765  }
1766 
1767  /// \brief If an access has a symbolic stride, this maps the pointer value to
1768  /// the stride symbol.
1769  const ValueToValueMap *getSymbolicStrides() {
1770  // FIXME: Currently, the set of symbolic strides is sometimes queried before
1771  // it's collected. This happens from canVectorizeWithIfConvert, when the
1772  // pointer is checked to reference consecutive elements suitable for a
1773  // masked access.
1774  return LAI ? &LAI->getSymbolicStrides() : nullptr;
1775  }
1776 
1777  unsigned NumPredStores;
1778 
1779  /// The loop that we evaluate.
1780  Loop *TheLoop;
1781  /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
1782  /// Applies dynamic knowledge to simplify SCEV expressions in the context
1783  /// of existing SCEV assumptions. The analysis will also add a minimal set
1784  /// of new predicates if this is required to enable vectorization and
1785  /// unrolling.
1786  PredicatedScalarEvolution &PSE;
1787  /// Target Library Info.
1788  TargetLibraryInfo *TLI;
1789  /// Target Transform Info
1790  const TargetTransformInfo *TTI;
1791  /// Dominator Tree.
1792  DominatorTree *DT;
1793  // LoopAccess analysis.
1794  std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
1795  // And the loop-accesses info corresponding to this loop. This pointer is
1796  // null until canVectorizeMemory sets it up.
1797  const LoopAccessInfo *LAI;
1798  /// Interface to emit optimization remarks.
1799  OptimizationRemarkEmitter *ORE;
1800 
1801  /// The interleave access information contains groups of interleaved accesses
1802  /// with the same stride and close to each other.
1803  InterleavedAccessInfo InterleaveInfo;
1804 
1805  // --- vectorization state --- //
1806 
1807  /// Holds the integer induction variable. This is the counter of the
1808  /// loop.
1809  PHINode *Induction;
1810  /// Holds the reduction variables.
1811  ReductionList Reductions;
1812  /// Holds all of the induction variables that we found in the loop.
1813  /// Notice that inductions don't need to start at zero and that induction
1814  /// variables can be pointers.
1815  InductionList Inductions;
1816  /// Holds the phi nodes that are first-order recurrences.
1817  RecurrenceSet FirstOrderRecurrences;
1818  /// Holds the widest induction type encountered.
1819  Type *WidestIndTy;
1820 
1821  /// Allowed outside users. This holds the induction and reduction
1822  /// vars which can be accessed from outside the loop.
1823  SmallPtrSet<Value *, 4> AllowedExit;
1824 
1825  /// Holds the instructions known to be uniform after vectorization.
1826  SmallPtrSet<Instruction *, 4> Uniforms;
1827 
1828  /// Holds the instructions known to be scalar after vectorization.
1829  SmallPtrSet<Instruction *, 4> Scalars;
1830 
1831  /// Can we assume the absence of NaNs.
1832  bool HasFunNoNaNAttr;
1833 
1834  /// Vectorization requirements that will go through late-evaluation.
1835  LoopVectorizationRequirements *Requirements;
1836 
1837  /// Used to emit an analysis of any legality issues.
1838  LoopVectorizeHints *Hints;
1839 
1840  /// While vectorizing these instructions we have to generate a
1841  /// call to the appropriate masked intrinsic.
1842  SmallPtrSet<const Instruction *, 8> MaskedOp;
1843 };
1844 
1845 /// LoopVectorizationCostModel - estimates the expected speedups due to
1846 /// vectorization.
1847 /// In many cases vectorization is not profitable. This can happen because of
1848 /// a number of reasons. In this class we mainly attempt to predict the
1849 /// expected speedup/slowdowns due to the supported instruction set. We use the
1850 /// TargetTransformInfo to query the different backends for the cost of
1851 /// different operations.
1852 class LoopVectorizationCostModel {
1853 public:
1854  LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
1855  LoopInfo *LI, LoopVectorizationLegality *Legal,
1856  const TargetTransformInfo &TTI,
1857  const TargetLibraryInfo *TLI, DemandedBits *DB,
1858  AssumptionCache *AC,
1859  OptimizationRemarkEmitter *ORE, const Function *F,
1860  const LoopVectorizeHints *Hints)
1861  : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
1862  AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}
1863 
1864  /// Information about vectorization costs
1865  struct VectorizationFactor {
1866  unsigned Width; // Vector width with best cost
1867  unsigned Cost; // Cost of the loop with that width
1868  };
1869  /// \return The most profitable vectorization factor and the cost of that VF.
1870  /// This method checks every power of two up to VF. If UserVF is not ZERO
1871  /// then this vectorization factor will be selected if vectorization is
1872  /// possible.
1873  VectorizationFactor selectVectorizationFactor(bool OptForSize);
1874 
1875  /// \return The size (in bits) of the smallest and widest types in the code
1876  /// that needs to be vectorized. We ignore values that remain scalar such as
1877  /// 64 bit loop indices.
1878  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1879 
1880  /// \return The desired interleave count.
1881  /// If interleave count has been specified by metadata it will be returned.
1882  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1883  /// are the selected vectorization factor and the cost of the selected VF.
1884  unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
1885  unsigned LoopCost);
1886 
1887  /// \brief A struct that represents some properties of the register usage
1888  /// of a loop.
1889  struct RegisterUsage {
1890  /// Holds the number of loop invariant values that are used in the loop.
1891  unsigned LoopInvariantRegs;
1892  /// Holds the maximum number of concurrent live intervals in the loop.
1893  unsigned MaxLocalUsers;
1894  /// Holds the number of instructions in the loop.
1895  unsigned NumInstructions;
1896  };
1897 
1898  /// \return Returns information about the register usages of the loop for the
1899  /// given vectorization factors.
1900  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1901 
1902  /// Collect values we want to ignore in the cost model.
1903  void collectValuesToIgnore();
1904 
1905  /// \returns The smallest bitwidth each instruction can be represented with.
1906  /// The vector equivalents of these instructions should be truncated to this
1907  /// type.
1908  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1909  return MinBWs;
1910  }
1911 
1912  /// \returns True if it is more profitable to scalarize instruction \p I for
1913  /// vectorization factor \p VF.
1914  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1915  auto Scalars = InstsToScalarize.find(VF);
1916  assert(Scalars != InstsToScalarize.end() &&
1917  "VF not yet analyzed for scalarization profitability");
1918  return Scalars->second.count(I);
1919  }
1920 
1921  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1922  /// for vectorization factor \p VF.
1923  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1924  return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) &&
1925  !Legal->isScalarAfterVectorization(I);
1926  }
1927 
1928 private:
1929  /// The vectorization cost is a combination of the cost itself and a boolean
1930  /// indicating whether any of the contributing operations will actually
1931  /// operate on vector values after type legalization in the backend. If this
1932  /// latter value is false, then all operations will be scalarized (i.e. no
1933  /// vectorization has actually taken place).
1936  typedef std::pair<unsigned, bool> VectorizationCostTy;
1937 
1938  /// Returns the expected execution cost. The unit of the cost does
1939  /// not matter because we use the 'cost' units to compare different
1940  /// vector widths. The cost that is returned is *not* normalized by
1941  /// the factor width.
1942  VectorizationCostTy expectedCost(unsigned VF);
1943 
1944  /// Returns the execution time cost of an instruction for a given vector
1945  /// width. Vector width of one means scalar.
1946  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1947 
1948  /// The cost-computation logic from getInstructionCost which provides
1949  /// the vector type as an output parameter.
1950  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1951 
1952  /// Returns whether the instruction is a load or store and will be emitted
1953  /// as a vector operation.
1954  bool isConsecutiveLoadOrStore(Instruction *I);
1955 
1956  /// Create an analysis remark that explains why vectorization failed
1957  ///
1958  /// \p RemarkName is the identifier for the remark. \return the remark object
1959  /// that can be streamed to.
1960  OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
1961  return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
1962  RemarkName, TheLoop);
1963  }
1964 
1965  /// Map of scalar integer values to the smallest bitwidth they can be legally
1966  /// represented as. The vector equivalents of these values should be truncated
1967  /// to this type.
1968  MapVector<Instruction *, uint64_t> MinBWs;
1969 
1970  /// A type representing the costs for instructions if they were to be
1971  /// scalarized rather than vectorized. The entries are Instruction-Cost
1972  /// pairs.
1973  typedef DenseMap<Instruction *, unsigned> ScalarCostsTy;
1974 
1975  /// A map holding scalar costs for different vectorization factors. The
1976  /// presence of a cost for an instruction in the mapping indicates that the
1977  /// instruction will be scalarized when vectorizing with the associated
1978  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1979  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1980 
1981  /// Returns the expected difference in cost from scalarizing the expression
1982  /// feeding a predicated instruction \p PredInst. The instructions to
1983  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1984  /// non-negative return value implies the expression will be scalarized.
1985  /// Currently, only single-use chains are considered for scalarization.
1986  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1987  unsigned VF);
1988 
1989  /// Collects the instructions to scalarize for each predicated instruction in
1990  /// the loop.
1991  void collectInstsToScalarize(unsigned VF);
1992 
1993 public:
1994  /// The loop that we evaluate.
1995  Loop *TheLoop;
1996  /// Predicated scalar evolution analysis.
1997  PredicatedScalarEvolution &PSE;
1998  /// Loop Info analysis.
1999  LoopInfo *LI;
2000  /// Vectorization legality.
2001  LoopVectorizationLegality *Legal;
2002  /// Vector target information.
2003  const TargetTransformInfo &TTI;
2004  /// Target Library Info.
2005  const TargetLibraryInfo *TLI;
2006  /// Demanded bits analysis.
2007  DemandedBits *DB;
2008  /// Assumption cache.
2009  AssumptionCache *AC;
2010  /// Interface to emit optimization remarks.
2011  OptimizationRemarkEmitter *ORE;
2012 
2013  const Function *TheFunction;
2014  /// Loop Vectorize Hint.
2015  const LoopVectorizeHints *Hints;
2016  /// Values to ignore in the cost model.
2017  SmallPtrSet<const Value *, 16> ValuesToIgnore;
2018  /// Values to ignore in the cost model when VF > 1.
2019  SmallPtrSet<const Value *, 16> VecValuesToIgnore;
2020 };
2021 
2022 /// \brief This holds vectorization requirements that must be verified late in
2023 /// the process. The requirements are set by legalize and costmodel. Once
2024 /// vectorization has been determined to be possible and profitable the
2025 /// requirements can be verified by looking for metadata or compiler options.
2026 /// For example, some loops require FP commutativity which is only allowed if
2027 /// vectorization is explicitly specified or if the fast-math compiler option
2028 /// has been provided.
2029 /// Late evaluation of these requirements allows helpful diagnostics to be
2030 /// composed that tell the user what needs to be done to vectorize the loop, for
2031 /// example, by specifying #pragma clang loop vectorize or -ffast-math. Late
2032 /// evaluation should be used only when diagnostics can be generated that can be
2033 /// followed by a non-expert user.
2034 class LoopVectorizationRequirements {
2035 public:
2036  LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE)
2037  : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr), ORE(ORE) {}
2038 
2039  void addUnsafeAlgebraInst(Instruction *I) {
2040  // First unsafe algebra instruction.
2041  if (!UnsafeAlgebraInst)
2042  UnsafeAlgebraInst = I;
2043  }
2044 
2045  void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
2046 
2047  bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
2048  const char *PassName = Hints.vectorizeAnalysisPassName();
2049  bool Failed = false;
2050  if (UnsafeAlgebraInst && !Hints.allowReordering()) {
2051  ORE.emit(
2052  OptimizationRemarkAnalysisFPCommute(PassName, "CantReorderFPOps",
2053  UnsafeAlgebraInst->getDebugLoc(),
2054  UnsafeAlgebraInst->getParent())
2055  << "loop not vectorized: cannot prove it is safe to reorder "
2056  "floating-point operations");
2057  Failed = true;
2058  }
2059 
2060  // Test if runtime memcheck thresholds are exceeded.
2061  bool PragmaThresholdReached =
2062  NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
2063  bool ThresholdReached =
2064  NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
2065  if ((ThresholdReached && !Hints.allowReordering()) ||
2066  PragmaThresholdReached) {
2067  ORE.emit(OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
2068  L->getStartLoc(),
2069  L->getHeader())
2070  << "loop not vectorized: cannot prove it is safe to reorder "
2071  "memory operations");
2072  DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
2073  Failed = true;
2074  }
2075 
2076  return Failed;
2077  }
2078 
2079 private:
2080  unsigned NumRuntimePointerChecks;
2081  Instruction *UnsafeAlgebraInst;
2082 
2083  /// Interface to emit optimization remarks.
2084  OptimizationRemarkEmitter &ORE;
2085 };
2086 
2087 static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
2088  if (L.empty()) {
2089  if (!hasCyclesInLoopBody(L))
2090  V.push_back(&L);
2091  return;
2092  }
2093  for (Loop *InnerL : L)
2094  addAcyclicInnerLoop(*InnerL, V);
2095 }
2096 
2097 /// The LoopVectorize Pass.
2098 struct LoopVectorize : public FunctionPass {
2099  /// Pass identification, replacement for typeid
2100  static char ID;
2101 
2102  explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
2103  : FunctionPass(ID) {
2104  Impl.DisableUnrolling = NoUnrolling;
2105  Impl.AlwaysVectorize = AlwaysVectorize;
2106  initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2107  }
2108 
2109  LoopVectorizePass Impl;
2110 
2111  bool runOnFunction(Function &F) override {
2112  if (skipFunction(F))
2113  return false;
2114 
2115  auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2116  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2117  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2118  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2119  auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2120  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2121  auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
2122  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2123  auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2124  auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2125  auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2126  auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2127 
2128  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2129  [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2130 
2131  return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2132  GetLAA, *ORE);
2133  }
2134 
2135  void getAnalysisUsage(AnalysisUsage &AU) const override {
2136  AU.addRequired<AssumptionCacheTracker>();
2137  AU.addRequiredID(LoopSimplifyID);
2138  AU.addRequiredID(LCSSAID);
2139  AU.addRequired<BlockFrequencyInfoWrapperPass>();
2140  AU.addRequired<DominatorTreeWrapperPass>();
2141  AU.addRequired<LoopInfoWrapperPass>();
2142  AU.addRequired<ScalarEvolutionWrapperPass>();
2143  AU.addRequired<TargetTransformInfoWrapperPass>();
2144  AU.addRequired<AAResultsWrapperPass>();
2145  AU.addRequired<LoopAccessLegacyAnalysis>();
2146  AU.addRequired<DemandedBitsWrapperPass>();
2147  AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2148  AU.addPreserved<LoopInfoWrapperPass>();
2149  AU.addPreserved<DominatorTreeWrapperPass>();
2150  AU.addPreserved<BasicAAWrapperPass>();
2151  AU.addPreserved<GlobalsAAWrapperPass>();
2152  }
2153 };
2154 
2155 } // end anonymous namespace
2156 
2157 //===----------------------------------------------------------------------===//
2158 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2159 // LoopVectorizationCostModel.
2160 //===----------------------------------------------------------------------===//
2161 
2162 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2163  // We need to place the broadcast of invariant variables outside the loop.
2164  Instruction *Instr = dyn_cast<Instruction>(V);
2165  bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
2166  bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
2167 
2168  // Place the code for broadcasting invariant variables in the new preheader.
2169  IRBuilder<>::InsertPointGuard Guard(Builder);
2170  if (Invariant)
2171  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2172 
2173  // Broadcast the scalar into all locations in the vector.
2174  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2175 
2176  return Shuf;
2177 }
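// Illustrative IR for VF = 4 (value names assumed): broadcasting an i32 %x
// with CreateVectorSplat produces
//   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> undef, <4 x i32> zeroinitializer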
2178 
2179 void InnerLoopVectorizer::createVectorIntInductionPHI(
2180  const InductionDescriptor &II, Instruction *EntryVal) {
2181  Value *Start = II.getStartValue();
2182  ConstantInt *Step = II.getConstIntStepValue();
2183  assert(Step && "Can not widen an IV with a non-constant step");
2184 
2185  // Construct the initial value of the vector IV in the vector loop preheader
2186  auto CurrIP = Builder.saveIP();
2187  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2188  if (isa<TruncInst>(EntryVal)) {
2189  auto *TruncType = cast<IntegerType>(EntryVal->getType());
2190  Step = ConstantInt::getSigned(TruncType, Step->getSExtValue());
2191  Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2192  }
2193  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2194  Value *SteppedStart = getStepVector(SplatStart, 0, Step);
2195  Builder.restoreIP(CurrIP);
2196 
2197  Value *SplatVF =
2198  ConstantVector::getSplat(VF, ConstantInt::getSigned(Start->getType(),
2199  VF * Step->getSExtValue()));
2200  // We may need to add the step a number of times, depending on the unroll
2201  // factor. The last of those goes into the PHI.
2202  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2203  &*LoopVectorBody->getFirstInsertionPt());
2204  Instruction *LastInduction = VecInd;
2205  VectorParts Entry(UF);
2206  for (unsigned Part = 0; Part < UF; ++Part) {
2207  Entry[Part] = LastInduction;
2208  LastInduction = cast<Instruction>(
2209  Builder.CreateAdd(LastInduction, SplatVF, "step.add"));
2210  }
2211  VectorLoopValueMap.initVector(EntryVal, Entry);
2212  if (isa<TruncInst>(EntryVal))
2213  addMetadata(Entry, EntryVal);
2214 
2215  // Move the last step to the end of the latch block. This ensures consistent
2216  // placement of all induction updates.
2217  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2218  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2219  auto *ICmp = cast<Instruction>(Br->getCondition());
2220  LastInduction->moveBefore(ICmp);
2221  LastInduction->setName("vec.ind.next");
2222 
2223  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2224  VecInd->addIncoming(LastInduction, LoopVectorLatch);
2225 }
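// Illustrative shape for VF = 4, UF = 2 and a constant step of 1 (block and
// value names assumed):
//   %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
//                                 [ %vec.ind.next, %vector.latch ]
//   %step.add     = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
//   %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>
// The two unroll parts use %vec.ind and %step.add; %vec.ind.next feeds the phi.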
2226 
2227 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2228  return Legal->isScalarAfterVectorization(I) ||
2229  Cost->isProfitableToScalarize(I, VF);
2230 }
2231 
2232 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2233  if (shouldScalarizeInstruction(IV))
2234  return true;
2235  auto isScalarInst = [&](User *U) -> bool {
2236  auto *I = cast<Instruction>(U);
2237  return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2238  };
2239  return any_of(IV->users(), isScalarInst);
2240 }
2241 
2242 void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
2243 
2244  auto II = Legal->getInductionVars()->find(IV);
2245  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
2246 
2247  auto ID = II->second;
2248  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2249 
2250  // The scalar value to broadcast. This will be derived from the canonical
2251  // induction variable.
2252  Value *ScalarIV = nullptr;
2253 
2254  // The step of the induction.
2255  Value *Step = nullptr;
2256 
2257  // The value from the original loop to which we are mapping the new induction
2258  // variable.
2259  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2260 
2261  // True if we have vectorized the induction variable.
2262  auto VectorizedIV = false;
2263 
2264  // Determine if we want a scalar version of the induction variable. This is
2265  // true if the induction variable itself is not widened, or if it has at
2266  // least one user in the loop that is not widened.
2267  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
2268 
2269  // If the induction variable has a constant integer step value, go ahead and
2270  // get it now.
2271  if (ID.getConstIntStepValue())
2272  Step = ID.getConstIntStepValue();
2273 
2274  // Try to create a new independent vector induction variable. If we can't
2275  // create the phi node, we will splat the scalar induction variable in each
2276  // loop iteration.
2277  if (VF > 1 && IV->getType() == Induction->getType() && Step &&
2278  !shouldScalarizeInstruction(EntryVal)) {
2279  createVectorIntInductionPHI(ID, EntryVal);
2280  VectorizedIV = true;
2281  }
2282 
2283  // If we haven't yet vectorized the induction variable, or if we will create
2284  // a scalar one, we need to define the scalar induction variable and step
2285  // values. If we were given a truncation type, truncate the canonical
2286  // induction variable and constant step. Otherwise, derive these values from
2287  // the induction descriptor.
2288  if (!VectorizedIV || NeedsScalarIV) {
2289  if (Trunc) {
2290  auto *TruncType = cast<IntegerType>(Trunc->getType());
2291  assert(Step && "Truncation requires constant integer step");
2292  auto StepInt = cast<ConstantInt>(Step)->getSExtValue();
2293  ScalarIV = Builder.CreateCast(Instruction::Trunc, Induction, TruncType);
2294  Step = ConstantInt::getSigned(TruncType, StepInt);
2295  } else {
2296  ScalarIV = Induction;
2297  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2298  if (IV != OldInduction) {
2299  ScalarIV = Builder.CreateSExtOrTrunc(ScalarIV, IV->getType());
2300  ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
2301  ScalarIV->setName("offset.idx");
2302  }
2303  if (!Step) {
2304  SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2305  Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
2306  &*Builder.GetInsertPoint());
2307  }
2308  }
2309  }
2310 
2311  // If we haven't yet vectorized the induction variable, splat the scalar
2312  // induction variable, and build the necessary step vectors.
2313  if (!VectorizedIV) {
2314  Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2315  VectorParts Entry(UF);
2316  for (unsigned Part = 0; Part < UF; ++Part)
2317  Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
2318  VectorLoopValueMap.initVector(EntryVal, Entry);
2319  if (Trunc)
2320  addMetadata(Entry, Trunc);
2321  }
2322 
2323  // If an induction variable is only used for counting loop iterations or
2324  // calculating addresses, it doesn't need to be widened. Create scalar steps
2325  // that can be used by instructions we will later scalarize. Note that the
2326  // addition of the scalar steps will not increase the number of instructions
2327  // in the loop in the common case prior to InstCombine. We will be trading
2328  // one vector extract for each scalar step.
2329  if (NeedsScalarIV)
2330  buildScalarSteps(ScalarIV, Step, EntryVal);
2331 }
2332 
2333 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2334  Instruction::BinaryOps BinOp) {
2335  // Create and check the types.
2336  assert(Val->getType()->isVectorTy() && "Must be a vector");
2337  int VLen = Val->getType()->getVectorNumElements();
2338 
2339  Type *STy = Val->getType()->getScalarType();
2340  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2341  "Induction Step must be an integer or FP");
2342  assert(Step->getType() == STy && "Step has wrong type");
2343 
2344  SmallVector<Constant *, 8> Indices;
2345 
2346  if (STy->isIntegerTy()) {
2347  // Create a vector of consecutive numbers from zero to VF.
2348  for (int i = 0; i < VLen; ++i)
2349  Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2350 
2351  // Add the consecutive indices to the vector value.
2352  Constant *Cv = ConstantVector::get(Indices);
2353  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2354  Step = Builder.CreateVectorSplat(VLen, Step);
2355  assert(Step->getType() == Val->getType() && "Invalid step vec");
2356  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2357  // which can be found from the original scalar operations.
2358  Step = Builder.CreateMul(Cv, Step);
2359  return Builder.CreateAdd(Val, Step, "induction");
2360  }
2361 
2362  // Floating point induction.
2363  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2364  "Binary Opcode should be specified for FP induction");
2365  // Create a vector of consecutive numbers from zero to VF.
2366  for (int i = 0; i < VLen; ++i)
2367  Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2368 
2369  // Add the consecutive indices to the vector value.
2370  Constant *Cv = ConstantVector::get(Indices);
2371 
2372  Step = Builder.CreateVectorSplat(VLen, Step);
2373 
2374  // Floating point operations had to be 'fast' to enable the induction.
2375  FastMathFlags Flags;
2376  Flags.setUnsafeAlgebra();
2377 
2378  Value *MulOp = Builder.CreateFMul(Cv, Step);
2379  if (isa<Instruction>(MulOp))
2380  // Have to check, MulOp may be a constant
2381  cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2382 
2383  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2384  if (isa<Instruction>(BOp))
2385  cast<Instruction>(BOp)->setFastMathFlags(Flags);
2386  return BOp;
2387 }
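// Illustrative integer case for VLen = 4, StartIdx = 0, Step = 1:
//   Cv   = <i32 0, i32 1, i32 2, i32 3>
//   Step = <i32 1, i32 1, i32 1, i32 1>   ; splat of the scalar step
//   result = Val + Cv * Step, i.e. lane i of Val advanced by i steps.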
2388 
2389 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2390  Value *EntryVal) {
2391 
2392  // We shouldn't have to build scalar steps if we aren't vectorizing.
2393  assert(VF > 1 && "VF should be greater than one");
2394 
2395  // Get the value type and ensure it and the step have the same integer type.
2396  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2397  assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() &&
2398  "Val and Step should have the same integer type");
2399 
2400  // Determine the number of scalars we need to generate for each unroll
2401  // iteration. If EntryVal is uniform, we only need to generate the first
2402  // lane. Otherwise, we generate all VF values.
2403  unsigned Lanes =
2404  Legal->isUniformAfterVectorization(cast<Instruction>(EntryVal)) ? 1 : VF;
2405 
2406  // Compute the scalar steps and save the results in VectorLoopValueMap.
2407  ScalarParts Entry(UF);
2408  for (unsigned Part = 0; Part < UF; ++Part) {
2409  Entry[Part].resize(VF);
2410  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2411  auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + Lane);
2412  auto *Mul = Builder.CreateMul(StartIdx, Step);
2413  auto *Add = Builder.CreateAdd(ScalarIV, Mul);
2414  Entry[Part][Lane] = Add;
2415  }
2416  }
2417  VectorLoopValueMap.initScalar(EntryVal, Entry);
2418 }
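// For illustration, with UF = 2, VF = 4, a unit step and a non-uniform
// EntryVal, the scalar steps generated are ScalarIV + {0, 1, 2, 3} for unroll
// part 0 and ScalarIV + {4, 5, 6, 7} for part 1 (one mul/add pair per lane).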
2419 
2420 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
2421 
2422  const ValueToValueMap &Strides = getSymbolicStrides() ? *getSymbolicStrides() :
2423  ValueToValueMap();
2424 
2425  int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false);
2426  if (Stride == 1 || Stride == -1)
2427  return Stride;
2428  return 0;
2429 }
2430 
2431 bool LoopVectorizationLegality::isUniform(Value *V) {
2432  return LAI->isUniform(V);
2433 }
2434 
2435 const InnerLoopVectorizer::VectorParts &
2436 InnerLoopVectorizer::getVectorValue(Value *V) {
2437  assert(V != Induction && "The new induction variable should not be used.");
2438  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2439  assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2440 
2441  // If we have a stride that is replaced by one, do it here.
2442  if (Legal->hasStride(V))
2443  V = ConstantInt::get(V->getType(), 1);
2444 
2445  // If we have this scalar in the map, return it.
2446  if (VectorLoopValueMap.hasVector(V))
2447  return VectorLoopValueMap.VectorMapStorage[V];
2448 
2449  // If the value has not been vectorized, check if it has been scalarized
2450  // instead. If it has been scalarized, and we actually need the value in
2451  // vector form, we will construct the vector values on demand.
2452  if (VectorLoopValueMap.hasScalar(V)) {
2453 
2454  // Initialize a new vector map entry.
2455  VectorParts Entry(UF);
2456 
2457  // If we've scalarized a value, that value should be an instruction.
2458  auto *I = cast<Instruction>(V);
2459 
2460  // If we aren't vectorizing, we can just copy the scalar map values over to
2461  // the vector map.
2462  if (VF == 1) {
2463  for (unsigned Part = 0; Part < UF; ++Part)
2464  Entry[Part] = getScalarValue(V, Part, 0);
2465  return VectorLoopValueMap.initVector(V, Entry);
2466  }
2467 
2468  // Get the last scalar instruction we generated for V. If the value is
2469  // known to be uniform after vectorization, this corresponds to lane zero
2470  // of the last unroll iteration. Otherwise, the last instruction is the one
2471  // we created for the last vector lane of the last unroll iteration.
2472  unsigned LastLane = Legal->isUniformAfterVectorization(I) ? 0 : VF - 1;
2473  auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, LastLane));
2474 
2475  // Set the insert point after the last scalarized instruction. This ensures
2476  // the insertelement sequence will directly follow the scalar definitions.
2477  auto OldIP = Builder.saveIP();
2478  auto NewIP = std::next(BasicBlock::iterator(LastInst));
2479  Builder.SetInsertPoint(&*NewIP);
2480 
2481  // However, if we are vectorizing, we need to construct the vector values.
2482  // If the value is known to be uniform after vectorization, we can just
2483  // broadcast the scalar value corresponding to lane zero for each unroll
2484  // iteration. Otherwise, we construct the vector values using insertelement
2485  // instructions. Since the resulting vectors are stored in
2486  // VectorLoopValueMap, we will only generate the insertelements once.
2487  for (unsigned Part = 0; Part < UF; ++Part) {
2488  Value *VectorValue = nullptr;
2489  if (Legal->isUniformAfterVectorization(I)) {
2490  VectorValue = getBroadcastInstrs(getScalarValue(V, Part, 0));
2491  } else {
2492  VectorValue = UndefValue::get(VectorType::get(V->getType(), VF));
2493  for (unsigned Lane = 0; Lane < VF; ++Lane)
2494  VectorValue = Builder.CreateInsertElement(
2495  VectorValue, getScalarValue(V, Part, Lane),
2496  Builder.getInt32(Lane));
2497  }
2498  Entry[Part] = VectorValue;
2499  }
2500  Builder.restoreIP(OldIP);
2501  return VectorLoopValueMap.initVector(V, Entry);
2502  }
2503 
2504  // If this scalar is unknown, assume that it is a constant or that it is
2505  // loop invariant. Broadcast V and save the value for future uses.
2506  Value *B = getBroadcastInstrs(V);
2507  return VectorLoopValueMap.initVector(V, VectorParts(UF, B));
2508 }
2509 
2510 Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part,
2511  unsigned Lane) {
2512 
2513  // If the value is not an instruction contained in the loop, it should
2514  // already be scalar.
2515  if (OrigLoop->isLoopInvariant(V))
2516  return V;
2517 
2518  assert(Lane > 0 ? !Legal->isUniformAfterVectorization(cast<Instruction>(V))
2519  : true && "Uniform values only have lane zero");
2520 
2521  // If the value from the original loop has not been vectorized, it is
2522  // represented by UF x VF scalar values in the new loop. Return the requested
2523  // scalar value.
2524  if (VectorLoopValueMap.hasScalar(V))
2525  return VectorLoopValueMap.ScalarMapStorage[V][Part][Lane];
2526 
2527  // If the value has not been scalarized, get its entry in VectorLoopValueMap
2528  // for the given unroll part. If this entry is not a vector type (i.e., the
2529  // vectorization factor is one), there is no need to generate an
2530  // extractelement instruction.
2531  auto *U = getVectorValue(V)[Part];
2532  if (!U->getType()->isVectorTy()) {
2533  assert(VF == 1 && "Value not scalarized has non-vector type");
2534  return U;
2535  }
2536 
2537  // Otherwise, the value from the original loop has been vectorized and is
2538  // represented by UF vector values. Extract and return the requested scalar
2539  // value from the appropriate vector lane.
2540  return Builder.CreateExtractElement(U, Builder.getInt32(Lane));
2541 }
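// For illustration (value name assumed): with VF = 4 and a vectorized V,
// getScalarValue(V, 1, 2) emits "extractelement <4 x i32> %v.part1, i32 2",
// whereas with VF = 1 it simply returns the scalar value for unroll part 1.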
2542 
2543 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2544  assert(Vec->getType()->isVectorTy() && "Invalid type");
2545  SmallVector<Constant *, 8> ShuffleMask;
2546  for (unsigned i = 0; i < VF; ++i)
2547  ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2548 
2549  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2550  ConstantVector::get(ShuffleMask),
2551  "reverse");
2552 }
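// For VF = 4 the shuffle mask built above is <3, 2, 1, 0>, so the resulting
// "reverse" shuffle returns the lanes of Vec in reverse order.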
2553 
2554 // Get a mask to interleave \p NumVec vectors into a wide vector.
2555 // I.e. <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...>
2556 // E.g. For 2 interleaved vectors, if VF is 4, the mask is:
2557 // <0, 4, 1, 5, 2, 6, 3, 7>
2558 static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF,
2559  unsigned NumVec) {
2560  SmallVector<Constant *, 16> Mask;
2561  for (unsigned i = 0; i < VF; i++)
2562  for (unsigned j = 0; j < NumVec; j++)
2563  Mask.push_back(Builder.getInt32(j * VF + i));
2564 
2565  return ConstantVector::get(Mask);
2566 }
2567 
2568 // Get the strided mask starting from index \p Start.
2569 // I.e. <Start, Start + Stride, ..., Start + Stride*(VF-1)>
2570 static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start,
2571  unsigned Stride, unsigned VF) {
2572  SmallVector<Constant *, 16> Mask;
2573  for (unsigned i = 0; i < VF; i++)
2574  Mask.push_back(Builder.getInt32(Start + i * Stride));
2575 
2576  return ConstantVector::get(Mask);
2577 }
2578 
2579 // Get a mask of two parts: The first part consists of sequential integers
2580 // starting from 0, The second part consists of UNDEFs.
2581 // I.e. <0, 1, 2, ..., NumInt - 1, undef, ..., undef>
2582 static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt,
2583  unsigned NumUndef) {
2584  SmallVector<Constant *, 16> Mask;
2585  for (unsigned i = 0; i < NumInt; i++)
2586  Mask.push_back(Builder.getInt32(i));
2587 
2588  Constant *Undef = UndefValue::get(Builder.getInt32Ty());
2589  for (unsigned i = 0; i < NumUndef; i++)
2590  Mask.push_back(Undef);
2591 
2592  return ConstantVector::get(Mask);
2593 }
2594 
2595 // Concatenate two vectors with the same element type. The 2nd vector should
2596 // not have more elements than the 1st vector. If the 2nd vector has fewer
2597 // elements, extend it with UNDEFs.
2598 static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1,
2599  Value *V2) {
2600  VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType());
2601  VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType());
2602  assert(VecTy1 && VecTy2 &&
2603  VecTy1->getScalarType() == VecTy2->getScalarType() &&
2604  "Expect two vectors with the same element type");
2605 
2606  unsigned NumElts1 = VecTy1->getNumElements();
2607  unsigned NumElts2 = VecTy2->getNumElements();
2608  assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements");
2609 
2610  if (NumElts1 > NumElts2) {
2611  // Extend with UNDEFs.
2612  Constant *ExtMask =
2613  getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2);
2614  V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask);
2615  }
2616 
2617  Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0);
2618  return Builder.CreateShuffleVector(V1, V2, Mask);
2619 }
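// For illustration: concatenating a <4 x i32> V1 with a <2 x i32> V2 first
// widens V2 using the mask <0, 1, undef, undef>, then shuffles V1 and the
// widened V2 with <0, 1, 2, 3, 4, 5> to produce a <6 x i32> result.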
2620 
2621 // Concatenate vectors in the given list. All vectors have the same type.
2622 static Value *ConcatenateVectors(IRBuilder<> &Builder,
2623  ArrayRef<Value *> InputList) {
2624  unsigned NumVec = InputList.size();
2625  assert(NumVec > 1 && "Should be at least two vectors");
2626 
2627  SmallVector<Value *, 8> ResList;
2628  ResList.append(InputList.begin(), InputList.end());
2629  do {
2630  SmallVector<Value *, 8> TmpList;
2631  for (unsigned i = 0; i < NumVec - 1; i += 2) {
2632  Value *V0 = ResList[i], *V1 = ResList[i + 1];
2633  assert((V0->getType() == V1->getType() || i == NumVec - 2) &&
2634  "Only the last vector may have a different type");
2635 
2636  TmpList.push_back(ConcatenateTwoVectors(Builder, V0, V1));
2637  }
2638 
2639  // Push the last vector if the total number of vectors is odd.
2640  if (NumVec % 2 != 0)
2641  TmpList.push_back(ResList[NumVec - 1]);
2642 
2643  ResList = TmpList;
2644  NumVec = ResList.size();
2645  } while (NumVec > 1);
2646 
2647  return ResList[0];
2648 }
2649 
2650 // Try to vectorize the interleave group that \p Instr belongs to.
2651 //
2652 // E.g. Translate following interleaved load group (factor = 3):
2653 // for (i = 0; i < N; i+=3) {
2654 // R = Pic[i]; // Member of index 0
2655 // G = Pic[i+1]; // Member of index 1
2656 // B = Pic[i+2]; // Member of index 2
2657 // ... // do something to R, G, B
2658 // }
2659 // To:
2660 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2661 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
2662 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
2663 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
2664 //
2665 // Or translate following interleaved store group (factor = 3):
2666 // for (i = 0; i < N; i+=3) {
2667 // ... do something to R, G, B
2668 // Pic[i] = R; // Member of index 0
2669 // Pic[i+1] = G; // Member of index 1
2670 // Pic[i+2] = B; // Member of index 2
2671 // }
2672 // To:
2673 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2674 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2675 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2676 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2677 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2678 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
2679  const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
2680  assert(Group && "Fail to get an interleaved access group.");
2681 
2682  // Skip if current instruction is not the insert position.
2683  if (Instr != Group->getInsertPos())
2684  return;
2685 
2686  LoadInst *LI = dyn_cast<LoadInst>(Instr);
2687  StoreInst *SI = dyn_cast<StoreInst>(Instr);
2688  Value *Ptr = getPointerOperand(Instr);
2689 
2690  // Prepare for the vector type of the interleaved load/store.
2691  Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
2692  unsigned InterleaveFactor = Group->getFactor();
2693  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2694  Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());
2695 
2696  // Prepare for the new pointers.
2697  setDebugLocFromInst(Builder, Ptr);
2698  SmallVector<Value *, 2> NewPtrs;
2699  unsigned Index = Group->getIndex(Instr);
2700 
2701  // If the group is reverse, adjust the index to refer to the last vector lane
2702  // instead of the first. We adjust the index from the first vector lane,
2703  // rather than directly getting the pointer for lane VF - 1, because the
2704  // pointer operand of the interleaved access is supposed to be uniform. For
2705  // uniform instructions, we're only required to generate a value for the
2706  // first vector lane in each unroll iteration.
2707  if (Group->isReverse())
2708  Index += (VF - 1) * Group->getFactor();
2709 
2710  for (unsigned Part = 0; Part < UF; Part++) {
2711  Value *NewPtr = getScalarValue(Ptr, Part, 0);
2712 
2713  // Notice that the current instruction could be at any index in the group.
2714  // We need to adjust the address to the member of index 0.
2715  //
2716  // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2717  // b = A[i]; // Member of index 0
2718  // Current pointer is pointed to A[i+1], adjust it to A[i].
2719  //
2720  // E.g. A[i+1] = a; // Member of index 1
2721  // A[i] = b; // Member of index 0
2722  // A[i+2] = c; // Member of index 2 (Current instruction)
2723  // Current pointer is pointed to A[i+2], adjust it to A[i].
2724  NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
2725 
2726  // Cast to the vector pointer type.
2727  NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2728  }
2729 
2730  setDebugLocFromInst(Builder, Instr);
2731  Value *UndefVec = UndefValue::get(VecTy);
2732 
2733  // Vectorize the interleaved load group.
2734  if (LI) {
2735 
2736  // For each unroll part, create a wide load for the group.
2737  SmallVector<Value *, 2> NewLoads;
2738  for (unsigned Part = 0; Part < UF; Part++) {
2739  auto *NewLoad = Builder.CreateAlignedLoad(
2740  NewPtrs[Part], Group->getAlignment(), "wide.vec");
2741  addMetadata(NewLoad, Instr);
2742  NewLoads.push_back(NewLoad);
2743  }
2744 
2745  // For each member in the group, shuffle out the appropriate data from the
2746  // wide loads.
2747  for (unsigned I = 0; I < InterleaveFactor; ++I) {
2748  Instruction *Member = Group->getMember(I);
2749 
2750  // Skip the gaps in the group.
2751  if (!Member)
2752  continue;
2753 
2754  VectorParts Entry(UF);
2755  Constant *StrideMask = getStridedMask(Builder, I, InterleaveFactor, VF);
2756  for (unsigned Part = 0; Part < UF; Part++) {
2757  Value *StridedVec = Builder.CreateShuffleVector(
2758  NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2759 
2760  // If this member has a different type, cast the result to the member's type.
2761  if (Member->getType() != ScalarTy) {
2762  VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2763  StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);
2764  }
2765 
2766  Entry[Part] =
2767  Group->isReverse() ? reverseVector(StridedVec) : StridedVec;
2768  }
2769  VectorLoopValueMap.initVector(Member, Entry);
2770  }
2771  return;
2772  }
2773 
2774  // The subvector type for the current instruction.
2775  VectorType *SubVT = VectorType::get(ScalarTy, VF);
2776 
2777  // Vectorize the interleaved store group.
2778  for (unsigned Part = 0; Part < UF; Part++) {
2779  // Collect the stored vector from each member.
2780  SmallVector<Value *, 4> StoredVecs;
2781  for (unsigned i = 0; i < InterleaveFactor; i++) {
2782  // An interleaved store group doesn't allow gaps, so each index has a member.
2783  Instruction *Member = Group->getMember(i);
2784  assert(Member && "Fail to get a member from an interleaved store group");
2785 
2786  Value *StoredVec =
2787  getVectorValue(cast<StoreInst>(Member)->getValueOperand())[Part];
2788  if (Group->isReverse())
2789  StoredVec = reverseVector(StoredVec);
2790 
2791  // If this member has a different type, cast it to a unified type.
2792  if (StoredVec->getType() != SubVT)
2793  StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT);
2794 
2795  StoredVecs.push_back(StoredVec);
2796  }
2797 
2798  // Concatenate all vectors into a wide vector.
2799  Value *WideVec = ConcatenateVectors(Builder, StoredVecs);
2800 
2801  // Interleave the elements in the wide vector.
2802  Constant *IMask = getInterleavedMask(Builder, VF, InterleaveFactor);
2803  Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2804  "interleaved.vec");
2805 
2806  Instruction *NewStoreInstr =
2807  Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
2808  addMetadata(NewStoreInstr, Instr);
2809  }
2810 }
2811 
2812 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
2813  // Attempt to issue a wide load.
2814  LoadInst *LI = dyn_cast<LoadInst>(Instr);
2815  StoreInst *SI = dyn_cast<StoreInst>(Instr);
2816 
2817  assert((LI || SI) && "Invalid Load/Store instruction");
2818 
2819  // Try to vectorize the interleave group if this access is interleaved.
2820  if (Legal->isAccessInterleaved(Instr))
2821  return vectorizeInterleaveGroup(Instr);
2822 
2823  Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
2824  Type *DataTy = VectorType::get(ScalarDataTy, VF);
2825  Value *Ptr = getPointerOperand(Instr);
2826  unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
2827  // An alignment of 0 means target ABI alignment. We need to use the scalar's
2828  // target ABI alignment in such a case.
2829  const DataLayout &DL = Instr->getModule()->getDataLayout();
2830  if (!Alignment)
2831  Alignment = DL.getABITypeAlignment(ScalarDataTy);
2832  unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2833 
2834  // Scalarize the memory instruction if necessary.
2835  if (Legal->memoryInstructionMustBeScalarized(Instr, VF))
2836  return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr));
2837 
2838  // Determine if the pointer operand of the access is either consecutive or
2839  // reverse consecutive.
2840  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
2841  bool Reverse = ConsecutiveStride < 0;
2842 
2843  // Determine if either a gather or scatter operation is legal.
2844  bool CreateGatherScatter =
2845  !ConsecutiveStride && Legal->isLegalGatherOrScatter(Instr);
2846 
2847  VectorParts VectorGep;
2848 
2849  // Handle consecutive loads/stores.
2850  GetElementPtrInst *Gep = getGEPInstruction(Ptr);
2851  if (ConsecutiveStride) {
2852  if (Gep) {
2853  unsigned NumOperands = Gep->getNumOperands();
2854 #ifndef NDEBUG
2855  // The original GEP that was identified as a consecutive memory access
2856  // should have only one loop-variant operand.
2857  unsigned NumOfLoopVariantOps = 0;
2858  for (unsigned i = 0; i < NumOperands; ++i)
2859  if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)),
2860  OrigLoop))
2861  NumOfLoopVariantOps++;
2862  assert(NumOfLoopVariantOps == 1 &&
2863  "Consecutive GEP should have only one loop-variant operand");
2864 #endif
2865  GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
2866  Gep2->setName("gep.indvar");
2867 
2868  // A new GEP is created for a 0-lane value of the first unroll iteration.
2869  // The GEPs for the rest of the unroll iterations are computed below as an
2870  // offset from this GEP.
2871  for (unsigned i = 0; i < NumOperands; ++i)
2872  // We can apply getScalarValue() for all GEP indices. It returns the
2873  // original value for a loop-invariant operand and the 0-lane value for a
2874  // consecutive operand.
2875  Gep2->setOperand(i, getScalarValue(Gep->getOperand(i),
2876  0, /* First unroll iteration */
2877  0 /* 0-lane of the vector */ ));
2878  setDebugLocFromInst(Builder, Gep);
2879  Ptr = Builder.Insert(Gep2);
2880 
2881  } else { // No GEP
2882  setDebugLocFromInst(Builder, Ptr);
2883  Ptr = getScalarValue(Ptr, 0, 0);
2884  }
2885  } else {
2886  // At this point we should have a vector version of the GEP for a gather or scatter.
2887  assert(CreateGatherScatter && "The instruction should be scalarized");
2888  if (Gep) {
2889  // Vectorizing the GEP across UF parts. We want to get a vector value for the
2890  // base and for each index that is defined inside the loop, even if it is
2891  // loop-invariant but wasn't hoisted out. Otherwise we want to keep them
2892  // scalar.
2893  SmallVector<VectorParts, 4> OpsV;
2894  for (Value *Op : Gep->operands()) {
2895  Instruction *SrcInst = dyn_cast<Instruction>(Op);
2896  if (SrcInst && OrigLoop->contains(SrcInst))
2897  OpsV.push_back(getVectorValue(Op));
2898  else
2899  OpsV.push_back(VectorParts(UF, Op));
2900  }
2901  for (unsigned Part = 0; Part < UF; ++Part) {
2902  SmallVector<Value *, 4> Ops;
2903  Value *GEPBasePtr = OpsV[0][Part];
2904  for (unsigned i = 1; i < Gep->getNumOperands(); i++)
2905  Ops.push_back(OpsV[i][Part]);
2906  Value *NewGep = Builder.CreateGEP(GEPBasePtr, Ops, "VectorGep");
2907  cast<GetElementPtrInst>(NewGep)->setIsInBounds(Gep->isInBounds());
2908  assert(NewGep->getType()->isVectorTy() && "Expected vector GEP");
2909 
2910  NewGep =
2911  Builder.CreateBitCast(NewGep, VectorType::get(Ptr->getType(), VF));
2912  VectorGep.push_back(NewGep);
2913  }
2914  } else
2915  VectorGep = getVectorValue(Ptr);
2916  }
2917 
2918  VectorParts Mask = createBlockInMask(Instr->getParent());
2919  // Handle Stores:
2920  if (SI) {
2921  assert(!Legal->isUniform(SI->getPointerOperand()) &&
2922  "We do not allow storing to uniform addresses");
2923  setDebugLocFromInst(Builder, SI);
2924  // We don't want to update the value in the map as it might be used in
2925  // another expression. So don't use a reference type for "StoredVal".
2926  VectorParts StoredVal = getVectorValue(SI->getValueOperand());
2927 
2928  for (unsigned Part = 0; Part < UF; ++Part) {
2929  Instruction *NewSI = nullptr;
2930  if (CreateGatherScatter) {
2931  Value *MaskPart = Legal->isMaskRequired(SI) ? Mask[Part] : nullptr;
2932  NewSI = Builder.CreateMaskedScatter(StoredVal[Part], VectorGep[Part],
2933  Alignment, MaskPart);
2934  } else {
2935  // Calculate the pointer for the specific unroll-part.
2936  Value *PartPtr =
2937  Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
2938 
2939  if (Reverse) {
2940  // If we store to reverse consecutive memory locations, then we need
2941  // to reverse the order of elements in the stored value.
2942  StoredVal[Part] = reverseVector(StoredVal[Part]);
2943  // If the address is consecutive but reversed, then the
2944  // wide store needs to start at the last vector element.
2945  PartPtr =
2946  Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
2947  PartPtr =
2948  Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
2949  Mask[Part] = reverseVector(Mask[Part]);
2950  }
2951 
2952  Value *VecPtr =
2953  Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2954 
2955  if (Legal->isMaskRequired(SI))
2956  NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
2957  Mask[Part]);
2958  else
2959  NewSI =
2960  Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
2961  }
2962  addMetadata(NewSI, SI);
2963  }
2964  return;
2965  }
2966 
2967  // Handle loads.
2968  assert(LI && "Must have a load instruction");
2969  setDebugLocFromInst(Builder, LI);
2970  VectorParts Entry(UF);
2971  for (unsigned Part = 0; Part < UF; ++Part) {
2972  Instruction *NewLI;
2973  if (CreateGatherScatter) {
2974  Value *MaskPart = Legal->isMaskRequired(LI) ? Mask[Part] : nullptr;
2975  NewLI = Builder.CreateMaskedGather(VectorGep[Part], Alignment, MaskPart,
2976  0, "wide.masked.gather");
2977  Entry[Part] = NewLI;
2978  } else {
2979  // Calculate the pointer for the specific unroll-part.
2980  Value *PartPtr =
2981  Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
2982 
2983  if (Reverse) {
2984  // If the address is consecutive but reversed, then the
2985  // wide load needs to start at the last vector element.
2986  PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
2987  PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
2988  Mask[Part] = reverseVector(Mask[Part]);
2989  }
2990 
2991  Value *VecPtr =
2992  Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2993  if (Legal->isMaskRequired(LI))
2994  NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2995  UndefValue::get(DataTy),
2996  "wide.masked.load");
2997  else
2998  NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
2999  Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
3000  }
3001  addMetadata(NewLI, LI);
3002  }
3003  VectorLoopValueMap.initVector(Instr, Entry);
3004 }
3005 
3006 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
3007  bool IfPredicateInstr) {
3008  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
3009  DEBUG(dbgs() << "LV: Scalarizing"
3010  << (IfPredicateInstr ? " and predicating:" : ":") << *Instr
3011  << '\n');
3012  // Holds vector parameters or scalars, in case of uniform vals.
3013  SmallVector<VectorParts, 4> Params;
3014 
3015  setDebugLocFromInst(Builder, Instr);
3016 
3017  // Does this instruction return a value?
3018  bool IsVoidRetTy = Instr->getType()->isVoidTy();
3019 
3020  // Initialize a new scalar map entry.
3021  ScalarParts Entry(UF);
3022 
3023  VectorParts Cond;
3024  if (IfPredicateInstr)
3025  Cond = createBlockInMask(Instr->getParent());
3026 
3027  // Determine the number of scalars we need to generate for each unroll
3028  // iteration. If the instruction is uniform, we only need to generate the
3029  // first lane. Otherwise, we generate all VF values.
3030  unsigned Lanes = Legal->isUniformAfterVectorization(Instr) ? 1 : VF;
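 // For example, with VF = 4 and UF = 2, a non-uniform instruction is cloned
 // 4 * 2 = 8 times, while a uniform one is cloned only twice (lane 0 of each
 // unroll part).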
3031 
3032  // For each vector unroll 'part':
3033  for (unsigned Part = 0; Part < UF; ++Part) {
3034  Entry[Part].resize(VF);
3035  // For each scalar that we create:
3036  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
3037 
3038  // Start if-block.
3039  Value *Cmp = nullptr;
3040  if (IfPredicateInstr) {
3041  Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Lane));
3042  Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
3043  ConstantInt::get(Cmp->getType(), 1));
3044  }
3045 
3046  Instruction *Cloned = Instr->clone();
3047  if (!IsVoidRetTy)
3048  Cloned->setName(Instr->getName() + ".cloned");
3049 
3050  // Replace the operands of the cloned instructions with their scalar
3051  // equivalents in the new loop.
3052  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
3053  auto *NewOp = getScalarValue(Instr->getOperand(op), Part, Lane);
3054  Cloned->setOperand(op, NewOp);
3055  }
3056  addNewMetadata(Cloned, Instr);
3057 
3058  // Place the cloned scalar in the new loop.
3059  Builder.Insert(Cloned);
3060 
3061  // Add the cloned scalar to the scalar map entry.
3062  Entry[Part][Lane] = Cloned;
3063 
3064  // If we just cloned a new assumption, add it to the assumption cache.
3065  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
3066  if (II->getIntrinsicID() == Intrinsic::assume)
3067  AC->registerAssumption(II);
3068 
3069  // End if-block.
3070  if (IfPredicateInstr)
3071  PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
3072  }
3073  }
3074  VectorLoopValueMap.initScalar(Instr, Entry);
3075 }
3076 
3077 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
3078  Value *End, Value *Step,
3079  Instruction *DL) {
3080  BasicBlock *Header = L->getHeader();
3081  BasicBlock *Latch = L->getLoopLatch();
3082  // As we're just creating this loop, it's possible no latch exists
3083  // yet. If so, use the header as this will be a single block loop.
3084  if (!Latch)
3085  Latch = Header;
3086 
3087  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
3088  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
3089  setDebugLocFromInst(Builder, OldInst);
3090  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
3091 
3092  Builder.SetInsertPoint(Latch->getTerminator());
3093  setDebugLocFromInst(Builder, OldInst);
3094 
3095  // Create i+1 and fill the PHINode.
3096  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
3097  Induction->addIncoming(Start, L->getLoopPreheader());
3098  Induction->addIncoming(Next, Latch);
3099  // Create the compare.
3100  Value *ICmp = Builder.CreateICmpEQ(Next, End);
3101  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
3102 
3103  // Now we have two terminators. Remove the old one from the block.
3104  Latch->getTerminator()->eraseFromParent();
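 // A rough sketch of the induction skeleton built here (assuming an i64
 // induction and VF * UF = 8):
 //   vector.body:
 //     %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 //     ...
 //     %index.next = add i64 %index, 8
 //     %cmp = icmp eq i64 %index.next, %n.vec
 //     br i1 %cmp, label %middle.block, label %vector.body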
3105 
3106  return Induction;
3107 }
3108 
3109 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3110  if (TripCount)
3111  return TripCount;
3112 
3113  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3114  // Find the loop boundaries.
3115  ScalarEvolution *SE = PSE.getSE();
3116  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3117  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
3118  "Invalid loop count");
3119 
3120  Type *IdxTy = Legal->getWidestInductionType();
3121 
3122  // The exit count might have type i64 while the phi has type i32. This can
3123  // happen if we have an induction variable that is sign extended before the
3124  // compare. The only way we get a backedge-taken count is if the
3125  // induction variable was signed and as such will not overflow. In such a case
3126  // truncation is legal.
3127  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
3128  IdxTy->getPrimitiveSizeInBits())
3129  BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3130  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3131 
3132  // Get the total trip count from the count by adding 1.
3133  const SCEV *ExitCount = SE->getAddExpr(
3134  BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3135 
3136  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3137 
3138  // Expand the trip count and place the new instructions in the preheader.
3139  // Notice that the pre-header does not change, only the loop body.
3140  SCEVExpander Exp(*SE, DL, "induction");
3141 
3142  // Count holds the overall loop count (N).
3143  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3144  L->getLoopPreheader()->getTerminator());
3145 
3146  if (TripCount->getType()->isPointerTy())
3147  TripCount =
3148  CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3149  L->getLoopPreheader()->getTerminator());
3150 
3151  return TripCount;
3152 }
3153 
3154 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3155  if (VectorTripCount)
3156  return VectorTripCount;
3157 
3158  Value *TC = getOrCreateTripCount(L);
3159  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3160 
3161  // Now we need to generate the expression for the part of the loop that the
3162  // vectorized body will execute. This is equal to N - (N % Step) if scalar
3163  // iterations are not required for correctness, or N - Step, otherwise. Step
3164  // is equal to the vectorization factor (number of SIMD elements) times the
3165  // unroll factor (number of SIMD instructions).
3166  Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
3167  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3168 
3169  // If there is a non-reversed interleaved group that may speculatively access
3170  // memory out-of-bounds, we need to ensure that there will be at least one
3171  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
3172  // the trip count, we set the remainder to be equal to the step. If the step
3173  // does not evenly divide the trip count, no adjustment is necessary since
3174  // there will already be scalar iterations. Note that the minimum iterations
3175  // check ensures that N >= Step.
3176  if (VF > 1 && Legal->requiresScalarEpilogue()) {
3177  auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3178  R = Builder.CreateSelect(IsZero, Step, R);
3179  }
3180 
3181  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
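 // For example, with a trip count of 20, VF = 4 and UF = 2: Step = 8,
 // n.mod.vf = 20 urem 8 = 4 and n.vec = 16, leaving 4 scalar iterations.
 // If a scalar epilogue is required and the trip count were 16, the remainder
 // would be bumped from 0 to 8, so n.vec = 8 and 8 iterations remain scalar.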
3182 
3183  return VectorTripCount;
3184 }
3185 
3186 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3187  BasicBlock *Bypass) {
3188  Value *Count = getOrCreateTripCount(L);
3189  BasicBlock *BB = L->getLoopPreheader();
3190  IRBuilder<> Builder(BB->getTerminator());
3191 
3192  // Generate code to check that the loop's trip count that we computed by
3193  // adding one to the backedge-taken count will not overflow.
3194  Value *CheckMinIters = Builder.CreateICmpULT(
3195  Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
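 // For example, with VF = 4 and UF = 2 the vector loop is bypassed whenever
 // the trip count is less than 8.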
3196 
3197  BasicBlock *NewBB =
3198  BB->splitBasicBlock(BB->getTerminator(), "min.iters.checked");
3199  // Update dominator tree immediately if the generated block is a
3200  // LoopBypassBlock because SCEV expansions to generate loop bypass
3201  // checks may query it before the current function is finished.
3202  DT->addNewBlock(NewBB, BB);
3203  if (L->getParentLoop())
3204  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3205  ReplaceInstWithInst(BB->getTerminator(),
3206  BranchInst::Create(Bypass, NewBB, CheckMinIters));
3207  LoopBypassBlocks.push_back(BB);
3208 }
3209 
3210 void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L,
3211  BasicBlock *Bypass) {
3212  Value *TC = getOrCreateVectorTripCount(L);
3213  BasicBlock *BB = L->getLoopPreheader();
3214  IRBuilder<> Builder(BB->getTerminator());
3215 
3216  // Now, compare the new count to zero. If it is zero skip the vector loop and
3217  // jump to the scalar loop.
3218  Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()),
3219  "cmp.zero");
3220 
3221  // Split the preheader so that the remaining code in BB becomes a loop bypass
3222  // block and the new block becomes the vector preheader.
3223  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
3224  // Update dominator tree immediately if the generated block is a
3225  // LoopBypassBlock because SCEV expansions to generate loop bypass
3226  // checks may query it before the current function is finished.
3227  DT->addNewBlock(NewBB, BB);
3228  if (L->getParentLoop())
3229  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3230  ReplaceInstWithInst(BB->getTerminator(),
3231  BranchInst::Create(Bypass, NewBB, Cmp));
3232  LoopBypassBlocks.push_back(BB);
3233 }
3234 
3235 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3236  BasicBlock *BB = L->getLoopPreheader();
3237 
3238  // Generate the code to check the SCEV assumptions that we made.
3239  // We want the new basic block to start at the first instruction in a
3240  // sequence of instructions that form a check.
3241  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
3242  "scev.check");
3243  Value *SCEVCheck =
3244  Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
3245 
3246  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
3247  if (C->isZero())
3248  return;
3249 
3250  // Create a new block containing the SCEV predicate check.
3251  BB->setName("vector.scevcheck");
3252  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
3253  // Update dominator tree immediately if the generated block is a
3254  // LoopBypassBlock because SCEV expansions to generate loop bypass
3255  // checks may query it before the current function is finished.
3256  DT->addNewBlock(NewBB, BB);
3257  if (L->getParentLoop())
3258  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3259  ReplaceInstWithInst(BB->getTerminator(),
3260  BranchInst::Create(Bypass, NewBB, SCEVCheck));
3261  LoopBypassBlocks.push_back(BB);
3262  AddedSafetyChecks = true;
3263 }
3264 
3265 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
3266  BasicBlock *BB = L->getLoopPreheader();
3267 
3268  // Generate the code that checks in runtime if arrays overlap. We put the
3269  // checks into a separate block to make the more common case of few elements
3270  // faster.
3271  Instruction *FirstCheckInst;
3272  Instruction *MemRuntimeCheck;
3273  std::tie(FirstCheckInst, MemRuntimeCheck) =
3274  Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
3275  if (!MemRuntimeCheck)
3276  return;
3277 
3278  // Create a new block containing the memory check.
3279  BB->setName("vector.memcheck");
3280  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
3281  // Update dominator tree immediately if the generated block is a
3282  // LoopBypassBlock because SCEV expansions to generate loop bypass
3283  // checks may query it before the current function is finished.
3284  DT->addNewBlock(NewBB, BB);
3285  if (L->getParentLoop())
3286  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3287  ReplaceInstWithInst(BB->getTerminator(),
3288  BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
3289  LoopBypassBlocks.push_back(BB);
3290  AddedSafetyChecks = true;
3291 
3292  // We currently don't use LoopVersioning for the actual loop cloning but we
3293  // still use it to add the noalias metadata.
3294  LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
3295  PSE.getSE());
3296  LVer->prepareNoAliasMetadata();
3297 }
3298 
3299 void InnerLoopVectorizer::createEmptyLoop() {
3300  /*
3301  In this function we generate a new loop. The new loop will contain
3302  the vectorized instructions while the old loop will continue to run the
3303  scalar remainder.
3304 
3305  [ ] <-- loop iteration number check.
3306  / |
3307  / v
3308  | [ ] <-- vector loop bypass (may consist of multiple blocks).
3309  | / |
3310  | / v
3311  || [ ] <-- vector pre header.
3312  |/ |
3313  | v
3314  | [ ] \
3315  | [ ]_| <-- vector loop.
3316  | |
3317  | v
3318  | -[ ] <--- middle-block.
3319  | / |
3320  | / v
3321  -|- >[ ] <--- new preheader.
3322  | |
3323  | v
3324  | [ ] \
3325  | [ ]_| <-- old scalar loop to handle remainder.
3326  \ |
3327  \ v
3328  >[ ] <-- exit block.
3329  ...
3330  */
3331 
3332  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
3333  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
3334  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
3335  assert(VectorPH && "Invalid loop structure");
3336  assert(ExitBlock && "Must have an exit block");
3337 
3338  // Some loops have a single integer induction variable, while other loops
3339  // don't. One example is C++ iterators, which often have multiple pointer
3340  // induction variables. In the code below we also support a case where we
3341  // don't have a single induction variable.
3342  //
3343  // We try as hard as possible to obtain an induction variable from the
3344  // original loop. However, if we don't find one that:
3345  // - is an integer
3346  // - counts from zero, stepping by one
3347  // - is the size of the widest induction variable type
3348  // then we create a new one.
3349  OldInduction = Legal->getInduction();
3350  Type *IdxTy = Legal->getWidestInductionType();
3351 
3352  // Split the single block loop into the two loop structure described above.
3353  BasicBlock *VecBody =
3354  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
3355  BasicBlock *MiddleBlock =
3356  VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
3357  BasicBlock *ScalarPH =
3358  MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
3359 
3360  // Create and register the new vector loop.
3361  Loop *Lp = new Loop();
3362  Loop *ParentLoop = OrigLoop->getParentLoop();
3363 
3364  // Insert the new loop into the loop nest and register the new basic blocks
3365  // before calling any utilities such as SCEV that require valid LoopInfo.
3366  if (ParentLoop) {
3367  ParentLoop->addChildLoop(Lp);
3368  ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
3369  ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
3370  } else {
3371  LI->addTopLevelLoop(Lp);
3372  }
3373  Lp->addBasicBlockToLoop(VecBody, *LI);
3374 
3375  // Find the loop boundaries.
3376  Value *Count = getOrCreateTripCount(Lp);
3377 
3378  Value *StartIdx = ConstantInt::get(IdxTy, 0);
3379 
3380  // We need to test whether the backedge-taken count is uint##_max. Adding one
3381  // to it will cause overflow and an incorrect loop trip count in the vector
3382  // body. In case of overflow we want to directly jump to the scalar remainder
3383  // loop.
3384  emitMinimumIterationCountCheck(Lp, ScalarPH);
3385  // Now, compare the new count to zero. If it is zero skip the vector loop and
3386  // jump to the scalar loop.
3387  emitVectorLoopEnteredCheck(Lp, ScalarPH);
3388  // Generate the code to check any assumptions that we've made for SCEV
3389  // expressions.
3390  emitSCEVChecks(Lp, ScalarPH);
3391 
3392  // Generate the code that checks in runtime if arrays overlap. We put the
3393  // checks into a separate block to make the more common case of few elements
3394  // faster.
3395  emitMemRuntimeChecks(Lp, ScalarPH);
3396 
3397  // Generate the induction variable.
3398  // The loop step is equal to the vectorization factor (num of SIMD elements)
3399  // times the unroll factor (num of SIMD instructions).
3400  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3401  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3402  Induction =
3403  createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3404  getDebugLocFromInstOrOperands(OldInduction));
3405 
3406  // We are going to resume the execution of the scalar loop.
3407  // Go over all of the induction variables that we found and fix the
3408  // PHIs that are left in the scalar version of the loop.
3409  // The starting values of PHI nodes depend on the counter of the last
3410  // iteration in the vectorized loop.
3411  // If we come from a bypass edge then we need to start from the original
3412  // start value.
3413 
3414  // This variable saves the new starting index for the scalar loop. It is used
3415  // to test if there are any tail iterations left once the vector loop has
3416  // completed.
3417  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
3418  for (auto &InductionEntry : *List) {
3419  PHINode *OrigPhi = InductionEntry.first;
3420  InductionDescriptor II = InductionEntry.second;
3421 
3422  // Create phi nodes to merge from the backedge-taken check block.
3423  PHINode *BCResumeVal = PHINode::Create(
3424  OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
3425  Value *&EndValue = IVEndValues[OrigPhi];
3426  if (OrigPhi == OldInduction) {
3427  // We know what the end value is.
3428  EndValue = CountRoundDown;
3429  } else {
3430  IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());
3431  Type *StepType = II.getStep()->getType();
3432  Instruction::CastOps CastOp =
3433  CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3434  Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3435  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3436  EndValue = II.transform(B, CRD, PSE.getSE(), DL);
3437  EndValue->setName("ind.end");
3438  }
3439 
3440  // The new PHI merges the original incoming value, in case of a bypass,
3441  // or the value at the end of the vectorized loop.
3442  BCResumeVal->addIncoming(EndValue, MiddleBlock);
3443 
3444  // Fix the scalar body counter (PHI node).
3445  unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
3446 
3447  // The old induction's phi node in the scalar body needs the truncated
3448  // value.
3449  for (BasicBlock *BB : LoopBypassBlocks)
3450  BCResumeVal->addIncoming(II.getStartValue(), BB);
3451  OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
3452  }
3453 
3454  // Add a check in the middle block to see if we have completed
3455  // all of the iterations in the first vector loop.
3456  // If (N - N%VF) == N, then we *don't* need to run the remainder.
3457  Value *CmpN =
3458  CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3459  CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3460  ReplaceInstWithInst(MiddleBlock->getTerminator(),
3461  BranchInst::Create(ExitBlock, ScalarPH, CmpN));
3462 
3463  // Get ready to start creating new instructions into the vectorized body.
3464  Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
3465 
3466  // Save the state.
3467  LoopVectorPreHeader = Lp->getLoopPreheader();
3468  LoopScalarPreHeader = ScalarPH;
3469  LoopMiddleBlock = MiddleBlock;
3470  LoopExitBlock = ExitBlock;
3471  LoopVectorBody = VecBody;
3472  LoopScalarBody = OldBasicBlock;
3473 
3474  // Keep all loop hints from the original loop on the vector loop (we'll
3475  // replace the vectorizer-specific hints below).
3476  if (MDNode *LID = OrigLoop->getLoopID())
3477  Lp->setLoopID(LID);
3478 
3479  LoopVectorizeHints Hints(Lp, true, *ORE);
3480  Hints.setAlreadyVectorized();
3481 }
3482 
3483 // Fix up external users of the induction variable. At this point, we are
3484 // in LCSSA form, with all external PHIs that use the IV having one input value,
3485 // coming from the remainder loop. We need those PHIs to also have a correct
3486 // value for the IV when arriving directly from the middle block.
3487 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3488  const InductionDescriptor &II,
3489  Value *CountRoundDown, Value *EndValue,
3490  BasicBlock *MiddleBlock) {
3491  // There are two kinds of external IV usages - those that use the value
3492  // computed in the last iteration (the PHI) and those that use the penultimate
3493  // value (the value that feeds into the phi from the loop latch).
3494  // We allow both, but they, obviously, have different values.
3495 
3496  assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3497 
3498  DenseMap<Value *, Value *> MissingVals;
3499 
3500  // An external user of the last iteration's value should see the value that
3501  // the remainder loop uses to initialize its own IV.
3502  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3503  for (User *U : PostInc->users()) {
3504  Instruction *UI = cast<Instruction>(U);
3505  if (!OrigLoop->contains(UI)) {
3506  assert(isa<PHINode>(UI) && "Expected LCSSA form");
3507  MissingVals[UI] = EndValue;
3508  }
3509  }
3510 
3511  // An external user of the penultimate value needs to see EndValue - Step.
3512  // The simplest way to get this is to recompute it from the constituent SCEVs,
3513  // that is Start + (Step * (CRD - 1)).
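 // For example, an IV starting at 0 with step 2 and CountRoundDown = 8 has
 // EndValue = 16, so a user of the penultimate value sees 0 + 2 * (8 - 1) = 14.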
3514  for (User *U : OrigPhi->users()) {
3515  auto *UI = cast<Instruction>(U);
3516  if (!OrigLoop->contains(UI)) {
3517  const DataLayout &DL =
3518  OrigLoop->getHeader()->getModule()->getDataLayout();
3519  assert(isa<PHINode>(UI) && "Expected LCSSA form");
3520 
3521  IRBuilder<> B(MiddleBlock->getTerminator());
3522  Value *CountMinusOne = B.CreateSub(
3523  CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3524  Value *CMO = B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType(),
3525  "cast.cmo");
3526  Value *Escape = II.transform(B, CMO, PSE.getSE(), DL);
3527  Escape->setName("ind.escape");
3528  MissingVals[UI] = Escape;
3529  }
3530  }
3531 
3532  for (auto &I : MissingVals) {
3533  PHINode *PHI = cast<PHINode>(I.first);
3534  // One corner case we have to handle is two IVs "chasing" each other,
3535  // that is %IV2 = phi [...], [ %IV1, %latch ]
3536  // In this case, if IV1 has an external use, we need to avoid adding both
3537  // "last value of IV1" and "penultimate value of IV2". So, verify that we
3538  // don't already have an incoming value for the middle block.
3539  if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3540  PHI->addIncoming(I.second, MiddleBlock);
3541  }
3542 }
3543 
3544 namespace {
3545 struct CSEDenseMapInfo {
3546  static bool canHandle(Instruction *I) {
3547  return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3548  isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3549  }
3550  static inline Instruction *getEmptyKey() {
3551  return DenseMapInfo<Instruction *>::getEmptyKey();
3552  }
3553  static inline Instruction *getTombstoneKey() {
3554  return DenseMapInfo<Instruction *>::getTombstoneKey();
3555  }
3556  static unsigned getHashValue(Instruction *I) {
3557  assert(canHandle(I) && "Unknown instruction!");
3558  return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3559  I->value_op_end()));
3560  }
3561  static bool isEqual(Instruction *LHS, Instruction *RHS) {
3562  if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3563  LHS == getTombstoneKey() || RHS == getTombstoneKey())
3564  return LHS == RHS;
3565  return LHS->isIdenticalTo(RHS);
3566  }
3567 };
3568 }
3569 
3570 /// \brief Perform CSE of induction variable instructions.
3571 static void cse(BasicBlock *BB) {
3572  // Perform simple cse.
3573  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3574  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3575  Instruction *In = &*I++;
3576 
3577  if (!CSEDenseMapInfo::canHandle(In))
3578  continue;
3579 
3580  // Check if we can replace this instruction with any of the
3581  // visited instructions.
3582  if (Instruction *V = CSEMap.lookup(In)) {
3583  In->replaceAllUsesWith(V);
3584  In->eraseFromParent();
3585  continue;
3586  }
3587 
3588  CSEMap[In] = In;
3589  }
3590 }
3591 
3592 /// \brief Adds a 'fast' flag to floating point operations.
3593 static Value *addFastMathFlag(Value *V) {
3594  if (isa<FPMathOperator>(V)) {
3595  FastMathFlags Flags;
3596  Flags.setUnsafeAlgebra();
3597  cast<Instruction>(V)->setFastMathFlags(Flags);
3598  }
3599  return V;
3600 }
3601 
3602 /// \brief Estimate the overhead of scalarizing a value based on its type.
3603 /// Insert and Extract are set if the result needs to be inserted and/or
3604 /// extracted from vectors.
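/// For example, a <4 x i32> value with both Insert and Extract set costs the
/// sum of four insertelement and four extractelement target costs.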
3605 static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
3606  const TargetTransformInfo &TTI) {
3607  if (Ty->isVoidTy())
3608  return 0;
3609 
3610  assert(Ty->isVectorTy() && "Can only scalarize vectors");
3611  unsigned Cost = 0;
3612 
3613  for (unsigned I = 0, E = Ty->getVectorNumElements(); I < E; ++I) {
3614  if (Extract)
3615  Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, I);
3616  if (Insert)
3617  Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I);
3618  }
3619 
3620  return Cost;
3621 }
3622 
3623 /// \brief Estimate the overhead of scalarizing an Instruction based on the
3624 /// types of its operands and return value.
3625 static unsigned getScalarizationOverhead(SmallVectorImpl<Type *> &OpTys,
3626  Type *RetTy,
3627  const TargetTransformInfo &TTI) {
3628  unsigned ScalarizationCost =
3629  getScalarizationOverhead(RetTy, true, false, TTI);
3630 
3631  for (Type *Ty : OpTys)
3632  ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI);
3633 
3634  return ScalarizationCost;
3635 }
3636 
3637 /// \brief Estimate the overhead of scalarizing an instruction. This is a
3638 /// convenience wrapper for the type-based getScalarizationOverhead API.
3639 static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
3640  const TargetTransformInfo &TTI) {
3641  if (VF == 1)
3642  return 0;
3643 
3644  Type *RetTy = ToVectorTy(I->getType(), VF);
3645 
3646  SmallVector<Type *, 4> OpTys;
3647  unsigned OperandsNum = I->getNumOperands();
3648  for (unsigned OpInd = 0; OpInd < OperandsNum; ++OpInd)
3649  OpTys.push_back(ToVectorTy(I->getOperand(OpInd)->getType(), VF));
3650 
3651  return getScalarizationOverhead(OpTys, RetTy, TTI);
3652 }
3653 
3654 // Estimate cost of a call instruction CI if it were vectorized with factor VF.
3655 // Return the cost of the instruction, including scalarization overhead if it's
3656 // needed. The flag NeedToScalarize shows if the call needs to be scalarized -
3657 // i.e. either vector version isn't available, or is too expensive.
3658 static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
3659  const TargetTransformInfo &TTI,
3660  const TargetLibraryInfo *TLI,
3661  bool &NeedToScalarize) {
3662  Function *F = CI->getCalledFunction();
3663  StringRef FnName = CI->getCalledFunction()->getName();
3664  Type *ScalarRetTy = CI->getType();
3665  SmallVector<Type *, 4> Tys, ScalarTys;
3666  for (auto &ArgOp : CI->arg_operands())
3667  ScalarTys.push_back(ArgOp->getType());
3668 
3669  // Estimate cost of scalarized vector call. The source operands are assumed
3670  // to be vectors, so we need to extract individual elements from there,
3671  // execute VF scalar calls, and then gather the result into the vector return
3672  // value.
3673  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3674  if (VF == 1)
3675  return ScalarCallCost;
3676 
3677  // Compute corresponding vector type for return value and arguments.
3678  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3679  for (Type *ScalarTy : ScalarTys)
3680  Tys.push_back(ToVectorTy(ScalarTy, VF));
3681 
3682  // Compute costs of unpacking argument values for the scalar calls and
3683  // packing the return values to a vector.
3684  unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, TTI);
3685 
3686  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
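 // For example (hypothetical costs), with VF = 4, a scalar call cost of 10 and
 // a scalarization overhead of 12, the scalarized cost is 4 * 10 + 12 = 52; a
 // vector library call is preferred only if its cost is below that.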
3687 
3688  // If we can't emit a vector call for this function, then the currently found
3689  // cost is the cost we need to return.
3690  NeedToScalarize = true;
3691  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3692  return Cost;
3693 
3694  // If the corresponding vector cost is cheaper, return its cost.
3695  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3696  if (VectorCallCost < Cost) {
3697  NeedToScalarize = false;
3698  return VectorCallCost;
3699  }
3700  return Cost;
3701 }
3702 
3703 // Estimate cost of an intrinsic call instruction CI if it were vectorized with
3704 // factor VF. Return the cost of the instruction, including scalarization
3705 // overhead if it's needed.
3706 static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
3707  const TargetTransformInfo &TTI,
3708  const TargetLibraryInfo *TLI) {
3709  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3710  assert(ID && "Expected intrinsic call!");
3711 
3712  Type *RetTy = ToVectorTy(CI->getType(), VF);
3713  SmallVector<Type *, 4> Tys;
3714  for (Value *ArgOperand : CI->arg_operands())
3715  Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
3716 
3717  FastMathFlags FMF;
3718  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3719  FMF = FPMO->getFastMathFlags();
3720 
3721  return TTI.getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
3722 }
3723 
3724 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3725  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3726  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3727  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3728 }
3729 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3730  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3731  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3732  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3733 }
3734 
3735 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3736  // For every instruction `I` in MinBWs, truncate the operands, create a
3737  // truncated version of `I` and reextend its result. InstCombine runs
3738  // later and will remove any ext/trunc pairs.
3739  //
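 // For example, if MinBWs records that an i32 add only needs 8 bits, a part
 //   %a = add <4 x i32> %x, %y
 // is rewritten (illustratively) as
 //   %x.tr = trunc <4 x i32> %x to <4 x i8>
 //   %y.tr = trunc <4 x i32> %y to <4 x i8>
 //   %a.tr = add <4 x i8> %x.tr, %y.tr
 //   %a.ext = zext <4 x i8> %a.tr to <4 x i32>
 // and all uses of %a are replaced with %a.ext.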
3740  SmallPtrSet<Value *, 4> Erased;
3741  for (const auto &KV : Cost->getMinimalBitwidths()) {
3742  // If the value wasn't vectorized, we must maintain the original scalar
3743  // type. The absence of the value from VectorLoopValueMap indicates that it
3744  // wasn't vectorized.
3745  if (!VectorLoopValueMap.hasVector(KV.first))
3746  continue;
3747  VectorParts &Parts = VectorLoopValueMap.getVector(KV.first);
3748  for (Value *&I : Parts) {
3749  if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3750  continue;
3751  Type *OriginalTy = I->getType();
3752  Type *ScalarTruncatedTy =
3753  IntegerType::get(OriginalTy->getContext(), KV.second);
3754  Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3755  OriginalTy->getVectorNumElements());
3756  if (TruncatedTy == OriginalTy)
3757  continue;
3758 
3759  IRBuilder<> B(cast<Instruction>(I));
3760  auto ShrinkOperand = [&](Value *V) -> Value * {
3761  if (auto *ZI = dyn_cast<ZExtInst>(V))
3762  if (ZI->getSrcTy() == TruncatedTy)
3763  return ZI->getOperand(0);
3764  return B.CreateZExtOrTrunc(V, TruncatedTy);
3765  };
3766 
3767  // The actual instruction modification depends on the instruction type,
3768  // unfortunately.
3769  Value *NewI = nullptr;
3770  if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3771  NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3772  ShrinkOperand(BO->getOperand(1)));
3773  cast<BinaryOperator>(NewI)->copyIRFlags(I);
3774  } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3775  NewI =
3776  B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3777  ShrinkOperand(CI->getOperand(1)));
3778  } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3779  NewI = B.CreateSelect(SI->getCondition(),
3780  ShrinkOperand(SI->getTrueValue()),
3781  ShrinkOperand(SI->getFalseValue()));
3782  } else if (auto *CI = dyn_cast<CastInst>(I)) {
3783  switch (CI->getOpcode()) {
3784  default:
3785  llvm_unreachable("Unhandled cast!");
3786  case Instruction::Trunc:
3787  NewI = ShrinkOperand(CI->getOperand(0));
3788  break;
3789  case Instruction::SExt:
3790  NewI = B.CreateSExtOrTrunc(
3791  CI->getOperand(0),
3792  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3793  break;
3794  case Instruction::ZExt:
3795  NewI = B.CreateZExtOrTrunc(
3796  CI->getOperand(0),
3797  smallestIntegerVectorType(OriginalTy, TruncatedTy));
3798  break;
3799  }
3800  } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3801  auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3802  auto *O0 = B.CreateZExtOrTrunc(
3803  SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3804  auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3805  auto *O1 = B.CreateZExtOrTrunc(
3806  SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3807 
3808  NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3809  } else if (isa<LoadInst>(I)) {
3810  // Don't do anything with the operands, just extend the result.
3811  continue;
3812  } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3813  auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3814  auto *O0 = B.CreateZExtOrTrunc(
3815  IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3816  auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3817  NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3818  } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3819  auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3820  auto *O0 = B.CreateZExtOrTrunc(
3821  EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3822  NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3823  } else {
3824  llvm_unreachable("Unhandled instruction type!");
3825  }
3826 
3827  // Lastly, extend the result.
3828  NewI->takeName(cast<Instruction>(I));
3829  Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3830  I->replaceAllUsesWith(Res);
3831  cast<Instruction>(I)->eraseFromParent();
3832  Erased.insert(I);
3833  I = Res;
3834  }
3835  }
3836 
3837  // We'll have created a bunch of ZExts that are now parentless. Clean up.
3838  for (const auto &KV : Cost->getMinimalBitwidths()) {
3839  // If the value wasn't vectorized, we must maintain the original scalar
3840  // type. The absence of the value from VectorLoopValueMap indicates that it
3841  // wasn't vectorized.
3842  if (!VectorLoopValueMap.hasVector(KV.first))
3843  continue;
3844  VectorParts &Parts = VectorLoopValueMap.getVector(KV.first);
3845  for (Value *&I : Parts) {
3846  ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3847  if (Inst && Inst->use_empty()) {
3848  Value *NewI = Inst->getOperand(0);
3849  Inst->eraseFromParent();
3850  I = NewI;
3851  }
3852  }
3853  }
3854 }
3855 
3856 void InnerLoopVectorizer::vectorizeLoop() {
3857  //===------------------------------------------------===//
3858  //
3859  // Notice: any optimization or new instruction that goes
3860  // into the code below should also be implemented in
3861  // the cost model.
3862  //
3863  //===------------------------------------------------===//
3864  Constant *Zero = Builder.getInt32(0);
3865 
3866  // In order to support recurrences we need to be able to vectorize Phi nodes.
3867  // Phi nodes have cycles, so we need to vectorize them in two stages. First,
3868  // we create a new vector PHI node with no incoming edges. We use this value
3869  // when we vectorize all of the instructions that use the PHI. Next, after
3870  // all of the instructions in the block are complete we add the new incoming
3871  // edges to the PHI. At this point all of the instructions in the basic block
3872  // are vectorized, so we can use them to construct the PHI.
3873  PhiVector PHIsToFix;
3874 
3875  // Collect instructions from the original loop that will become trivially
3876  // dead in the vectorized loop. We don't need to vectorize these
3877  // instructions.
3878  collectTriviallyDeadInstructions();
3879 
3880  // Scan the loop in a topological order to ensure that defs are vectorized
3881  // before users.
3882  LoopBlocksDFS DFS(OrigLoop);
3883  DFS.perform(LI);
3884 
3885  // Vectorize all of the blocks in the original loop.
3886  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
3887  vectorizeBlockInLoop(BB, &PHIsToFix);
3888 
3889  // Insert truncates and extends for any truncated instructions as hints to
3890  // InstCombine.
3891  if (VF > 1)
3892  truncateToMinimalBitwidths();
3893 
3894  // At this point every instruction in the original loop is widened to a
3895  // vector form. Now we need to fix the recurrences in PHIsToFix. These PHI
3896  // nodes are currently empty because we did not want to introduce cycles.
3897  // This is the second stage of vectorizing recurrences.
3898  for (PHINode *Phi : PHIsToFix) {
3899  assert(Phi && "Unable to recover vectorized PHI");
3900 
3901  // Handle first-order recurrences that need to be fixed.
3902  if (Legal->isFirstOrderRecurrence(Phi)) {
3903  fixFirstOrderRecurrence(Phi);
3904  continue;
3905  }
3906 
3907  // If the phi node is not a first-order recurrence, it must be a reduction.
3908  // Get its reduction variable descriptor.
3909  assert(Legal->isReductionVariable(Phi) &&
3910  "Unable to find the reduction variable");
3911  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3912 
3913  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3914  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3915  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3916  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3917  RdxDesc.getMinMaxRecurrenceKind();
3918  setDebugLocFromInst(Builder, ReductionStartValue);
3919 
3920  // We need to generate a reduction vector from the incoming scalar.
3921  // To do so, we need to generate the 'identity' vector and override
3922  // one of the elements with the incoming scalar reduction. We need
3923  // to do it in the vector-loop preheader.
3924  Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
3925 
3926  // This is the vector-clone of the value that leaves the loop.
3927  const VectorParts &VectorExit = getVectorValue(LoopExitInst);
3928  Type *VecTy = VectorExit[0]->getType();
3929 
3930  // Find the reduction identity variable: zero for addition, or and xor;
3931  // one for multiplication; -1 for and.
3932  Value *Identity;
3933  Value *VectorStart;
3934  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3935  RK == RecurrenceDescriptor::RK_FloatMinMax) {
3936  // MinMax reductions have the start value as their identity.
3937  if (VF == 1) {
3938  VectorStart = Identity = ReductionStartValue;
3939  } else {
3940  VectorStart = Identity =
3941  Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3942  }
3943  } else {
3944  // Handle other reduction kinds:
3945  Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3946  RK, VecTy->getScalarType());
3947  if (VF == 1) {
3948  Identity = Iden;
3949  // This vector is the Identity vector where the first element is the
3950  // incoming scalar reduction.
3951  VectorStart = ReductionStartValue;
3952  } else {
3953  Identity = ConstantVector::getSplat(VF, Iden);
3954 
3955  // This vector is the Identity vector where the first element is the
3956  // incoming scalar reduction.
3957  VectorStart =
3958  Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3959  }
3960  }
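 // For example, an integer add reduction with VF = 4 and a start value of 10
 // uses Identity = <0, 0, 0, 0> and VectorStart = <10, 0, 0, 0>.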
3961 
3962  // Fix the vector-loop phi.
3963 
3964  // Reductions do not have to start at zero. They can start with
3965  // any loop invariant values.
3966  const VectorParts &VecRdxPhi = getVectorValue(Phi);
3967  BasicBlock *Latch = OrigLoop->getLoopLatch();
3968  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3969  const VectorParts &Val = getVectorValue(LoopVal);
3970  for (unsigned part = 0; part < UF; ++part) {
3971  // Make sure to add the reduction start value only to the
3972  // first unroll part.
3973  Value *StartVal = (part == 0) ? VectorStart : Identity;
3974  cast<PHINode>(VecRdxPhi[part])
3975  ->addIncoming(StartVal, LoopVectorPreHeader);
3976  cast<PHINode>(VecRdxPhi[part])
3977  ->addIncoming(Val[part], LoopVectorBody);
3978  }
3979 
3980  // Before each round, move the insertion point right between
3981  // the PHIs and the values we are going to write.
3982  // This allows us to write both PHINodes and the extractelement
3983  // instructions.
3984  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3985 
3986  VectorParts &RdxParts = VectorLoopValueMap.getVector(LoopExitInst);
3987  setDebugLocFromInst(Builder, LoopExitInst);
3988 
3989  // If the vector reduction can be performed in a smaller type, we truncate
3990  // then extend the loop exit value to enable InstCombine to evaluate the
3991  // entire expression in the smaller type.
3992  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3993  Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3994  Builder.SetInsertPoint(LoopVectorBody->getTerminator());
3995  for (unsigned part = 0; part < UF; ++part) {
3996  Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
3997  Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3998  : Builder.CreateZExt(Trunc, VecTy);
3999  for (Value::user_iterator UI = RdxParts[part]->user_begin();
4000  UI != RdxParts[part]->user_end();)
4001  if (*UI != Trunc) {
4002  (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd);
4003  RdxParts[part] = Extnd;
4004  } else {
4005  ++UI;
4006  }
4007  }
4008  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4009  for (unsigned part = 0; part < UF; ++part)
4010  RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
4011  }
4012 
4013  // Reduce all of the unrolled parts into a single vector.
4014  Value *ReducedPartRdx = RdxParts[0];
4015  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4016  setDebugLocFromInst(Builder, ReducedPartRdx);
4017  for (unsigned part = 1; part < UF; ++part) {
4018  if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4019  // Floating point operations had to be 'fast' to enable the reduction.
4020  ReducedPartRdx = addFastMathFlag(
4021  Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
4022  ReducedPartRdx, "bin.rdx"));
4023  else
4024  ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
4025  Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]);
4026  }
4027 
4028  if (VF > 1) {
4029  // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
4030  // and vector ops, reducing the set of values being computed by half each
4031  // round.
4032  assert(isPowerOf2_32(VF) &&
4033  "Reduction emission only supported for pow2 vectors!");
4034  Value *TmpVec = ReducedPartRdx;
4035  SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
4036  for (unsigned i = VF; i != 1; i >>= 1) {
4037  // Move the upper half of the vector to the lower half.
4038  for (unsigned j = 0; j != i / 2; ++j)
4039  ShuffleMask[j] = Builder.getInt32(i / 2 + j);
4040 
4041  // Fill the rest of the mask with undef.
4042  std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
4043  UndefValue::get(Builder.getInt32Ty()));
4044 
4045  Value *Shuf = Builder.CreateShuffleVector(
4046  TmpVec, UndefValue::get(TmpVec->getType()),
4047  ConstantVector::get(ShuffleMask), "rdx.shuf");
4048 
4049  if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4050  // Floating point operations had to be 'fast' to enable the reduction.
4051  TmpVec = addFastMathFlag(Builder.CreateBinOp(
4052  (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
4053  else
4054  TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
4055  TmpVec, Shuf);
4056  }
4057 
4058  // The result is in the first element of the vector.
4059  ReducedPartRdx =
4060  Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
4061 
4062  // If the reduction can be performed in a smaller type, we need to extend
4063  // the reduction to the wider type before we branch to the original loop.
4064  if (Phi->getType() != RdxDesc.getRecurrenceType())
4065  ReducedPartRdx =
4066  RdxDesc.isSigned()
4067  ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4068  : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4069  }
4070 
4071  // Create a phi node that merges control-flow from the backedge-taken check
4072  // block and the middle block.
4073  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4074  LoopScalarPreHeader->getTerminator());
4075  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4076  BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4077  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4078 
4079  // Now, we need to fix the users of the reduction variable
4080  // inside and outside of the scalar remainder loop.
4081  // We know that the loop is in LCSSA form. We need to update the
4082  // PHI nodes in the exit blocks.
4083  for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
4084  LEE = LoopExitBlock->end();
4085  LEI != LEE; ++LEI) {
4086  PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
4087  if (!LCSSAPhi)
4088  break;
4089 
4090  // All PHINodes need to have a single entry edge, or two if
4091  // we already fixed them.
4092  assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4093 
4094  // We found our reduction value exit-PHI. Update it with the
4095  // incoming bypass edge.
4096  if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) {
4097  // Add an edge coming from the bypass.
4098  LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4099  break;
4100  }
4101  } // end of the LCSSA phi scan.
4102 
4103  // Fix the scalar loop reduction variable with the incoming reduction sum
4104  // from the vector body and from the backedge value.
4105  int IncomingEdgeBlockIdx =
4106  Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4107  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4108  // Pick the other block.
4109  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4110  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4111  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4112  } // end of for each Phi in PHIsToFix.
4113 
4114  // Update the dominator tree.
4115  //
4116  // FIXME: After creating the structure of the new loop, the dominator tree is
4117  // no longer up-to-date, and it remains that way until we update it
4118  // here. An out-of-date dominator tree is problematic for SCEV,
4119  // because SCEVExpander uses it to guide code generation. The
4120  // vectorizer uses SCEVExpanders in several places. Instead, we should
4121  // keep the dominator tree up-to-date as we go.
4122  updateAnalysis();
4123 
4124  // Fix-up external users of the induction variables.
4125  for (auto &Entry : *Legal->getInductionVars())
4126  fixupIVUsers(Entry.first, Entry.second,
4127  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4128  IVEndValues[Entry.first], LoopMiddleBlock);
4129 
4130  fixLCSSAPHIs();
4131  predicateInstructions();
4132 
4133  // Remove redundant induction instructions.
4134  cse(LoopVectorBody);
4135 }
4136 
4137 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
4138 
4139  // This is the second phase of vectorizing first-order recurrences. An
4140  // overview of the transformation is described below. Suppose we have the
4141  // following loop.
4142  //
4143  // for (int i = 0; i < n; ++i)
4144  // b[i] = a[i] - a[i - 1];
4145  //
4146  // There is a first-order recurrence on "a". For this loop, the shorthand
4147  // scalar IR looks like:
4148  //
4149  // scalar.ph:
4150  // s_init = a[-1]
4151  // br scalar.body
4152  //
4153  // scalar.body:
4154  // i = phi [0, scalar.ph], [i+1, scalar.body]
4155  // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4156  // s2 = a[i]
4157  // b[i] = s2 - s1
4158  // br cond, scalar.body, ...
4159  //
4160  // In this example, s1 is a recurrence because its value depends on the
4161  // previous iteration. In the first phase of vectorization, we created a
4162  // temporary value for s1. We now complete the vectorization and produce the
4163  // shorthand vector IR shown below (for VF = 4, UF = 1).
4164  //
4165  // vector.ph:
4166  // v_init = vector(..., ..., ..., a[-1])
4167  // br vector.body
4168  //
4169  // vector.body
4170  // i = phi [0, vector.ph], [i+4, vector.body]
4171  // v1 = phi [v_init, vector.ph], [v2, vector.body]
4172  // v2 = a[i, i+1, i+2, i+3];
4173  // v3 = vector(v1(3), v2(0, 1, 2))
4174  // b[i, i+1, i+2, i+3] = v2 - v3
4175  // br cond, vector.body, middle.block
4176  //
4177  // middle.block:
4178  // x = v2(3)
4179  // br scalar.ph
4180  //
4181  // scalar.ph:
4182  // s_init = phi [x, middle.block], [a[-1], otherwise]
4183  // br scalar.body
4184  //
4185  // After the vector loop completes execution, we extract the next value of
4186  // the recurrence (x) to use as the initial value in the scalar loop.
4187 
4188  // Get the original loop preheader and single loop latch.
4189  auto *Preheader = OrigLoop->getLoopPreheader();
4190  auto *Latch = OrigLoop->getLoopLatch();
4191 
4192  // Get the initial and previous values of the scalar recurrence.
4193  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4194  auto *Previous = Phi->getIncomingValueForBlock(Latch);
4195 
4196  // Create a vector from the initial value.
4197  auto *VectorInit = ScalarInit;
4198  if (VF > 1) {
4199  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4200  VectorInit = Builder.CreateInsertElement(
4201  UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
4202  Builder.getInt32(VF - 1), "vector.recur.init");
4203  }
4204 
4205  // We constructed a temporary phi node in the first phase of vectorization.
4206  // This phi node will eventually be deleted.
4207  VectorParts &PhiParts = VectorLoopValueMap.getVector(Phi);
4208  Builder.SetInsertPoint(cast<Instruction>(PhiParts[0]));
4209 
4210  // Create a phi node for the new recurrence. The current value will either be
4211  // the initial value inserted into a vector or loop-varying vector value.
4212  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4213  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4214 
4215  // Get the vectorized previous value. We ensured the previous value was an
4216  // instruction when detecting the recurrence.
4217  auto &PreviousParts = getVectorValue(Previous);
4218 
4219  // Set the insertion point to be after this instruction. We ensured the
4220  // previous value dominated all uses of the phi when detecting the
4221  // recurrence.
4222  Builder.SetInsertPoint(
4223  &*++BasicBlock::iterator(cast<Instruction>(PreviousParts[UF - 1])));
4224 
4225  // We will construct a vector for the recurrence by combining the values for
4226  // the current and previous iterations. This is the required shuffle mask.
4227  SmallVector<Constant *, 8> ShuffleMask(VF);
4228  ShuffleMask[0] = Builder.getInt32(VF - 1);
4229  for (unsigned I = 1; I < VF; ++I)
4230  ShuffleMask[I] = Builder.getInt32(I + VF - 1);
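  // For example, assuming VF = 4, the mask built here is <3, 4, 5, 6>, so the
  // shuffle below selects the last element of the incoming vector followed by
  // the first three elements of the previous part, i.e. v3 = vector(v1(3),
  // v2(0, 1, 2)) in the shorthand notation above.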
4231 
4232  // The vector from which to take the initial value for the current iteration
4233  // (actual or unrolled). Initially, this is the vector phi node.
4234  Value *Incoming = VecPhi;
4235 
4236  // Shuffle the current and previous vector and update the vector parts.
4237  for (unsigned Part = 0; Part < UF; ++Part) {
4238  auto *Shuffle =
4239  VF > 1
4240  ? Builder.CreateShuffleVector(Incoming, PreviousParts[Part],
4241  ConstantVector::get(ShuffleMask))
4242  : Incoming;
4243  PhiParts[Part]->replaceAllUsesWith(Shuffle);
4244  cast<Instruction>(PhiParts[Part])->eraseFromParent();
4245  PhiParts[Part] = Shuffle;
4246  Incoming = PreviousParts[Part];
4247  }
4248 
4249  // Fix the latch value of the new recurrence in the vector loop.
4250  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4251 
4252  // Extract the last vector element in the middle block. This will be the
4253  // initial value for the recurrence when jumping to the scalar loop.
4254  auto *Extract = Incoming;
4255  if (VF > 1) {
4256  Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4257  Extract = Builder.CreateExtractElement(Extract, Builder.getInt32(VF - 1),
4258  "vector.recur.extract");
4259  }
4260 
4261  // Fix the initial value of the original recurrence in the scalar loop.
4262  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4263  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4264  for (auto *BB : predecessors(LoopScalarPreHeader)) {
4265  auto *Incoming = BB == LoopMiddleBlock ? Extract : ScalarInit;
4266  Start->addIncoming(Incoming, BB);
4267  }
4268 
4269  Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start);
4270  Phi->setName("scalar.recur");
4271 
4272  // Finally, fix users of the recurrence outside the loop. The users will need
4273  // either the last value of the scalar recurrence or the last value of the
4274  // vector recurrence we extracted in the middle block. Since the loop is in
4275  // LCSSA form, we just need to find the phi node for the original scalar
4276  // recurrence in the exit block, and then add an edge for the middle block.
4277  for (auto &I : *LoopExitBlock) {
4278  auto *LCSSAPhi = dyn_cast<PHINode>(&I);
4279  if (!LCSSAPhi)
4280  break;
4281  if (LCSSAPhi->getIncomingValue(0) == Phi) {
4282  LCSSAPhi->addIncoming(Extract, LoopMiddleBlock);
4283  break;
4284  }
4285  }
4286 }
4287 
4288 void InnerLoopVectorizer::fixLCSSAPHIs() {
4289  for (Instruction &LEI : *LoopExitBlock) {
4290  auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
4291  if (!LCSSAPhi)
4292  break;
4293  if (LCSSAPhi->getNumIncomingValues() == 1)
4294  LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
4295  LoopMiddleBlock);
4296  }
4297 }
4298 
4299 void InnerLoopVectorizer::collectTriviallyDeadInstructions() {
4300  BasicBlock *Latch = OrigLoop->getLoopLatch();
4301 
4302  // We create new control-flow for the vectorized loop, so the original
4303  // condition will be dead after vectorization if it's only used by the
4304  // branch.
4305  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4306  if (Cmp && Cmp->hasOneUse())
4307  DeadInstructions.insert(Cmp);
4308 
4309  // We create new "steps" for induction variable updates to which the original
4310  // induction variables map. An original update instruction will be dead if
4311  // all its users except the induction variable are dead.
4312  for (auto &Induction : *Legal->getInductionVars()) {
4313  PHINode *Ind = Induction.first;
4314  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4315  if (all_of(IndUpdate->users(), [&](User *U) -> bool {
4316  return U == Ind || DeadInstructions.count(cast<Instruction>(U));
4317  }))
4318  DeadInstructions.insert(IndUpdate);
4319  }
4320 }
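// For example (illustrative names, not from a test), if the latch ends with
//   %i.next = add i64 %i, 1
//   %cmp = icmp ult i64 %i.next, %n
//   br i1 %cmp, ...
// then %cmp is trivially dead because only the branch uses it, and %i.next is
// collected as dead too, since its only remaining user is the induction phi.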
4321 
4322 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4323 
4324  // The basic block and loop containing the predicated instruction.
4325  auto *PredBB = PredInst->getParent();
4326  auto *VectorLoop = LI->getLoopFor(PredBB);
4327 
4328  // Initialize a worklist with the operands of the predicated instruction.
4329  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4330 
4331  // Holds instructions that we need to analyze again. An instruction may be
4332  // reanalyzed if we don't yet know if we can sink it or not.
4333  SmallVector<Instruction *, 8> InstsToReanalyze;
4334 
4335  // Returns true if a given use occurs in the predicated block. Phi nodes use
4336  // their operands in their corresponding predecessor blocks.
4337  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4338  auto *I = cast<Instruction>(U.getUser());
4339  BasicBlock *BB = I->getParent();
4340  if (auto *Phi = dyn_cast<PHINode>(I))
4341  BB = Phi->getIncomingBlock(
4342  PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4343  return BB == PredBB;
4344  };
4345 
4346  // Iteratively sink the scalarized operands of the predicated instruction
4347  // into the block we created for it. When an instruction is sunk, its
4348  // operands are then added to the worklist. The algorithm ends when a pass
4349  // through the worklist doesn't sink a single instruction.
4350  bool Changed;
4351  do {
4352 
4353  // Add the instructions that need to be reanalyzed to the worklist, and
4354  // reset the changed indicator.
4355  Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4356  InstsToReanalyze.clear();
4357  Changed = false;
4358 
4359  while (!Worklist.empty()) {
4360  auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4361 
4362  // We can't sink an instruction if it is a phi node, is already in the
4363  // predicated block, is not in the loop, or may have side effects.
4364  if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4365  !VectorLoop->contains(I) || I->mayHaveSideEffects())
4366  continue;
4367 
4368  // It's legal to sink the instruction if all its uses occur in the
4369  // predicated block. Otherwise, there's nothing to do yet, and we may
4370  // need to reanalyze the instruction.
4371  if (!all_of(I->uses(), isBlockOfUsePredicated)) {
4372  InstsToReanalyze.push_back(I);
4373  continue;
4374  }
4375 
4376  // Move the instruction to the beginning of the predicated block, and add
4377  // its operands to the worklist.
4378  I->moveBefore(&*PredBB->getFirstInsertionPt());
4379  Worklist.insert(I->op_begin(), I->op_end());
4380 
4381  // The sinking may have enabled other instructions to be sunk, so we will
4382  // need to iterate.
4383  Changed = true;
4384  }
4385  } while (Changed);
4386 }
4387 
4388 void InnerLoopVectorizer::predicateInstructions() {
4389 
4390  // For each instruction I marked for predication on value C, split I into its
4391  // own basic block to form an if-then construct over C. Since I may be fed by
4392  // an extractelement instruction or other scalar operand, we try to
4393  // iteratively sink its scalar operands into the predicated block. If I feeds
4394  // an insertelement instruction, we try to move this instruction into the
4395  // predicated block as well. For non-void types, a phi node will be created
4396  // for the resulting value (either vector or scalar).
4397  //
4398  // So for some predicated instruction, e.g. the conditional sdiv in:
4399  //
4400  // for.body:
4401  // ...
4402  // %add = add nsw i32 %mul, %0
4403  // %cmp5 = icmp sgt i32 %2, 7
4404  // br i1 %cmp5, label %if.then, label %if.end
4405  //
4406  // if.then:
4407  // %div = sdiv i32 %0, %1
4408  // br label %if.end
4409  //
4410  // if.end:
4411  // %x.0 = phi i32 [ %div, %if.then ], [ %add, %for.body ]
4412  //
4413  // the sdiv at this point is scalarized and if-converted using a select.
4414  // The inactive elements in the vector are not used, but the predicated
4415  // instruction is still executed for all vector elements, essentially:
4416  //
4417  // vector.body:
4418  // ...
4419  // %17 = add nsw <2 x i32> %16, %wide.load
4420  // %29 = extractelement <2 x i32> %wide.load, i32 0
4421  // %30 = extractelement <2 x i32> %wide.load51, i32 0
4422  // %31 = sdiv i32 %29, %30
4423  // %32 = insertelement <2 x i32> undef, i32 %31, i32 0
4424  // %35 = extractelement <2 x i32> %wide.load, i32 1
4425  // %36 = extractelement <2 x i32> %wide.load51, i32 1
4426  // %37 = sdiv i32 %35, %36
4427  // %38 = insertelement <2 x i32> %32, i32 %37, i32 1
4428  // %predphi = select <2 x i1> %26, <2 x i32> %38, <2 x i32> %17
4429  //
4430  // Predication will now re-introduce the original control flow to avoid false
4431  // side-effects by the sdiv instructions on the inactive elements, yielding
4432  // (after cleanup):
4433  //
4434  // vector.body:
4435  // ...
4436  // %5 = add nsw <2 x i32> %4, %wide.load
4437  // %8 = icmp sgt <2 x i32> %wide.load52, <i32 7, i32 7>
4438  // %9 = extractelement <2 x i1> %8, i32 0
4439  // br i1 %9, label %pred.sdiv.if, label %pred.sdiv.continue
4440  //
4441  // pred.sdiv.if:
4442  // %10 = extractelement <2 x i32> %wide.load, i32 0
4443  // %11 = extractelement <2 x i32> %wide.load51, i32 0
4444  // %12 = sdiv i32 %10, %11
4445  // %13 = insertelement <2 x i32> undef, i32 %12, i32 0
4446  // br label %pred.sdiv.continue
4447  //
4448  // pred.sdiv.continue:
4449  // %14 = phi <2 x i32> [ undef, %vector.body ], [ %13, %pred.sdiv.if ]
4450  // %15 = extractelement <2 x i1> %8, i32 1
4451  // br i1 %15, label %pred.sdiv.if54, label %pred.sdiv.continue55
4452  //
4453  // pred.sdiv.if54:
4454  // %16 = extractelement <2 x i32> %wide.load, i32 1
4455  // %17 = extractelement <2 x i32> %wide.load51, i32 1
4456  // %18 = sdiv i32 %16, %17
4457  // %19 = insertelement <2 x i32> %14, i32 %18, i32 1
4458  // br label %pred.sdiv.continue55
4459  //
4460  // pred.sdiv.continue55:
4461  // %20 = phi <2 x i32> [ %14, %pred.sdiv.continue ], [ %19, %pred.sdiv.if54 ]
4462  // %predphi = select <2 x i1> %8, <2 x i32> %20, <2 x i32> %5
4463 
4464  for (auto KV : PredicatedInstructions) {
4465  BasicBlock::iterator I(KV.first);
4466  BasicBlock *Head = I->getParent();
4467  auto *BB = SplitBlock(Head, &*std::next(I), DT, LI);
4468  auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
4469  /*BranchWeights=*/nullptr, DT, LI);
4470  I->moveBefore(T);
4471  sinkScalarOperands(&*I);
4472 
4473  I->getParent()->setName(Twine("pred.") + I->getOpcodeName() + ".if");
4474  BB->setName(Twine("pred.") + I->getOpcodeName() + ".continue");
4475 
4476  // If the instruction is non-void, create a Phi node at the reconvergence point.
4477  if (!I->getType()->isVoidTy()) {
4478  Value *IncomingTrue = nullptr;
4479  Value *IncomingFalse = nullptr;
4480 
4481  if (I->hasOneUse() && isa<InsertElementInst>(*I->user_begin())) {
4482  // If the predicated instruction is feeding an insert-element, move it
4483  // into the Then block; Phi node will be created for the vector.
4484  InsertElementInst *IEI = cast<InsertElementInst>(*I->user_begin());
4485  IEI->moveBefore(T);
4486  IncomingTrue = IEI; // the new vector with the inserted element.
4487  IncomingFalse = IEI->getOperand(0); // the unmodified vector
4488  } else {
4489  // Phi node will be created for the scalar predicated instruction.
4490  IncomingTrue = &*I;
4491  IncomingFalse = UndefValue::get(I->getType());
4492  }
4493 
4494  BasicBlock *PostDom = I->getParent()->getSingleSuccessor();
4495  assert(PostDom && "Then block has multiple successors");
4496  PHINode *Phi =
4497  PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front());
4498  IncomingTrue->replaceAllUsesWith(Phi);
4499  Phi->addIncoming(IncomingFalse, Head);
4500  Phi->addIncoming(IncomingTrue, I->getParent());
4501  }
4502  }
4503 
4504  DEBUG(DT->verifyDomTree());
4505 }
4506 
4507 InnerLoopVectorizer::VectorParts
4508 InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
4509  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
4510 
4511  // Look for cached value.
4512  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
4513  EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge);
4514  if (ECEntryIt != MaskCache.end())
4515  return ECEntryIt->second;
4516 
4517  VectorParts SrcMask = createBlockInMask(Src);
4518 
4519  // The terminator has to be a branch inst!
4520  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
4521  assert(BI && "Unexpected terminator found");
4522 
4523  if (BI->isConditional()) {
4524  VectorParts EdgeMask = getVectorValue(BI->getCondition());
4525 
4526  if (BI->getSuccessor(0) != Dst)
4527  for (unsigned part = 0; part < UF; ++part)
4528  EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
4529 
4530  for (unsigned part = 0; part < UF; ++part)
4531  EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
4532 
4533  MaskCache[Edge] = EdgeMask;
4534  return EdgeMask;
4535  }
4536 
4537  MaskCache[Edge] = SrcMask;
4538  return SrcMask;
4539 }
4540 
4541 InnerLoopVectorizer::VectorParts
4542 InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
4543  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
4544 
4545  // Loop incoming mask is all-one.
4546  if (OrigLoop->getHeader() == BB) {
4547  Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
4548  return getVectorValue(C);
4549  }
4550 
4551  // This is the block mask. We OR all incoming edge masks, starting from zero.
4552  Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
4553  VectorParts BlockMask = getVectorValue(Zero);
4554 
4555  // For each pred:
4556  for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
4557  VectorParts EM = createEdgeMask(*it, BB);
4558  for (unsigned part = 0; part < UF; ++part)
4559  BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
4560  }
4561 
4562  return BlockMask;
4563 }
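// As a shorthand sketch (block names illustrative), for a simple if-then
// diamond inside the loop, where header branches on cond to if.then or
// directly to latch:
//
//   edge(header, if.then) = cond      AND mask(header)
//   edge(header, latch)   = (NOT cond) AND mask(header)
//
// and the block mask of latch ORs the masks of its two incoming edges, which
// simplifies back to mask(header).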
4564 
4565 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4566  unsigned VF, PhiVector *PV) {
4567  PHINode *P = cast<PHINode>(PN);
4568  // Handle recurrences.
4569  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4570  VectorParts Entry(UF);
4571  for (unsigned part = 0; part < UF; ++part) {
4572  // This is phase one of vectorizing PHIs.
4573  Type *VecTy =
4574  (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4575  Entry[part] = PHINode::Create(
4576  VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4577  }
4578  VectorLoopValueMap.initVector(P, Entry);
4579  PV->push_back(P);
4580  return;
4581  }
4582 
4583  setDebugLocFromInst(Builder, P);
4584  // Check for PHI nodes that are lowered to vector selects.
4585  if (P->getParent() != OrigLoop->getHeader()) {
4586  // We know that all PHIs in non-header blocks are converted into
4587  // selects, so we don't have to worry about the insertion order and we
4588  // can just use the builder.
4589  // At this point we generate the predication tree. There may be
4590  // duplications since this is a simple recursive scan, but future
4591  // optimizations will clean it up.
4592 
4593  unsigned NumIncoming = P->getNumIncomingValues();
4594 
4595  // Generate a sequence of selects of the form:
4596  // SELECT(Mask3, In3,
4597  // SELECT(Mask2, In2,
4598  // ( ...)))
4599  VectorParts Entry(UF);
4600  for (unsigned In = 0; In < NumIncoming; In++) {
4601  VectorParts Cond =
4602  createEdgeMask(P->getIncomingBlock(In), P->getParent());
4603  const VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
4604 
4605  for (unsigned part = 0; part < UF; ++part) {
4606  // We might have single edge PHIs (blocks) - use an identity
4607  // 'select' for the first PHI operand.
4608  if (In == 0)
4609  Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In0[part]);
4610  else
4611  // Select between the current value and the previous incoming edge
4612  // based on the incoming mask.
4613  Entry[part] = Builder.CreateSelect(Cond[part], In0[part], Entry[part],
4614  "predphi");
4615  }
4616  }
4617  VectorLoopValueMap.initVector(P, Entry);
4618  return;
4619  }
4620 
4621  // This PHINode must be an induction variable.
4622  // Make sure that we know about it.
4623  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
4624 
4625  InductionDescriptor II = Legal->getInductionVars()->lookup(P);
4626  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4627 
4628  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4629  // which can be found from the original scalar operations.
4630  switch (II.getKind()) {
4631  case InductionDescriptor::IK_NoInduction:
4632  llvm_unreachable("Unknown induction");
4633  case InductionDescriptor::IK_IntInduction:
4634  return widenIntInduction(P);
4635  case InductionDescriptor::IK_PtrInduction: {
4636  // Handle the pointer induction variable case.
4637  assert(P->getType()->isPointerTy() && "Unexpected type.");
4638  // This is the normalized GEP that starts counting at zero.
4639  Value *PtrInd = Induction;
4640  PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4641  // Determine the number of scalars we need to generate for each unroll
4642  // iteration. If the instruction is uniform, we only need to generate the
4643  // first lane. Otherwise, we generate all VF values.
4644  unsigned Lanes = Legal->isUniformAfterVectorization(P) ? 1 : VF;
4645  // These are the scalar results. Notice that we don't generate vector GEPs
4646  // because scalar GEPs result in better code.
4647  ScalarParts Entry(UF);
4648  for (unsigned Part = 0; Part < UF; ++Part) {
4649  Entry[Part].resize(VF);
4650  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4651  Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4652  Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4653  Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
4654  SclrGep->setName("next.gep");
4655  Entry[Part][Lane] = SclrGep;
4656  }
4657  }
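  // For instance, assuming VF = 4, UF = 2, and a non-uniform pointer
  // induction, the two loops above produce eight scalar "next.gep" values,
  // one per lane and unrolled part, at consecutive indices (the normalized
  // induction plus 0 through 7), each formed via II.transform.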
4658  VectorLoopValueMap.initScalar(P, Entry);
4659  return;
4660  }
4661  case InductionDescriptor::IK_FpInduction: {
4662  assert(P->getType() == II.getStartValue()->getType() &&
4663  "Types must match");
4664  // Handle other induction variables that are now based on the
4665  // canonical one.
4666  assert(P != OldInduction && "Primary induction can be integer only");
4667 
4668  Value *V = Builder.CreateCast(Instruction::SIToFP, Induction, P->getType());
4669  V = II.transform(Builder, V, PSE.getSE(), DL);
4670  V->setName("fp.offset.idx");
4671 
4672  // Now we have scalar op: %fp.offset.idx = StartVal +/- Induction*StepVal
4673 
4674  Value *Broadcasted = getBroadcastInstrs(V);
4675  // After broadcasting the induction variable we need to make the vector
4676  // consecutive by adding StepVal*0, StepVal*1, StepVal*2, etc.
4677  Value *StepVal = cast<SCEVUnknown>(II.getStep())->getValue();
4678  VectorParts Entry(UF);
4679  for (unsigned part = 0; part < UF; ++part)
4680  Entry[part] = getStepVector(Broadcasted, VF * part, StepVal,
4681  II.getInductionOpcode());
4682  VectorLoopValueMap.initVector(P, Entry);
4683  return;
4684  }
4685  }
4686 }
4687 
4688 /// A helper function for checking whether an integer division-related
4689 /// instruction may divide by zero (in which case it must be predicated if
4690 /// executed conditionally in the scalar code).
4691 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4692 /// Non-zero divisors that are not compile-time constants will not be
4693 /// converted into multiplication, so we will still end up scalarizing
4694 /// the division, but can do so without predication.
4695 static bool mayDivideByZero(Instruction &I) {
4696  assert((I.getOpcode() == Instruction::UDiv ||
4697  I.getOpcode() == Instruction::SDiv ||
4698  I.getOpcode() == Instruction::URem ||
4699  I.getOpcode() == Instruction::SRem) &&
4700  "Unexpected instruction");
4701  Value *Divisor = I.getOperand(1);
4702  auto *CInt = dyn_cast<ConstantInt>(Divisor);
4703  return !CInt || CInt->isZero();
4704 }
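// For example (illustrative IR, not from a test), "%d = sdiv i32 %x, %n" with
// a non-constant %n may divide by zero and must be predicated when executed
// conditionally, whereas "%d = sdiv i32 %x, 7" can be scalarized without
// predication.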
4705 
4706 void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
4707  // For each instruction in the old loop.
4708  for (Instruction &I : *BB) {
4709 
4710  // If the instruction will become trivially dead when vectorized, we don't
4711  // need to generate it.
4712  if (DeadInstructions.count(&I))
4713  continue;
4714 
4715  // Scalarize instructions that should remain scalar after vectorization.
4716  if (VF > 1 &&
4717  !(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
4718  isa<DbgInfoIntrinsic>(&I)) &&
4719  shouldScalarizeInstruction(&I)) {
4720  scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
4721  continue;
4722  }
4723 
4724  switch (I.getOpcode()) {
4725  case Instruction::Br:
4726  // Nothing to do for PHIs and BR, since we already took care of the
4727  // loop control flow instructions.
4728  continue;
4729  case Instruction::PHI: {
4730  // Vectorize PHINodes.
4731  widenPHIInstruction(&I, UF, VF, PV);
4732  continue;
4733  } // End of PHI.
4734 
4735  case Instruction::UDiv:
4736  case Instruction::SDiv:
4737  case Instruction::SRem:
4738  case Instruction::URem:
4739  // Scalarize with predication if this instruction may divide by zero and
4740  // block execution is conditional, otherwise fallthrough.
4741  if (Legal->isScalarWithPredication(&I)) {
4742  scalarizeInstruction(&I, true);
4743  continue;
4744  }
4745  case Instruction::Add:
4746  case Instruction::FAdd:
4747  case Instruction::Sub:
4748  case Instruction::FSub:
4749  case Instruction::Mul:
4750  case Instruction::FMul:
4751  case Instruction::FDiv:
4752  case Instruction::FRem:
4753  case Instruction::Shl:
4754  case Instruction::LShr:
4755  case Instruction::AShr:
4756  case Instruction::And:
4757  case Instruction::Or:
4758  case Instruction::Xor: {
4759  // Just widen binops.
4760  auto *BinOp = cast<BinaryOperator>(&I);
4761  setDebugLocFromInst(Builder, BinOp);
4762  const VectorParts &A = getVectorValue(BinOp->getOperand(0));
4763  const VectorParts &B = getVectorValue(BinOp->getOperand(1));
4764 
4765  // Use this vector value for all users of the original instruction.
4766  VectorParts Entry(UF);
4767  for (unsigned Part = 0; Part < UF; ++Part) {
4768  Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
4769 
4770  if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
4771  VecOp->copyIRFlags(BinOp);
4772 
4773  Entry[Part] = V;
4774  }
4775 
4776  VectorLoopValueMap.initVector(&I, Entry);
4777  addMetadata(Entry, BinOp);
4778  break;
4779  }
4780  case Instruction::Select: {
4781  // Widen selects.
4782  // If the selector is loop invariant we can create a select
4783  // instruction with a scalar condition. Otherwise, use vector-select.
4784  auto *SE = PSE.getSE();
4785  bool InvariantCond =
4786  SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4787  setDebugLocFromInst(Builder, &I);
4788 
4789  // The condition can be loop invariant but still defined inside the
4790  // loop. This means that we can't just use the original 'cond' value.
4791  // We have to take the 'vectorized' value and pick the first lane.
4792  // Instcombine will make this a no-op.
4793  const VectorParts &Cond = getVectorValue(I.getOperand(0));
4794  const VectorParts &Op0 = getVectorValue(I.getOperand(1));
4795  const VectorParts &Op1 = getVectorValue(I.getOperand(2));
4796 
4797  auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0);
4798 
4799  VectorParts Entry(UF);
4800  for (unsigned Part = 0; Part < UF; ++Part) {
4801  Entry[Part] = Builder.CreateSelect(
4802  InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]);
4803  }
4804 
4805  VectorLoopValueMap.initVector(&I, Entry);
4806  addMetadata(Entry, &I);
4807  break;
4808  }
4809 
4810  case Instruction::ICmp:
4811  case Instruction::FCmp: {
4812  // Widen compares. Generate vector compares.
4813  bool FCmp = (I.getOpcode() == Instruction::FCmp);
4814  auto *Cmp = dyn_cast<CmpInst>(&I);
4815  setDebugLocFromInst(Builder, Cmp);
4816  const VectorParts &A = getVectorValue(Cmp->getOperand(0));
4817  const VectorParts &B = getVectorValue(Cmp->getOperand(1));
4818  VectorParts Entry(UF);
4819  for (unsigned Part = 0; Part < UF; ++Part) {
4820  Value *C = nullptr;
4821  if (FCmp) {
4822  C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
4823  cast<FCmpInst>(C)->copyFastMathFlags(Cmp);
4824  } else {
4825  C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
4826  }
4827  Entry[Part] = C;
4828  }
4829 
4830  VectorLoopValueMap.initVector(&I, Entry);
4831  addMetadata(Entry, &I);
4832  break;
4833  }
4834 
4835  case Instruction::Store:
4836  case Instruction::Load:
4837  vectorizeMemoryInstruction(&I);
4838  break;
4839  case Instruction::ZExt:
4840  case Instruction::SExt:
4841  case Instruction::FPToUI:
4842  case Instruction::FPToSI:
4843  case Instruction::FPExt:
4844  case Instruction::PtrToInt:
4845  case Instruction::IntToPtr:
4846  case Instruction::SIToFP:
4847  case Instruction::UIToFP:
4848  case Instruction::Trunc:
4849  case Instruction::FPTrunc:
4850  case Instruction::BitCast: {
4851  auto *CI = dyn_cast<CastInst>(&I);
4852  setDebugLocFromInst(Builder, CI);
4853 
4854  // Optimize the special case where the source is a constant integer
4855  // induction variable. Notice that we can only optimize the 'trunc' case
4856  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
4857  // (c) other casts depend on pointer size.
4858  auto ID = Legal->getInductionVars()->lookup(OldInduction);
4859  if (isa<TruncInst>(CI) && CI->getOperand(0) == OldInduction &&
4860  ID.getConstIntStepValue()) {
4861  widenIntInduction(OldInduction, cast<TruncInst>(CI));
4862  break;
4863  }
4864 
4865  /// Vectorize casts.
4866  Type *DestTy =
4867  (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4868 
4869  const VectorParts &A = getVectorValue(CI->getOperand(0));
4870  VectorParts Entry(UF);
4871  for (unsigned Part = 0; Part < UF; ++Part)
4872  Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
4873  VectorLoopValueMap.initVector(&I, Entry);
4874  addMetadata(Entry, &I);
4875  break;
4876  }
4877 
4878  case Instruction::Call: {
4879  // Ignore dbg intrinsics.
4880  if (isa<DbgInfoIntrinsic>(I))
4881  break;
4882  setDebugLocFromInst(Builder, &I);
4883 
4884  Module *M = BB->getParent()->getParent();
4885  auto *CI = cast<CallInst>(&I);
4886 
4887  StringRef FnName = CI->getCalledFunction()->getName();
4888  Function *F = CI->getCalledFunction();
4889  Type *RetTy = ToVectorTy(CI->getType(), VF);
4890  SmallVector<Type *, 4> Tys;
4891  for (Value *ArgOperand : CI->arg_operands())
4892  Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4893 
4894  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4895  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
4896  ID == Intrinsic::lifetime_start)) {
4897  scalarizeInstruction(&I);
4898  break;
4899  }
4900  // This flag shows whether we use an intrinsic or an ordinary call for the
4901  // vectorized version of the instruction, i.e. whether an intrinsic call is
4902  // more beneficial than a library call.
4903  bool NeedToScalarize;
4904  unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
4905  bool UseVectorIntrinsic =
4906  ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
4907  if (!UseVectorIntrinsic && NeedToScalarize) {
4908  scalarizeInstruction(&I);
4909  break;
4910  }
4911 
4912  VectorParts Entry(UF);
4913  for (unsigned Part = 0; Part < UF; ++Part) {
4914  SmallVector<Value *, 4> Args;
4915  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4916  Value *Arg = CI->getArgOperand(i);
4917  // Some intrinsics have a scalar argument - don't replace it with a
4918  // vector.
4919  if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
4920  const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
4921  Arg = VectorArg[Part];
4922  }
4923  Args.push_back(Arg);
4924  }
4925 
4926  Function *VectorF;
4927  if (UseVectorIntrinsic) {
4928  // Use vector version of the intrinsic.
4929  Type *TysForDecl[] = {CI->getType()};
4930  if (VF > 1)
4931  TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4932  VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4933  } else {
4934  // Use vector version of the library call.
4935  StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4936  assert(!VFnName.empty() && "Vector function name is empty.");
4937  VectorF = M->getFunction(VFnName);
4938  if (!VectorF) {
4939  // Generate a declaration
4940  FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4941  VectorF =
4942  Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4943  VectorF->copyAttributesFrom(F);
4944  }
4945  }
4946  assert(VectorF && "Can't create vector function.");
4947 
4948  SmallVector<OperandBundleDef, 1> OpBundles;
4949  CI->getOperandBundlesAsDefs(OpBundles);
4950  CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4951 
4952  if (isa<FPMathOperator>(V))
4953  V->copyFastMathFlags(CI);
4954 
4955  Entry[Part] = V;
4956  }
4957 
4958  VectorLoopValueMap.initVector(&I, Entry);
4959  addMetadata(Entry, &I);
4960  break;
4961  }
4962 
4963  default:
4964  // All other instructions are unsupported. Scalarize them.
4965  scalarizeInstruction(&I);
4966  break;
4967  } // end of switch.
4968  } // end of for_each instr.
4969 }
4970 
4971 void InnerLoopVectorizer::updateAnalysis() {
4972  // Forget the original basic block.
4973  PSE.getSE()->forgetLoop(OrigLoop);
4974 
4975  // Update the dominator tree information.
4976  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4977  "Entry does not dominate exit.");
4978 
4979  // We don't predicate stores by this point, so the vector body should be a
4980  // single loop.
4981  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
4982 
4983  DT->addNewBlock(LoopMiddleBlock, LoopVectorBody);
4984  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4985  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4986  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4987 
4988  DEBUG(DT->verifyDomTree());
4989 }
4990 
4991 /// \brief Check whether it is safe to if-convert this phi node.
4992 ///
4993 /// Phi nodes with constant expressions that can trap are not safe to if
4994 /// convert.
4995 static bool canIfConvertPHINodes(BasicBlock *BB) {
4996  for (Instruction &I : *BB) {
4997  auto *Phi = dyn_cast<PHINode>(&I);
4998  if (!Phi)
4999  return true;
5000  for (Value *V : Phi->incoming_values())
5001  if (auto *C = dyn_cast<Constant>(V))
5002  if (C->canTrap())
5003  return false;
5004  }
5005  return true;
5006 }
5007 
5008 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
5009  if (!EnableIfConversion) {
5010  ORE->emit(createMissedAnalysis("IfConversionDisabled")
5011  << "if-conversion is disabled");
5012  return false;
5013  }
5014 
5015  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
5016 
5017  // A list of pointers that we can safely read and write to.
5018  SmallPtrSet<Value *, 8> SafePointes;
5019 
5020  // Collect safe addresses.
5021  for (BasicBlock *BB : TheLoop->blocks()) {
5022  if (blockNeedsPredication(BB))
5023  continue;
5024 
5025  for (Instruction &I : *BB)
5026  if (auto *Ptr = getPointerOperand(&I))
5027  SafePointes.insert(Ptr);
5028  }
5029 
5030  // Collect the blocks that need predication.
5031  BasicBlock *Header = TheLoop->getHeader();
5032  for (BasicBlock *BB : TheLoop->blocks()) {
5033  // We don't support switch statements inside loops.
5034  if (!isa<BranchInst>(BB->getTerminator())) {
5035  ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator())
5036  << "loop contains a switch statement");
5037  return false;
5038  }
5039 
5040  // We must be able to predicate all blocks that need to be predicated.
5041  if (blockNeedsPredication(BB)) {
5042  if (!blockCanBePredicated(BB, SafePointes)) {
5043  ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
5044  << "control flow cannot be substituted for a select");
5045  return false;
5046  }
5047  } else if (BB != Header && !canIfConvertPHINodes(BB)) {
5048  ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
5049  << "control flow cannot be substituted for a select");
5050  return false;
5051  }
5052  }
5053 
5054  // We can if-convert this loop.
5055  return true;
5056 }
5057 
5058 bool LoopVectorizationLegality::canVectorize() {
5059  // We must have a loop in canonical form. Loops with indirectbr in them cannot
5060  // be canonicalized.
5061  if (!TheLoop->getLoopPreheader()) {
5062  ORE->emit(createMissedAnalysis("CFGNotUnderstood")
5063  << "loop control flow is not understood by vectorizer");
5064  return false;
5065  }
5066 
5067  // FIXME: This check is currently dead code, since any loop that gets sent to
5068  // LoopVectorizationLegality is already an innermost loop.
5069  //
5070  // We can only vectorize innermost loops.
5071  if (!TheLoop->empty()) {
5072  ORE->emit(createMissedAnalysis("NotInnermostLoop")
5073  << "loop is not the innermost loop");
5074  return false;
5075  }
5076 
5077  // We must have a single backedge.
5078  if (TheLoop->getNumBackEdges() != 1) {
5079  ORE->emit(createMissedAnalysis("CFGNotUnderstood")
5080  << "loop control flow is not understood by vectorizer");
5081  return false;
5082  }
5083 
5084  // We must have a single exiting block.
5085  if (!TheLoop->getExitingBlock()) {
5086  ORE->emit(createMissedAnalysis("CFGNotUnderstood")
5087  << "loop control flow is not understood by vectorizer");
5088  return false;
5089  }
5090 
5091  // We only handle bottom-tested loops, i.e. loops in which the condition is
5092  // checked at the end of each iteration. With that we can assume that all
5093  // instructions in the loop are executed the same number of times.
5094  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5095  ORE->emit(createMissedAnalysis("CFGNotUnderstood")
5096  << "loop control flow is not understood by vectorizer");
5097  return false;
5098  }
5099 
5100  // We need to have a loop header.
5101  DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
5102  << '\n');
5103 
5104  // Check if we can if-convert non-single-bb loops.
5105  unsigned NumBlocks = TheLoop->getNumBlocks();
5106  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
5107  DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
5108  return false;
5109  }
5110 
5111  // ScalarEvolution needs to be able to find the exit count.
5112  const SCEV *ExitCount = PSE.getBackedgeTakenCount();
5113  if (ExitCount == PSE.getSE()->getCouldNotCompute()) {
5114  ORE->emit(createMissedAnalysis("CantComputeNumberOfIterations")
5115  << "could not determine number of loop iterations");
5116  DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
5117  return false;
5118  }
5119 
5120  // Check if we can vectorize the instructions and CFG in this loop.
5121  if (!canVectorizeInstrs()) {
5122  DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
5123  return false;
5124  }
5125 
5126  // Go over each instruction and look at memory deps.
5127  if (!canVectorizeMemory()) {
5128  DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
5129  return false;
5130  }
5131 
5132  DEBUG(dbgs() << "LV: We can vectorize this loop"
5133  << (LAI->getRuntimePointerChecking()->Need
5134  ? " (with a runtime bound check)"
5135  : "")
5136  << "!\n");
5137 
5138  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
5139 
5140  // If an override option has been passed in for interleaved accesses, use it.
5141  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
5142  UseInterleaved = EnableInterleavedMemAccesses;
5143 
5144  // Analyze interleaved memory accesses.
5145  if (UseInterleaved)
5146  InterleaveInfo.analyzeInterleaving(*getSymbolicStrides());
5147 
5148  // Collect all instructions that are known to be uniform after vectorization.
5149  collectLoopUniforms();
5150 
5151  // Collect all instructions that are known to be scalar after vectorization.
5152  collectLoopScalars();
5153 
5154  unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
5155  if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
5156  SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
5157 
5158  if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
5159  ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks")
5160  << "Too many SCEV assumptions need to be made and checked "
5161  << "at runtime");
5162  DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
5163  return false;
5164  }
5165 
5166  // Okay! We can vectorize. At this point we don't have any other mem analysis
5167  // which may limit our maximum vectorization factor, so just return true with
5168  // no restrictions.
5169  return true;
5170 }
5171 
5172 static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
5173  if (Ty->isPointerTy())
5174  return DL.getIntPtrType(Ty);
5175 
5176  // It is possible that chars or shorts overflow when we ask for the loop's
5177  // trip count; work around this by changing the type size.
5178  if (Ty->getScalarSizeInBits() < 32)
5179  return Type::getInt32Ty(Ty->getContext());
5180 
5181  return Ty;
5182 }
5183 
5184 static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
5185  Ty0 = convertPointerToIntegerType(DL, Ty0);
5186  Ty1 = convertPointerToIntegerType(DL, Ty1);
5187  if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
5188  return Ty0;
5189  return Ty1;
5190 }
5191 
5192 /// \brief Check that the instruction has outside loop users and is not an
5193 /// identified reduction variable.
5194 static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
5195  SmallPtrSetImpl<Value *> &AllowedExit) {
5196  // Reduction and Induction instructions are allowed to have exit users. All
5197  // other instructions must not have external users.
5198  if (!AllowedExit.count(Inst))
5199  // Check that all of the users of the loop are inside the BB.
5200  for (User *U : Inst->users()) {
5201  Instruction *UI = cast<Instruction>(U);
5202  // This user may be a reduction exit value.
5203  if (!TheLoop->contains(UI)) {
5204  DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
5205  return true;
5206  }
5207  }
5208  return false;
5209 }
5210 
5211 void LoopVectorizationLegality::addInductionPhi(
5212  PHINode *Phi, const InductionDescriptor &ID,
5213  SmallPtrSetImpl<Value *> &AllowedExit) {
5214  Inductions[Phi] = ID;
5215  Type *PhiTy = Phi->getType();
5216  const DataLayout &DL = Phi->getModule()->getDataLayout();
5217 
5218  // Get the widest type.
5219  if (!PhiTy->isFloatingPointTy()) {
5220  if (!WidestIndTy)
5221  WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
5222  else
5223  WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
5224  }
5225 
5226  // Int inductions are special because we only allow one IV.
5227  if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
5228  ID.getConstIntStepValue() &&
5229  ID.getConstIntStepValue()->isOne() &&
5230  isa<Constant>(ID.getStartValue()) &&
5231  cast<Constant>(ID.getStartValue())->isNullValue()) {
5232 
5233  // Use the phi node with the widest type as induction. Use the last
5234  // one if there are multiple (no good reason for doing this other
5235  // than it is expedient). We've checked that it begins at zero and
5236  // steps by one, so this is a canonical induction variable.
5237  if (!Induction || PhiTy == WidestIndTy)
5238  Induction = Phi;
5239  }
5240 
5241  // Both the PHI node itself, and the "post-increment" value feeding
5242  // back into the PHI node may have external users.
5243  AllowedExit.insert(Phi);
5244  AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
5245 
5246  DEBUG(dbgs() << "LV: Found an induction variable.\n");
5247  return;
5248 }
5249 
5250 bool LoopVectorizationLegality::canVectorizeInstrs() {
5251  BasicBlock *Header = TheLoop->getHeader();
5252 
5253  // Look for the attribute signaling the absence of NaNs.
5254  Function &F = *Header->getParent();
5255  HasFunNoNaNAttr =
5256  F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
5257 
5258  // For each block in the loop.
5259  for (BasicBlock *BB : TheLoop->blocks()) {
5260  // Scan the instructions in the block and look for hazards.
5261  for (Instruction &I : *BB) {
5262  if (auto *Phi = dyn_cast<PHINode>(&I)) {
5263  Type *PhiTy = Phi->getType();
5264  // Check that this PHI type is allowed.
5265  if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
5266  !PhiTy->isPointerTy()) {
5267  ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
5268  << "loop control flow is not understood by vectorizer");
5269  DEBUG(dbgs() << "LV: Found a non-int non-pointer PHI.\n");
5270  return false;
5271  }
5272 
5273  // If this PHINode is not in the header block, then we know that we
5274  // can convert it to select during if-conversion. No need to check if
5275  // the PHIs in this block are induction or reduction variables.
5276  if (BB != Header) {
5277  // Check that this instruction has no outside users or is an
5278  // identified reduction value with an outside user.
5279  if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit))
5280  continue;
5281  ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi)
5282  << "value could not be identified as "
5283  "an induction or reduction variable");
5284  return false;
5285  }
5286 
5287  // We only allow if-converted PHIs with exactly two incoming values.
5288  if (Phi->getNumIncomingValues() != 2) {
5289  ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
5290  << "control flow not understood by vectorizer");
5291  DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
5292  return false;
5293  }
5294 
5295  RecurrenceDescriptor RedDes;
5296  if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) {
5297  if (RedDes.hasUnsafeAlgebra())
5298  Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
5299  AllowedExit.insert(RedDes.getLoopExitInstr());
5300  Reductions[Phi] = RedDes;
5301  continue;
5302  }
5303 
5304  InductionDescriptor ID;
5305  if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
5306  addInductionPhi(Phi, ID, AllowedExit);
5307  if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
5308  Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
5309  continue;
5310  }
5311 
5312  if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, DT)) {
5313  FirstOrderRecurrences.insert(Phi);
5314  continue;
5315  }
5316 
5317  // As a last resort, coerce the PHI to an AddRec expression
5318  // and re-try classifying it as an induction PHI.
5319  if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
5320  addInductionPhi(Phi, ID, AllowedExit);
5321  continue;
5322  }
5323 
5324  ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi)
5325  << "value that could not be identified as "
5326  "reduction is used outside the loop");
5327  DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n");
5328  return false;
5329  } // end of PHI handling
5330 
5331  // We handle calls that:
5332  // * Are debug info intrinsics.
5333  // * Have a mapping to an IR intrinsic.
5334  // * Have a vector version available.
5335  auto *CI = dyn_cast<CallInst>(&I);
5336  if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
5337  !isa<DbgInfoIntrinsic>(CI) &&
5338  !(CI->getCalledFunction() && TLI &&
5339  TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
5340  ORE->emit(createMissedAnalysis("CantVectorizeCall", CI)
5341  << "call instruction cannot be vectorized");
5342  DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
5343  return false;
5344  }
5345 
5346  // Intrinsics such as powi, cttz and ctlz are legal to vectorize if the
5347  // second argument is the same (i.e. loop invariant).
5348  if (CI && hasVectorInstrinsicScalarOpd(
5349  getVectorIntrinsicIDForCall(CI, TLI), 1)) {
5350  auto *SE = PSE.getSE();
5351  if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
5352  ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI)
5353  << "intrinsic instruction cannot be vectorized");
5354  DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
5355  return false;
5356  }
5357  }
5358 
5359  // Check that the instruction return type is vectorizable.
5360  // Also, we can't vectorize extractelement instructions.
5361  if ((!VectorType::isValidElementType(I.getType()) &&
5362  !I.getType()->isVoidTy()) ||
5363  isa<ExtractElementInst>(I)) {
5364  ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I)
5365  << "instruction return type cannot be vectorized");
5366  DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
5367  return false;
5368  }
5369 
5370  // Check that the stored type is vectorizable.
5371  if (auto *ST = dyn_cast<StoreInst>(&I)) {
5372  Type *T = ST->getValueOperand()->getType();
5373  if (!VectorType::isValidElementType(T)) {
5374  ORE->emit(createMissedAnalysis("CantVectorizeStore", ST)
5375  << "store instruction cannot be vectorized");
5376  return false;
5377  }
5378 
5379  // FP instructions can allow unsafe algebra, thus vectorizable by
5380  // non-IEEE-754 compliant SIMD units.
5381  // This applies to floating-point math operations and calls, not memory
5382  // operations, shuffles, or casts, as they don't change precision or
5383  // semantics.
5384  } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
5385  !I.hasUnsafeAlgebra()) {
5386  DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
5387  Hints->setPotentiallyUnsafe();
5388  }
5389 
5390  // Reduction instructions are allowed to have exit users.
5391  // All other instructions must not have external users.
5392  if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
5393  ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I)
5394  << "value cannot be used outside the loop");
5395  return false;
5396  }
5397 
5398  } // next instr.
5399  }
5400 
5401  if (!Induction) {
5402  DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
5403  if (Inductions.empty()) {
5404  ORE->emit(createMissedAnalysis("NoInductionVariable")
5405  << "loop induction variable could not be identified");
5406  return false;
5407  }
5408  }
5409 
5410  // Now we know the widest induction type, check if our found induction
5411  // is the same size. If it's not, unset it here and InnerLoopVectorizer
5412  // will create another.
5413  if (Induction && WidestIndTy != Induction->getType())
5414  Induction = nullptr;
5415 
5416  return true;
5417 }
5418 
5419 void LoopVectorizationLegality::collectLoopScalars() {
5420 
5421  // If an instruction is uniform after vectorization, it will remain scalar.
5422  Scalars.insert(Uniforms.begin(), Uniforms.end());
5423 
5424  // Collect the getelementptr instructions that will not be vectorized. A
5425  // getelementptr instruction is only vectorized if it is used for a legal
5426  // gather or scatter operation.
5427  for (auto *BB : TheLoop->blocks())
5428  for (auto &I : *BB) {
5429  if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
5430  Scalars.insert(GEP);
5431  continue;
5432  }
5433  auto *Ptr = getPointerOperand(&I);
5434  if (!Ptr)
5435  continue;
5436  auto *GEP = getGEPInstruction(Ptr);
5437  if (GEP && isLegalGatherOrScatter(&I))
5438  Scalars.erase(GEP);
5439  }
5440 
5441  // An induction variable will remain scalar if all users of the induction
5442  // variable and induction variable update remain scalar.
5443  auto *Latch = TheLoop->getLoopLatch();
5444  for (auto &Induction : *getInductionVars()) {
5445  auto *Ind = Induction.first;
5446  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5447 
5448  // Determine if all users of the induction variable are scalar after
5449  // vectorization.
5450  auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
5451  auto *I = cast<Instruction>(U);
5452  return I == IndUpdate || !TheLoop->contains(I) || Scalars.count(I);
5453  });
5454  if (!ScalarInd)
5455  continue;
5456 
5457  // Determine if all users of the induction variable update instruction are
5458  // scalar after vectorization.
5459  auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
5460  auto *I = cast<Instruction>(U);
5461  return I == Ind || !TheLoop->contains(I) || Scalars.count(I);
5462  });
5463  if (!ScalarIndUpdate)
5464  continue;
5465 
5466  // The induction variable and its update instruction will remain scalar.
5467  Scalars.insert(Ind);
5468  Scalars.insert(IndUpdate);
5469  }
5470 }
5471 
5472 bool LoopVectorizationLegality::hasConsecutiveLikePtrOperand(Instruction *I) {
5473  if (isAccessInterleaved(I))
5474  return true;
5475  if (auto *Ptr = getPointerOperand(I))
5476  return isConsecutivePtr(Ptr);
5477  return false;
5478 }
5479 
5480 bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
5481  if (!blockNeedsPredication(I->getParent()))
5482  return false;
5483  switch(I->getOpcode()) {
5484  default:
5485  break;
5486  case Instruction::Store:
5487  return !isMaskRequired(I);
5488  case Instruction::UDiv:
5489  case Instruction::SDiv:
5490  case Instruction::SRem:
5491  case Instruction::URem:
5492  return mayDivideByZero(*I);
5493  }
5494  return false;
5495 }
5496 
5497 bool LoopVectorizationLegality::memoryInstructionMustBeScalarized(
5498  Instruction *I, unsigned VF) {
5499 
5500  // If the memory instruction is in an interleaved group, it will be
5501  // vectorized and its pointer will remain uniform.
5502  if (isAccessInterleaved(I))
5503  return false;
5504 
5505  // Get and ensure we have a valid memory instruction.
5506  LoadInst *LI = dyn_cast<LoadInst>(I);
5507  StoreInst *SI = dyn_cast<StoreInst>(I);
5508  assert((LI || SI) && "Invalid memory instruction");
5509 
5510  // If the pointer operand is uniform (loop invariant), the memory instruction
5511  // will be scalarized.
5512  auto *Ptr = getPointerOperand(I);
5513  if (LI && isUniform(Ptr))
5514  return true;
5515 
5516  // If the pointer operand is non-consecutive and neither a gather nor a
5517  // scatter operation is legal, the memory instruction will be scalarized.
5518  if (!isConsecutivePtr(Ptr) && !isLegalGatherOrScatter(I))
5519  return true;
5520 
5521  // If the instruction is a store located in a predicated block, it will be
5522  // scalarized.
5523  if (isScalarWithPredication(I))
5524  return true;
5525 
5526  // If the instruction's allocated size doesn't equal its type size, it
5527  // requires padding and will be scalarized.
5528  auto &DL = I->getModule()->getDataLayout();
5529  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5530  if (hasIrregularType(ScalarTy, DL, VF))
5531  return true;
5532 
5533  // Otherwise, the memory instruction should be vectorized if the rest of the
5534  // loop is.
5535  return false;
5536 }
5537 
5538 void LoopVectorizationLegality::collectLoopUniforms() {
5539  // We now know that the loop is vectorizable!
5540  // Collect instructions inside the loop that will remain uniform after
5541  // vectorization.
5542 
5543  // Global values, params and instructions outside of current loop are out of
5544  // scope.
5545  auto isOutOfScope = [&](Value *V) -> bool {
5546  Instruction *I = dyn_cast<Instruction>(V);
5547  return (!I || !TheLoop->contains(I));
5548  };
5549 
5550  SetVector<Instruction *> Worklist;
5551  BasicBlock *Latch = TheLoop->getLoopLatch();
5552 
5553  // Start with the conditional branch. If the branch condition is an
5554  // instruction contained in the loop that is only used by the branch, it is
5555  // uniform.
5556  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5557  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
5558  Worklist.insert(Cmp);
5559  DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
5560  }
5561 
5562  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
5563  // are pointers that are treated like consecutive pointers during
5564  // vectorization. The pointer operands of interleaved accesses are an
5565  // example.
5566  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
5567 
5568  // Holds pointer operands of instructions that are possibly non-uniform.
5569  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
5570 
5571  // Iterate over the instructions in the loop, and collect all
5572  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
5573  // that a consecutive-like pointer operand will be scalarized, we collect it
5574  // in PossibleNonUniformPtrs instead. We use two sets here because a single
5575  // getelementptr instruction can be used by both vectorized and scalarized
5576  // memory instructions. For example, if a loop loads and stores from the same
5577  // location, but the store is conditional, the store will be scalarized, and
5578  // the getelementptr won't remain uniform.
5579  for (auto *BB : TheLoop->blocks())
5580  for (auto &I : *BB) {
5581 
5582  // If there's no pointer operand, there's nothing to do.
5583  auto *Ptr = dyn_cast_or_null<Instruction>(getPointerOperand(&I));
5584  if (!Ptr)
5585  continue;
5586 
5587  // True if all users of Ptr are memory accesses that have Ptr as their
5588  // pointer operand.
5589  auto UsersAreMemAccesses = all_of(Ptr->users(), [&](User *U) -> bool {
5590  return getPointerOperand(U) == Ptr;
5591  });
5592 
5593  // Ensure the memory instruction will not be scalarized, making its
5594  // pointer operand non-uniform. If the pointer operand is used by some
5595  // instruction other than a memory access, we're not going to check if
5596  // that other instruction may be scalarized here. Thus, conservatively
5597  // assume the pointer operand may be non-uniform.
5598  if (!UsersAreMemAccesses || memoryInstructionMustBeScalarized(&I))
5599  PossibleNonUniformPtrs.insert(Ptr);
5600 
5601  // If the memory instruction will be vectorized and its pointer operand
5602  // is consecutive-like, the pointer operand should remain uniform.
5603  else if (hasConsecutiveLikePtrOperand(&I))
5604  ConsecutiveLikePtrs.insert(Ptr);
5605 
5606  // Otherwise, if the memory instruction will be vectorized and its
5607  // pointer operand is non-consecutive-like, the memory instruction should
5608  // be a gather or scatter operation. Its pointer operand will be
5609  // non-uniform.
5610  else
5611  PossibleNonUniformPtrs.insert(Ptr);
5612  }
5613 
5614  // Add to the Worklist all consecutive and consecutive-like pointers that
5615  // aren't also identified as possibly non-uniform.
5616  for (auto *V : ConsecutiveLikePtrs)
5617  if (!PossibleNonUniformPtrs.count(V)) {
5618  DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
5619  Worklist.insert(V);
5620  }
5621 
5622  // Expand Worklist in topological order: whenever a new instruction
5623  // is added, its users should be either already inside Worklist, or
5624  // out of scope. This ensures a uniform instruction will only be used
5625  // by uniform instructions or out-of-scope instructions.
5626  unsigned idx = 0;
5627  while (idx != Worklist.size()) {
5628  Instruction *I = Worklist[idx++];
5629 
5630  for (auto OV : I->operand_values()) {
5631  if (isOutOfScope(OV))
5632  continue;
5633  auto *OI = cast<Instruction>(OV);
5634  if (all_of(OI->users(), [&](User *U) -> bool {
5635  return isOutOfScope(U) || Worklist.count(cast<Instruction>(U));
5636  })) {
5637  Worklist.insert(OI);
5638  DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
5639  }
5640  }
5641  }
5642 
5643  // Returns true if Ptr is the pointer operand of a memory access instruction
5644  // I, and I is known to not require scalarization.
5645  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5646  return getPointerOperand(I) == Ptr && !memoryInstructionMustBeScalarized(I);
5647  };
5648 
5649  // For an instruction to be added into Worklist above, all its users inside
5650  // the loop should also be in Worklist. However, this condition cannot be
5651  // true for phi nodes that form a cyclic dependence. We must process phi
5652  // nodes separately. An induction variable will remain uniform if all users
5653  // of the induction variable and induction variable update remain uniform.
5654  // The code below handles both pointer and non-pointer induction variables.
5655  for (auto &Induction : Inductions) {
5656  auto *Ind = Induction.first;
5657  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5658 
5659  // Determine if all users of the induction variable are uniform after
5660  // vectorization.
5661  auto UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
5662  auto *I = cast<Instruction>(U);
5663  return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5664  isVectorizedMemAccessUse(I, Ind);
5665  });
5666  if (!UniformInd)
5667  continue;
5668 
5669  // Determine if all users of the induction variable update instruction are
5670  // uniform after vectorization.
5671  auto UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
5672  auto *I = cast<Instruction>(U);
5673  return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5674  isVectorizedMemAccessUse(I, IndUpdate);
5675  });
5676  if (!UniformIndUpdate)
5677  continue;
5678 
5679  // The induction variable and its update instruction will remain uniform.
5680  Worklist.insert(Ind);
5681  Worklist.insert(IndUpdate);
5682  DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
5683  DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n");
5684  }
5685 
5686  Uniforms.insert(Worklist.begin(), Worklist.end());
5687 }
5688 
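A hedged sketch (invented function names) of the induction-variable handling above: an induction variable remains uniform only when every in-loop user is itself uniform or a vectorized memory access that uses it as a pointer.

// Hypothetical examples, not from LoopVectorize.cpp.
#include <cstddef>

void copyArray(int *dst, const int *src, size_t n) {
  // 'i' feeds only consecutive addresses and the latch compare, so a single
  // scalar copy per vector iteration suffices: 'i' remains uniform.
  for (size_t i = 0; i < n; ++i)
    dst[i] = src[i];
}

void storeIota(int *dst, size_t n) {
  // 'i' is also used as the stored value, so each lane needs its own copy
  // (i, i+1, i+2, ...): the induction variable does not remain uniform.
  for (size_t i = 0; i < n; ++i)
    dst[i] = static_cast<int>(i);
}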
5689 bool LoopVectorizationLegality::canVectorizeMemory() {
5690  LAI = &(*GetLAA)(*TheLoop);
5691  InterleaveInfo.setLAI(LAI);
5692  const OptimizationRemarkAnalysis *LAR = LAI->getReport();
5693  if (LAR) {
5694  OptimizationRemarkAnalysis VR(Hints->vectorizeAnalysisPassName(),
5695  "loop not vectorized: ", *LAR);
5696  ORE->emit(VR);
5697  }
5698  if (!LAI->canVectorizeMemory())
5699  return false;
5700 
5701  if (LAI->hasStoreToLoopInvariantAddress()) {
5702  ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
5703  << "write to a loop invariant address could not be vectorized");
5704  DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
5705  return false;
5706  }
5707 
5708  Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
5709  PSE.addPredicate(LAI->getPSE().getUnionPredicate());
5710 
5711  return true;
5712 }
5713 
5714 bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
5715  Value *In0 = const_cast<Value *>(V);
5716  PHINode *PN = dyn_cast_or_null<PHINode>(In0);
5717  if (!PN)
5718  return false;
5719 
5720  return Inductions.count(PN);
5721 }
5722 
5723 bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
5724  return FirstOrderRecurrences.count(Phi);
5725 }
5726 
5727 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
5728  return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
5729 }
5730 
5731 bool LoopVectorizationLegality::blockCanBePredicated(
5732  BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) {
5733  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
5734 
5735  for (Instruction &I : *BB) {
5736  // Check that we don't have a constant expression that can trap as operand.
5737  for (Value *Operand : I.operands()) {
5738  if (auto *C = dyn_cast<Constant>(Operand))
5739  if (C->canTrap())
5740  return false;
5741  }
5742  // We might be able to hoist the load.
5743  if (I.mayReadFromMemory()) {
5744  auto *LI = dyn_cast<LoadInst>(&I);
5745  if (!LI)
5746  return false;
5747  if (!SafePtrs.count(LI->getPointerOperand())) {
5748  if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) ||
5749  isLegalMaskedGather(LI->getType())) {
5750  MaskedOp.insert(LI);
5751  continue;
5752  }
5753  // !llvm.mem.parallel_loop_access implies if-conversion safety.
5754  if (IsAnnotatedParallel)
5755  continue;
5756  return false;
5757  }
5758  }
5759 
5760  if (I.mayWriteToMemory()) {
5761  auto *SI = dyn_cast<StoreInst>(&I);
5762  // We only support predication of stores in basic blocks with one
5763  // predecessor.
5764  if (!SI)
5765  return false;
5766 
5767  // Build a masked store if it is legal for the target.
5768  if (isLegalMaskedStore(SI->getValueOperand()->getType(),
5769  SI->getPointerOperand()) ||
5770  isLegalMaskedScatter(SI->getValueOperand()->getType())) {
5771  MaskedOp.insert(SI);
5772  continue;
5773  }
5774 
5775  bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
5776  bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();
5777 
5778  if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
5779  !isSinglePredecessor)
5780  return false;
5781  }
5782  if (I.mayThrow())
5783  return false;
5784  }
5785 
5786  return true;
5787 }
5788 
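As a rough illustration, and assuming a target that supports masked stores, a conditionally executed store like the one below is the typical case this predication logic handles: the guarded store either becomes a masked store (and is recorded in MaskedOp) or keeps the block from being if-converted.

// Hypothetical example, not from LoopVectorize.cpp.
#include <cstddef>

void clampNegatives(float *a, size_t n) {
  for (size_t i = 0; i < n; ++i)
    if (a[i] < 0.0f)
      a[i] = 0.0f; // store executes only for lanes whose condition is true
}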
5789 void InterleavedAccessInfo::collectConstStrideAccesses(
5790  MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
5791  const ValueToValueMap &Strides) {
5792 
5793  auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
5794 
5795  // Since it's desired that the load/store instructions be maintained in
5796  // "program order" for the interleaved access analysis, we have to visit the
5797  // blocks in the loop in reverse postorder (i.e., in a topological order).
5798  // Such an ordering will ensure that any load/store that may be executed
5799  // before a second load/store will precede the second load/store in
5800  // AccessStrideInfo.
5801  LoopBlocksDFS DFS(TheLoop);
5802  DFS.perform(LI);
5803  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
5804  for (auto &I : *BB) {
5805  auto *LI = dyn_cast<LoadInst>(&I);
5806  auto *SI = dyn_cast<StoreInst>(&I);
5807  if (!LI && !SI)
5808  continue;
5809 
5810  Value *Ptr = getPointerOperand(&I);
5811  // We don't check wrapping here because we don't know yet if Ptr will be
5812  // part of a full group or a group with gaps. Checking wrapping for all
5813  // pointers (even those that end up in groups with no gaps) will be overly
5814  // conservative. For full groups, wrapping should be ok since if we would
5815  // wrap around the address space we would do a memory access at nullptr
5816  // even without the transformation. The wrapping checks are therefore
5817  // deferred until after we've formed the interleaved groups.
5818  int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides,
5819  /*Assume=*/true, /*ShouldCheckWrap=*/false);
5820 
5821  const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
5822  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
5823  uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
5824 
5825  // An alignment of 0 means target ABI alignment.
5826  unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();
5827  if (!Align)
5828  Align = DL.getABITypeAlignment(PtrTy->getElementType());
5829 
5830  AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, Align);
5831  }
5832 }
5833 
5834 // Analyze interleaved accesses and collect them into interleaved load and
5835 // store groups.
5836 //
5837 // When generating code for an interleaved load group, we effectively hoist all
5838 // loads in the group to the location of the first load in program order. When
5839 // generating code for an interleaved store group, we sink all stores to the
5840 // location of the last store. This code motion can change the order of load
5841 // and store instructions and may break dependences.
5842 //
5843 // The code generation strategy mentioned above ensures that we won't violate
5844 // any write-after-read (WAR) dependences.
5845 //
5846 // E.g., for the WAR dependence: a = A[i]; // (1)
5847 // A[i] = b; // (2)
5848 //
5849 // The store group of (2) is always inserted at or below (2), and the load
5850 // group of (1) is always inserted at or above (1). Thus, the instructions will
5851 // never be reordered. All other dependences are checked to ensure the
5852 // correctness of the instruction reordering.
5853 //
5854 // The algorithm visits all memory accesses in the loop in bottom-up program
5855 // order. Program order is established by traversing the blocks in the loop in
5856 // reverse postorder when collecting the accesses.
5857 //
5858 // We visit the memory accesses in bottom-up order because it can simplify the
5859 // construction of store groups in the presence of write-after-write (WAW)
5860 // dependences.
5861 //
5862 // E.g., for the WAW dependence: A[i] = a; // (1)
5863 // A[i] = b; // (2)
5864 // A[i + 1] = c; // (3)
5865 //
5866 // We will first create a store group with (3) and (2). (1) can't be added to
5867 // this group because it and (2) are dependent. However, (1) can be grouped
5868 // with other accesses that may precede it in program order. Note that a
5869 // bottom-up order does not imply that WAW dependences should not be checked.
5870 void InterleavedAccessInfo::analyzeInterleaving(
5871  const ValueToValueMap &Strides) {
5872  DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
5873 
5874  // Holds all accesses with a constant stride.
5875  MapVector<Instruction *, StrideDescriptor> AccessStrideInfo;
5876  collectConstStrideAccesses(AccessStrideInfo, Strides);
5877 
5878  if (AccessStrideInfo.empty())
5879  return;
5880 
5881  // Collect the dependences in the loop.
5882  collectDependences();
5883 
5884  // Holds all interleaved store groups temporarily.
5885  SmallSetVector<InterleaveGroup *, 4> StoreGroups;
5886  // Holds all interleaved load groups temporarily.
5887  SmallSetVector<InterleaveGroup *, 4> LoadGroups;
5888 
5889  // Search in bottom-up program order for pairs of accesses (A and B) that can
5890  // form interleaved load or store groups. In the algorithm below, access A
5891  // precedes access B in program order. We initialize a group for B in the
5892  // outer loop of the algorithm, and then in the inner loop, we attempt to
5893  // insert each A into B's group if:
5894  //
5895  // 1. A and B have the same stride,
5896  // 2. A and B have the same memory object size, and
5897  // 3. A belongs in B's group according to its distance from B.
5898  //
5899  // Special care is taken to ensure group formation will not break any
5900  // dependences.
5901  for (auto BI = AccessStrideInfo.rbegin(), E = AccessStrideInfo.rend();
5902  BI != E; ++BI) {
5903  Instruction *B = BI->first;
5904  StrideDescriptor DesB = BI->second;
5905 
5906  // Initialize a group for B if it has an allowable stride. Even if we don't
5907  // create a group for B, we continue with the bottom-up algorithm to ensure
5908  // we don't break any of B's dependences.
5909  InterleaveGroup *Group = nullptr;
5910  if (isStrided(DesB.Stride)) {
5911  Group = getInterleaveGroup(B);
5912  if (!Group) {
5913  DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B << '\n');
5914  Group = createInterleaveGroup(B, DesB.Stride, DesB.Align);
5915  }
5916  if (B->mayWriteToMemory())
5917  StoreGroups.insert(Group);
5918  else
5919  LoadGroups.insert(Group);
5920  }
5921 
5922  for (auto AI = std::next(BI); AI != E; ++AI) {
5923  Instruction *A = AI->first;
5924  StrideDescriptor DesA = AI->second;
5925 
5926  // Our code motion strategy implies that we can't have dependences
5927  // between accesses in an interleaved group and other accesses located
5928  // between the first and last member of the group. Note that this also
5929  // means that a group can't have more than one member at a given offset.
5930  // The accesses in a group can have dependences with other accesses, but
5931  // we must ensure we don't extend the boundaries of the group such that
5932  // we encompass those dependent accesses.
5933  //
5934  // For example, assume we have the sequence of accesses shown below in a
5935  // stride-2 loop:
5936  //
5937  // (1, 2) is a group | A[i] = a; // (1)
5938  // | A[i-1] = b; // (2) |
5939  // A[i-3] = c; // (3)
5940  // A[i] = d; // (4) | (2, 4) is not a group
5941  //
5942  // Because accesses (2) and (3) are dependent, we can group (2) with (1)
5943  // but not with (4). If we did, the dependent access (3) would be within
5944  // the boundaries of the (2, 4) group.
5945  if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI)) {
5946 
5947  // If a dependence exists and A is already in a group, we know that A
5948  // must be a store since A precedes B and WAR dependences are allowed.
5949  // Thus, A would be sunk below B. We release A's group to prevent this
5950  // illegal code motion. A will then be free to form another group with
5951  // instructions that precede it.
5952  if (isInterleaved(A)) {
5953  InterleaveGroup *StoreGroup = getInterleaveGroup(A);
5954  StoreGroups.remove(StoreGroup);
5955  releaseGroup(StoreGroup);
5956  }
5957 
5958  // If a dependence exists and A is not already in a group (or it was
5959  // and we just released it), B might be hoisted above A (if B is a
5960  // load) or another store might be sunk below A (if B is a store). In
5961  // either case, we can't add additional instructions to B's group. B
5962  // will only form a group with instructions that it precedes.
5963  break;
5964  }
5965 
5966  // At this point, we've checked for illegal code motion. If either A or B
5967  // isn't strided, there's nothing left to do.
5968  if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride))
5969  continue;
5970 
5971  // Ignore A if it's already in a group or isn't the same kind of memory
5972  // operation as B.
5973  if (isInterleaved(A) || A->mayReadFromMemory() != B->mayReadFromMemory())
5974  continue;
5975 
5976  // Check rules 1 and 2. Ignore A if its stride or size is different from
5977  // that of B.
5978  if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)
5979  continue;
5980 
5981  // Calculate the distance from A to B.
5982  const SCEVConstant *DistToB = dyn_cast<SCEVConstant>(
5983  PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
5984  if (!DistToB)
5985  continue;
5986  int64_t DistanceToB = DistToB->getAPInt().getSExtValue();
5987 
5988  // Check rule 3. Ignore A if its distance to B is not a multiple of the
5989  // size.
5990  if (DistanceToB % static_cast<int64_t>(DesB.Size))
5991  continue;
5992 
5993  // Ignore A if either A or B is in a predicated block. Although we
5994  // currently prevent group formation for predicated accesses, we may be
5995  // able to relax this limitation in the future once we handle more
5996  // complicated blocks.
5997  if (isPredicated(A->getParent()) || isPredicated(B->getParent()))
5998  continue;
5999 
6000  // The index of A is the index of B plus A's distance to B in multiples
6001  // of the size.
6002  int IndexA =
6003  Group->getIndex(B) + DistanceToB / static_cast<int64_t>(DesB.Size);
6004 
6005  // Try to insert A into B's group.
6006  if (Group->insertMember(A, IndexA, DesA.Align)) {
6007  DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
6008  << " into the interleave group with" << *B << '\n');
6009  InterleaveGroupMap[A] = Group;
6010 
6011  // Set the first load in program order as the insert position.
6012  if (A->mayReadFromMemory())
6013  Group->setInsertPos(A);
6014  }
6015  } // Iteration over A accesses.
6016  } // Iteration over B accesses.
6017 
6018  // Remove interleaved store groups with gaps.
6019  for (InterleaveGroup *Group : StoreGroups)
6020  if (Group->getNumMembers() != Group->getFactor())
6021  releaseGroup(Group);
6022 
6023  // Remove interleaved groups with gaps (currently only loads) whose memory
6024  // accesses may wrap around. We have to revisit the getPtrStride analysis,
6025  // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
6026  // not check wrapping (see documentation there).
6027  // FORNOW we use Assume=false;
6028  // TODO: Change to Assume=true but making sure we don't exceed the threshold
6029  // of runtime SCEV assumptions checks (thereby potentially failing to
6030  // vectorize altogether).
6031  // Additional optional optimizations:
6032  // TODO: If we are peeling the loop and we know that the first pointer doesn't
6033  // wrap then we can deduce that all pointers in the group don't wrap.
6034  // This means that we can forcefully peel the loop in order to only have to
6035  // check the first pointer for no-wrap. When we change to use Assume=true
6036  // we'll only need at most one runtime check per interleaved group.
6037  //
6038  for (InterleaveGroup *Group : LoadGroups) {
6039 
6040  // Case 1: A full group. We can skip the checks; for full groups, if the wide
6041  // load would wrap around the address space we would do a memory access at
6042  // nullptr even without the transformation.
6043  if (Group->getNumMembers() == Group->getFactor())
6044  continue;
6045 
6046  // Case 2: If first and last members of the group don't wrap, this implies
6047  // that all the pointers in the group don't wrap.
6048  // So we check only group member 0 (which is always guaranteed to exist),
6049  // and group member Factor - 1; if the latter doesn't exist we rely on
6050  // peeling (if it is a non-reversed access -- see Case 3).
6051  Value *FirstMemberPtr = getPointerOperand(Group->getMember(0));
6052  if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
6053  /*ShouldCheckWrap=*/true)) {
6054  DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
6055  "first group member potentially pointer-wrapping.\n");
6056  releaseGroup(Group);
6057  continue;
6058  }
6059  Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
6060  if (LastMember) {
6061  Value *LastMemberPtr = getPointerOperand(LastMember);
6062  if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
6063  /*ShouldCheckWrap=*/true)) {
6064  DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
6065  "last group member potentially pointer-wrapping.\n");
6066  releaseGroup(Group);
6067  }
6068  }
6069  else {
6070  // Case 3: A non-reversed interleaved load group with gaps: We need
6071  // to execute at least one scalar epilogue iteration. This will ensure
6072  // we don't speculatively access memory out-of-bounds. We only need
6073  // to look for a member at index factor - 1, since every group must have
6074  // a member at index zero.
6075  if (Group->isReverse()) {
6076  releaseGroup(Group);
6077  continue;
6078  }
6079  DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
6080  RequiresScalarEpilogue = true;
6081  }
6082  }
6083 }
6084 
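A minimal sketch (not from this file) of a loop whose accesses the analysis above would collect into factor-2 interleave groups: the two loads form one load group and the two stores form a full store group.

// Hypothetical example, not from LoopVectorize.cpp.
#include <cstddef>

void scaleComplex(float *data, float k, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    float re = data[2 * i];     // member 0 of the load group
    float im = data[2 * i + 1]; // member 1 of the load group
    data[2 * i] = re * k;       // member 0 of the store group
    data[2 * i + 1] = im * k;   // member 1 of the store group
  }
}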
6085 LoopVectorizationCostModel::VectorizationFactor
6086 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
6087  // Width 1 means no vectorization.
6088  VectorizationFactor Factor = {1U, 0U};
6089  if (OptForSize && Legal->getRuntimePointerChecking()->Need) {
6090  ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
6091  << "runtime pointer checks needed. Enable vectorization of this "
6092  "loop with '#pragma clang loop vectorize(enable)' when "
6093  "compiling with -Os/-Oz");
6094  DEBUG(dbgs()
6095  << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
6096  return Factor;
6097  }
6098 
6099  if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
6100  ORE->emit(createMissedAnalysis("ConditionalStore")
6101  << "store that is conditionally executed prevents vectorization");
6102  DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
6103  return Factor;
6104  }
6105 
6106  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
6107  unsigned SmallestType, WidestType;
6108  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
6109  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
6110  unsigned MaxSafeDepDist = -1U;
6111 
6112  // Get the maximum safe dependence distance in bits computed by LAA. If the
6113  // loop contains any interleaved accesses, we divide the dependence distance
6114  // by the maximum interleave factor of all interleaved groups. Note that
6115  // although the division ensures correctness, this is a fairly conservative
6116  // computation because the maximum distance computed by LAA may not involve
6117  // any of the interleaved accesses.
6118  if (Legal->getMaxSafeDepDistBytes() != -1U)
6119  MaxSafeDepDist =
6120  Legal->getMaxSafeDepDistBytes() * 8 / Legal->getMaxInterleaveFactor();
6121 
6122  WidestRegister =
6123  ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist);
6124  unsigned MaxVectorSize = WidestRegister / WidestType;
6125 
6126  DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
6127  << WidestType << " bits.\n");
6128  DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister
6129  << " bits.\n");
6130 
6131  if (MaxVectorSize == 0) {
6132  DEBUG(dbgs() << "LV: The target has no vector registers.\n");
6133  MaxVectorSize = 1;
6134  }
6135 
6136  assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
6137  " into one vector!");
6138 
6139  unsigned VF = MaxVectorSize;
6140  if (MaximizeBandwidth && !OptForSize) {
6141  // Collect all viable vectorization factors.
6142  SmallVector<unsigned, 8> VFs;
6143  unsigned NewMaxVectorSize = WidestRegister / SmallestType;
6144  for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2)
6145  VFs.push_back(VS);
6146 
6147  // For each VF calculate its register usage.
6148  auto RUs = calculateRegisterUsage(VFs);
6149 
6150  // Select the largest VF which doesn't require more registers than existing
6151  // ones.
6152  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
6153  for (int i = RUs.size() - 1; i >= 0; --i) {
6154  if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
6155  VF = VFs[i];
6156  break;
6157  }
6158  }
6159  }
6160 
6161  // If we optimize the program for size, avoid creating the tail loop.
6162  if (OptForSize) {
6163  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
6164  DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
6165 
6166  // If we don't know the precise trip count, don't try to vectorize.
6167  if (TC < 2) {
6168  ORE->emit(
6169  createMissedAnalysis("UnknownLoopCountComplexCFG")
6170  << "unable to calculate the loop count due to complex control flow");
6171  DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
6172  return Factor;
6173  }
6174 
6175  // Find the maximum SIMD width that can fit within the trip count.
6176  VF = TC % MaxVectorSize;
6177 
6178  if (VF == 0)
6179  VF = MaxVectorSize;
6180  else {
6181  // If the trip count that we found modulo the vectorization factor is not
6182  // zero then we require a tail.
6183  ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
6184  << "cannot optimize for size and vectorize at the "
6185  "same time. Enable vectorization of this loop "
6186  "with '#pragma clang loop vectorize(enable)' "
6187  "when compiling with -Os/-Oz");
6188  DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
6189  return Factor;
6190  }
6191  }
6192 
6193  int UserVF = Hints->getWidth();
6194  if (UserVF != 0) {
6195  assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6196  DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6197 
6198  Factor.Width = UserVF;
6199  collectInstsToScalarize(UserVF);
6200  return Factor;
6201  }
6202 
6203  float Cost = expectedCost(1).first;
6204 #ifndef NDEBUG
6205  const float ScalarCost = Cost;
6206 #endif /* NDEBUG */
6207  unsigned Width = 1;
6208  DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
6209 
6210  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
6211  // Ignore scalar width, because the user explicitly wants vectorization.
6212  if (ForceVectorization && VF > 1) {
6213  Width = 2;
6214  Cost = expectedCost(Width).first / (float)Width;
6215  }
6216 
6217  for (unsigned i = 2; i <= VF; i *= 2) {
6218  // Notice that the vector loop needs to be executed fewer times, so
6219  // we need to divide the cost of the vector loop by the width of
6220  // the vector elements.
6221  VectorizationCostTy C = expectedCost(i);
6222  float VectorCost = C.first / (float)i;
6223  DEBUG(dbgs() << "LV: Vector loop of width " << i
6224  << " costs: " << (int)VectorCost << ".\n");
6225  if (!C.second && !ForceVectorization) {
6226  DEBUG(
6227  dbgs() << "LV: Not considering vector loop of width " << i
6228  << " because it will not generate any vector instructions.\n");
6229  continue;
6230  }
6231  if (VectorCost < Cost) {
6232  Cost = VectorCost;
6233  Width = i;
6234  }
6235  }
6236 
6237  DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
6238  << "LV: Vectorization seems to be not beneficial, "
6239  << "but was forced by a user.\n");
6240  DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
6241  Factor.Width = Width;
6242  Factor.Cost = Width * Cost;
6243  return Factor;
6244 }
6245 
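A toy, self-contained sketch of the width-selection loop above; the per-width costs are made up, whereas the real pass obtains them from expectedCost(VF).

// Hypothetical numbers, for illustration only.
#include <cstdio>

int main() {
  const unsigned Widths[] = {1, 2, 4, 8};
  const float Costs[] = {10.0f, 12.0f, 16.0f, 36.0f}; // assumed expectedCost(VF)

  unsigned BestWidth = 1;
  float BestCost = Costs[0]; // scalar cost per iteration
  for (int i = 1; i < 4; ++i) {
    float PerLane = Costs[i] / Widths[i]; // 6.0, 4.0, 4.5
    if (PerLane < BestCost) {
      BestCost = PerLane;
      BestWidth = Widths[i];
    }
  }
  std::printf("Selected VF: %u (%.1f per scalar iteration)\n", BestWidth,
              BestCost); // Selected VF: 4 (4.0 per scalar iteration)
  return 0;
}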
6246 std::pair<unsigned, unsigned>
6247 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
6248  unsigned MinWidth = -1U;
6249  unsigned MaxWidth = 8;
6250  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
6251 
6252  // For each block.
6253  for (BasicBlock *BB : TheLoop->blocks()) {
6254  // For each instruction in the loop.
6255  for (Instruction &I : *BB) {
6256  Type *T = I.getType();
6257 
6258  // Skip ignored values.
6259  if (ValuesToIgnore.count(&I))
6260  continue;
6261 
6262  // Only examine Loads, Stores and PHINodes.
6263  if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
6264  continue;
6265 
6266  // Examine PHI nodes that are reduction variables. Update the type to
6267  // account for the recurrence type.
6268  if (auto *PN = dyn_cast<PHINode>(&I)) {
6269  if (!Legal->isReductionVariable(PN))
6270  continue;
6271  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
6272  T = RdxDesc.getRecurrenceType();
6273  }
6274 
6275  // Examine the stored values.
6276  if (auto *ST = dyn_cast<StoreInst>(&I))
6277  T = ST->getValueOperand()->getType();
6278 
6279  // Ignore loaded pointer types and stored pointer types that are not
6280  // consecutive. However, we do want to take consecutive stores/loads of
6281  // pointer vectors into account.
6282  if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I))
6283  continue;
6284 
6285  MinWidth = std::min(MinWidth,
6286  (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6287  MaxWidth = std::max(MaxWidth,
6288  (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6289  }
6290  }
6291 
6292  return {MinWidth, MaxWidth};
6293 }
6294 
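A small numeric sketch (all values assumed) of how the widths returned here feed the vectorization-factor computation in selectVectorizationFactor:

// Hypothetical numbers, for illustration only.
#include <cstdio>

int main() {
  unsigned SmallestType = 8;     // e.g. the loop loads i8 data
  unsigned WidestType = 32;      // e.g. the loop stores i32 results
  unsigned WidestRegister = 256; // assumed 256-bit vector registers
  std::printf("MaxVectorSize = %u\n", WidestRegister / WidestType);   // 8
  std::printf("Bandwidth VF  = %u\n", WidestRegister / SmallestType); // 32
  return 0;
}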
6295 unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
6296  unsigned VF,
6297  unsigned LoopCost) {
6298 
6299  // -- The interleave heuristics --
6300  // We interleave the loop in order to expose ILP and reduce the loop overhead.
6301  // There are many micro-architectural considerations that we can't predict
6302  // at this level. For example, frontend pressure (on decode or fetch) due to
6303  // code size, or the number and capabilities of the execution ports.
6304  //
6305  // We use the following heuristics to select the interleave count:
6306  // 1. If the code has reductions, then we interleave to break the cross
6307  // iteration dependency.
6308  // 2. If the loop is really small, then we interleave to reduce the loop
6309  // overhead.
6310  // 3. We don't interleave if we think that we will spill registers to memory
6311  // due to the increased register pressure.
6312 
6313  // When we optimize for size, we don't interleave.
6314  if (OptForSize)
6315  return 1;
6316 
6317  // We used the distance for the interleave count.
6318  if (Legal->getMaxSafeDepDistBytes() != -1U)
6319  return 1;
6320 
6321  // Do not interleave loops with a relatively small trip count.
6322  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
6323  if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
6324  return 1;
6325 
6326  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
6327  DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
6328  << " registers\n");
6329 
6330  if (VF == 1) {
6331  if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6332  TargetNumRegisters = ForceTargetNumScalarRegs;
6333  } else {
6334  if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6335  TargetNumRegisters = ForceTargetNumVectorRegs;
6336  }
6337 
6338  RegisterUsage R = calculateRegisterUsage({VF})[0];
6339  // We divide by these constants, so we assume that we have at least one
6340  // instruction that uses at least one register.
6341  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
6342  R.NumInstructions = std::max(R.NumInstructions, 1U);
6343 
6344  // We calculate the interleave count using the following formula.
6345  // Subtract the number of loop invariants from the number of available
6346  // registers. These registers are used by all of the interleaved instances.
6347  // Next, divide the remaining registers by the number of registers that are
6348  // required by the loop, in order to estimate how many parallel instances
6349  // fit without causing spills. All of this is rounded down if necessary to be
6350  // a power of two. We want a power-of-two interleave count to simplify any
6351  // addressing operations or alignment considerations.
6352  unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
6353  R.MaxLocalUsers);
6354 
6355  // Don't count the induction variable as interleaved.
6356  if (EnableIndVarRegisterHeur)
6357  IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
6358  std::max(1U, (R.MaxLocalUsers - 1)));
6359 
6360  // Clamp the interleave ranges to reasonable counts.
6361  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
6362 
6363  // Check if the user has overridden the max.
6364  if (VF == 1) {
6365  if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6366  MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6367  } else {
6368  if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6369  MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6370  }
6371 
6372  // If we did not calculate the cost for VF (because the user selected the VF)
6373  // then we calculate the cost of VF here.
6374  if (LoopCost == 0)
6375  LoopCost = expectedCost(VF).first;
6376 
6377  // Clamp the calculated IC to be between 1 and the max interleave count
6378  // that the target allows.
6379  if (IC > MaxInterleaveCount)
6380  IC = MaxInterleaveCount;
6381  else if (IC < 1)
6382  IC = 1;
6383 
6384  // Interleave if we vectorized this loop and there is a reduction that could
6385  // benefit from interleaving.
6386  if (VF > 1 && Legal->getReductionVars()->size()) {
6387  DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6388  return IC;
6389  }
6390 
6391  // Note that if we've already vectorized the loop we will have done the
6392  // runtime check and so interleaving won't require further checks.
6393  bool InterleavingRequiresRuntimePointerCheck =
6394  (VF == 1 && Legal->getRuntimePointerChecking()->Need);
6395 
6396  // We want to interleave small loops in order to reduce the loop overhead and
6397  // potentially expose ILP opportunities.
6398  DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
6399  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
6400  // We assume that the cost overhead is 1 and we use the cost model
6401  // to estimate the cost of the loop and interleave until the cost of the
6402  // loop overhead is about 5% of the cost of the loop.
6403  unsigned SmallIC =
6404  std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
6405 
6406  // Interleave until store/load ports (estimated by max interleave count) are
6407  // saturated.
6408  unsigned NumStores = Legal->getNumStores();
6409  unsigned NumLoads = Legal->getNumLoads();
6410  unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6411  unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
6412 
6413  // If we have a scalar reduction (vector reductions are already dealt with
6414  // by this point), we can increase the critical path length if the loop
6415  // we're interleaving is inside another loop. Limit it, by default, to 2, so the
6416  // critical path only gets increased by one reduction operation.
6417  if (Legal->getReductionVars()->size() && TheLoop->getLoopDepth() > 1) {
6418  unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6419  SmallIC = std::min(SmallIC, F);
6420  StoresIC = std::min(StoresIC, F);
6421  LoadsIC = std::min(LoadsIC, F);
6422  }
6423 
6424  if (EnableLoadStoreRuntimeInterleave &&
6425  std::max(StoresIC, LoadsIC) > SmallIC) {
6426  DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6427  return std::max(StoresIC, LoadsIC);
6428  }
6429 
6430  DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6431  return SmallIC;
6432  }
6433 
6434  // Interleave if this is a large loop (small loops are already dealt with by
6435  // this point) that could benefit from interleaving.
6436  bool HasReductions = (Legal->getReductionVars()->size() > 0);
6437  if (TTI.enableAggressiveInterleaving(HasReductions)) {
6438  DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6439  return IC;
6440  }
6441 
6442  DEBUG(dbgs() << "LV: Not Interleaving.\n");
6443  return 1;
6444 }
6445 
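A standalone sketch of the register-pressure formula used above, with assumed register counts; PowerOf2Floor is re-implemented locally only to keep the example self-contained (the pass uses the helper from MathExtras.h).

// Hypothetical numbers, for illustration only.
#include <cstdio>

static unsigned powerOf2Floor(unsigned A) {
  unsigned P = 1;
  while (P * 2 <= A)
    P *= 2;
  return P;
}

int main() {
  unsigned TargetNumRegisters = 16; // assumed vector register file size
  unsigned LoopInvariantRegs = 2;   // values live across the whole loop
  unsigned MaxLocalUsers = 3;       // peak in-loop register demand

  // How many interleaved copies fit without spilling, rounded to a power of 2.
  unsigned IC = powerOf2Floor((TargetNumRegisters - LoopInvariantRegs) /
                              MaxLocalUsers); // 14 / 3 = 4 -> 4
  // Variant that does not count the induction variable.
  unsigned ICNoIndVar =
      powerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                    (MaxLocalUsers - 1)); // 13 / 2 = 6 -> 4
  std::printf("IC = %u, IC ignoring indvar = %u\n", IC, ICNoIndVar);
  return 0;
}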
6446 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6447 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
6448  // This function calculates the register usage by measuring the highest number
6449  // of values that are alive at a single location. Obviously, this is a very
6450  // rough estimation. We scan the loop in topological order and
6451  // assign a number to each instruction. We use RPO to ensure that defs are
6452  // met before their users. We assume that each instruction that has in-loop
6453  // users starts an interval. We record every time that an in-loop value is
6454  // used, so we have a list of the first and last occurrences of each
6455  // instruction. Next, we transpose this data structure into a multi map that
6456  // holds the list of intervals that *end* at a specific location. This multi
6457  // map allows us to perform a linear search. We scan the instructions linearly
6458  // and record each time that a new interval starts, by placing it in a set.
6459  // If we find this value in the multi-map then we remove it from the set.
6460  // The max register usage is the maximum size of the set.
6461  // We also search for instructions that are defined outside the loop, but are
6462  // used inside the loop. We need this number separately from the max-interval
6463  // usage number because when we unroll, loop-invariant values do not take
6464  // more registers.
6465  LoopBlocksDFS DFS(TheLoop);
6466  DFS.perform(LI);
6467 
6468  RegisterUsage RU;
6469  RU.NumInstructions = 0;
6470 
6471  // Each 'key' in the map opens a new interval. The values
6472  // of the map are the index of the 'last seen' usage of the
6473  // instruction that is the key.
6474  typedef DenseMap<Instruction *, unsigned> IntervalMap;
6475  // Maps instruction to its index.
6476  DenseMap<unsigned, Instruction *> IdxToInstr;
6477  // Marks the end of each interval.
6478  IntervalMap EndPoint;
6479  // Saves the list of instruction indices that are used in the loop.
6480  SmallPtrSet<Instruction *, 8> Ends;
6481  // Saves the list of values that are used in the loop but are
6482  // defined outside the loop, such as arguments and constants.
6483  SmallPtrSet<Value *, 8> LoopInvariants;
6484 
6485  unsigned Index = 0;
6486  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6487  RU.NumInstructions += BB->size();
6488  for (Instruction &I : *BB) {
6489  IdxToInstr[Index++] = &I;
6490 
6491  // Save the end location of each USE.
6492  for (Value *U : I.operands()) {
6493  auto *Instr = dyn_cast<Instruction>(U);
6494 
6495  // Ignore non-instruction values such as arguments, constants, etc.
6496  if (!Instr)
6497  continue;
6498 
6499  // If this instruction is outside the loop then record it and continue.
6500  if (!TheLoop->contains(Instr)) {
6501  LoopInvariants.insert(Instr);
6502  continue;
6503  }
6504 
6505  // Overwrite previous end points.
6506  EndPoint[Instr] = Index;
6507  Ends.insert(Instr);
6508  }
6509  }
6510  }
6511 
6512  // Saves the list of intervals that end with the index in 'key'.
6513  typedef SmallVector<Instruction *, 2> InstrList;
6514  DenseMap<unsigned, InstrList> TransposeEnds;
6515 
6516  // Transpose the EndPoints to a list of values that end at each index.
6517  for (auto &Interval : EndPoint)
6518  TransposeEnds[Interval.second].push_back(Interval.first);
6519 
6520  SmallSet<Instruction *, 8> OpenIntervals;
6521 
6522  // Get the size of the widest register.
6523  unsigned MaxSafeDepDist = -1U;
6524  if (Legal->getMaxSafeDepDistBytes() != -1U)
6525  MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
6526  unsigned WidestRegister =
6527  std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
6528  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
6529 
6530  SmallVector<RegisterUsage, 8> RUs(VFs.size());
6531  SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
6532 
6533  DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6534 
6535  // A lambda that gets the register usage for the given type and VF.
6536  auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
6537  if (Ty->isTokenTy())
6538  return 0U;
6539  unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
6540  return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
6541  };
6542 
6543  for (unsigned int i = 0; i < Index; ++i) {
6544  Instruction *I = IdxToInstr[i];
6545 
6546  // Remove all of the instructions that end at this location.
6547  InstrList &List = TransposeEnds[i];
6548  for (Instruction *ToRemove : List)
6549  OpenIntervals.erase(ToRemove);
6550 
6551  // Ignore instructions that are never used within the loop.
6552  if (!Ends.count(I))
6553  continue;
6554 
6555  // Skip ignored values.
6556  if (ValuesToIgnore.count(I))
6557  continue;
6558 
6559  // For each VF find the maximum usage of registers.
6560  for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6561  if (VFs[j] == 1) {
6562  MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
6563  continue;
6564  }
6565 
6566  // Count the number of live intervals.
6567  unsigned RegUsage = 0;
6568  for (auto Inst : OpenIntervals) {
6569  // Skip ignored values for VF > 1.
6570  if (VecValuesToIgnore.count(Inst))
6571  continue;
6572  RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
6573  }
6574  MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
6575  }
6576 
6577  DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6578  << OpenIntervals.size() << '\n');
6579 
6580  // Add the current instruction to the list of open intervals.
6581  OpenIntervals.insert(I);
6582  }
6583 
6584  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6585  unsigned Invariant = 0;
6586  if (VFs[i] == 1)
6587  Invariant = LoopInvariants.size();
6588  else {
6589  for (auto Inst : LoopInvariants)
6590  Invariant += GetRegUsage(Inst->getType(), VFs[i]);
6591  }
6592 
6593  DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
6594  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
6595  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
6596  DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n');
6597 
6598  RU.LoopInvariantRegs = Invariant;
6599  RU.MaxLocalUsers = MaxUsages[i];
6600  RUs[i] = RU;
6601  }
6602 
6603  return RUs;
6604 }
6605 
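A toy model (not LLVM code) of the interval-based live-value counting performed above; the definition/last-use indices are invented, and the peak size of the open set plays the role of MaxLocalUsers.

// Hypothetical schedule, for illustration only.
#include <algorithm>
#include <cstdio>
#include <map>
#include <set>
#include <utility>
#include <vector>

int main() {
  // (definition index, index of last in-loop use) in RPO order.
  std::vector<std::pair<unsigned, unsigned>> Intervals = {
      {0, 4}, {1, 2}, {2, 5}, {3, 4}, {4, 5}};

  // Transpose: definitions whose interval ends at each index.
  std::map<unsigned, std::vector<unsigned>> TransposeEnds;
  for (auto &IV : Intervals)
    TransposeEnds[IV.second].push_back(IV.first);

  std::set<unsigned> Open;
  size_t MaxUsage = 0;
  for (auto &IV : Intervals) {
    unsigned Idx = IV.first;
    for (unsigned Def : TransposeEnds[Idx]) // close intervals ending here
      Open.erase(Def);
    Open.insert(Idx); // this definition becomes live
    MaxUsage = std::max(MaxUsage, Open.size());
  }
  std::printf("Max simultaneously live values: %zu\n", MaxUsage); // 3
  return 0;
}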
6606 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
6607 
6608  // If we aren't vectorizing the loop, or if we've already collected the
6609  // instructions to scalarize, there's nothing to do. Collection may already
6610  // have occurred if we have a user-selected VF and are now computing the
6611  // expected cost for interleaving.
6612  if (VF < 2 || InstsToScalarize.count(VF))
6613  return;
6614 
6615  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6616  // not profitable to scalarize any instructions, the presence of VF in the
6617  // map will indicate that we've analyzed it already.
6618  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6619 
6620  // Find all the instructions that are scalar with predication in the loop and
6621  // determine if it would be better to not if-convert the blocks they are in.
6622  // If so, we also record the instructions to scalarize.
6623  for (BasicBlock *BB : TheLoop->blocks()) {
6624  if (!Legal->blockNeedsPredication(BB))
6625  continue;
6626  for (Instruction &I : *BB)
6627  if (Legal->isScalarWithPredication(&I)) {
6628  ScalarCostsTy ScalarCosts;
6629  if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6630  ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6631  }
6632  }
6633 }
6634 
6635 int LoopVectorizationCostModel::computePredInstDiscount(
6636  Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
6637  unsigned VF) {
6638 
6639  assert(!Legal->isUniformAfterVectorization(PredInst) &&
6640  "Instruction marked uniform-after-vectorization will be predicated");
6641 
6642  // Initialize the discount to zero, meaning that the scalar version and the
6643  // vector version cost the same.
6644  int Discount = 0;
6645 
6646  // Holds instructions to analyze. The instructions we visit are mapped in
6647  // ScalarCosts. Those instructions are the ones that would be scalarized if
6648  // we find that the scalar version costs less.
6649  SmallVector<Instruction *, 8> Worklist;
6650 
6651  // Returns true if the given instruction can be scalarized.
6652  auto canBeScalarized = [&](Instruction *I) -> bool {
6653 
6654  // We only attempt to scalarize instructions forming a single-use chain
6655  // from the original predicated block that would otherwise be vectorized.
6656  // Although not strictly necessary, we give up on instructions we know will
6657  // already be scalar to avoid traversing chains that are unlikely to be
6658  // beneficial.
6659  if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6660  Legal->isScalarAfterVectorization(I))
6661  return false;
6662 
6663  // If the instruction is scalar with predication, it will be analyzed
6664  // separately. We ignore it within the context of PredInst.
6665  if (Legal->isScalarWithPredication(I))
6666  return false;
6667 
6668  // If any of the instruction's operands are uniform after vectorization,
6669  // the instruction cannot be scalarized. This prevents, for example, a
6670  // masked load from being scalarized.
6671  //
6672  // We assume we will only emit a value for lane zero of an instruction
6673  // marked uniform after vectorization, rather than VF identical values.
6674  // Thus, if we scalarize an instruction that uses a uniform, we would
6675  // create uses of values corresponding to the lanes we aren't emitting code
6676  // for. This behavior can be changed by allowing getScalarValue to clone
6677  // the lane zero values for uniforms rather than asserting.
6678  for (Use &U : I->operands())
6679  if (auto *J = dyn_cast<Instruction>(U.get()))
6680  if (Legal->isUniformAfterVectorization(J))
6681  return false;
6682 
6683  // Otherwise, we can scalarize the instruction.
6684  return true;
6685  };
6686 
6687  // Returns true if an operand that cannot be scalarized must be extracted
6688  // from a vector. We will account for this scalarization overhead below. Note
6689  // that the non-void predicated instructions are placed in their own blocks,
6690  // and their return values are inserted into vectors. Thus, an extract would
6691  // still be required.
6692  auto needsExtract = [&](Instruction *I) -> bool {
6693  return TheLoop->contains(I) && !Legal->isScalarAfterVectorization(I);
6694  };
6695 
6696  // Compute the expected cost discount from scalarizing the entire expression
6697  // feeding the predicated instruction. We currently only consider expressions
6698  // that are single-use instruction chains.
6699  Worklist.push_back(PredInst);
6700  while (!Worklist.empty()) {
6701  Instruction *I = Worklist.pop_back_val();
6702 
6703  // If we've already analyzed the instruction, there's nothing to do.
6704  if (ScalarCosts.count(I))
6705  continue;
6706 
6707  // Compute the cost of the vector instruction. Note that this cost already
6708  // includes the scalarization overhead of the predicated instruction.
6709  unsigned VectorCost = getInstructionCost(I, VF).first;
6710 
6711  // Compute the cost of the scalarized instruction. This cost is the cost of
6712  // the instruction as if it wasn't if-converted and instead remained in the
6713  // predicated block. We will scale this cost by block probability after
6714  // computing the scalarization overhead.
6715  unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
6716 
6717  // Compute the scalarization overhead of needed insertelement instructions
6718  // and phi nodes.
6719  if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6720  ScalarCost += getScalarizationOverhead(ToVectorTy(I->getType(), VF), true,
6721  false, TTI);
6722  ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
6723  }
6724 
6725  // Compute the scalarization overhead of needed extractelement
6726  // instructions. For each of the instruction's operands, if the operand can
6727  // be scalarized, add it to the worklist; otherwise, account for the
6728  // overhead.
6729  for (Use &U : I->operands())
6730  if (auto *J = dyn_cast<Instruction>(U.get())) {
6731  assert(VectorType::isValidElementType(J->getType()) &&
6732  "Instruction has non-scalar type");
6733  if (canBeScalarized(J))
6734  Worklist.push_back(J);
6735  else if (needsExtract(J))
6736  ScalarCost += getScalarizationOverhead(ToVectorTy(J->getType(), VF),
6737  false, true, TTI);
6738  }
6739 
6740  // Scale the total scalar cost by block probability.
6741  ScalarCost /= getReciprocalPredBlockProb();
6742 
6743  // Compute the discount. A non-negative discount means the vector version
6744  // of the instruction costs more, and scalarizing would be beneficial.
6745  Discount += VectorCost - ScalarCost;
6746  ScalarCosts[I] = ScalarCost;
6747  }
6748 
6749  return Discount;
6750 }
6751 
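A small worked sketch of the discount computation above, using assumed costs and assuming the reciprocal predicated-block probability is 2 (i.e. the block is modeled as executing on half the iterations):

// Hypothetical numbers, for illustration only.
#include <cstdio>

int main() {
  unsigned VF = 4;
  unsigned VectorCost = 20;    // assumed cost of the predicated vector form
  unsigned ScalarPerLane = 3;  // assumed scalar cost of one lane
  unsigned ReciprocalProb = 2; // assumed getReciprocalPredBlockProb()

  // Scalar version: VF scalarized copies, scaled by how often the block runs.
  unsigned ScalarCost = (VF * ScalarPerLane) / ReciprocalProb; // 12 / 2 = 6

  // A positive discount means scalarizing the chain is the cheaper option.
  int Discount = static_cast<int>(VectorCost) - static_cast<int>(ScalarCost);
  std::printf("Discount = %d\n", Discount); // 14
  return 0;
}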
6752 LoopVectorizationCostModel::VectorizationCostTy
6753 LoopVectorizationCostModel::expectedCost(unsigned VF) {
6754  VectorizationCostTy Cost;
6755 
6756  // Collect the instructions (and their associated costs) that will be more
6757  // profitable to scalarize.
6758  collectInstsToScalarize(VF);
6759 
6760  // For each block.
6761  for (BasicBlock *BB : TheLoop->blocks()) {
6762  VectorizationCostTy BlockCost;
6763 
6764  // For each instruction in the old loop.
6765  for (Instruction &I : *BB) {
6766  // Skip dbg intrinsics.
6767  if (isa<DbgInfoIntrinsic>(I))
6768  continue;
6769 
6770  // Skip ignored values.
6771  if (ValuesToIgnore.count(&I))
6772  continue;
6773 
6774  VectorizationCostTy C = getInstructionCost(&I, VF);
6775 
6776  // Check if we should override the cost.
6777  if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6778  C.first = ForceTargetInstructionCost;
6779 
6780  BlockCost.first += C.first;
6781  BlockCost.second |= C.second;
6782  DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first << " for VF "
6783  << VF << " For instruction: " << I << '\n');
6784  }
6785 
6786  // If we are vectorizing a predicated block, it will have been
6787  // if-converted. This means that the block's instructions (aside from
6788  // stores and instructions that may divide by zero) will now be
6789  // unconditionally executed. For the scalar case, we may not always execute
6790  // the predicated block. Thus, scale the block's cost by the probability of
6791  // executing it.
6792  if (VF == 1 && Legal->blockNeedsPredication(BB))
6793  BlockCost.first /= getReciprocalPredBlockProb();
6794 
6795  Cost.first += BlockCost.first;
6796  Cost.second |= BlockCost.second;
6797  }
6798 
6799  return Cost;
6800 }
6801 
6802 /// \brief Gets Address Access SCEV after verifying that the access pattern
6803 /// is loop invariant except the induction variable dependence.
6804 ///
6805 /// This SCEV can be sent to the Target in order to estimate the address
6806 /// calculation cost.
6807 static const SCEV *getAddressAccessSCEV(
6808  Value *Ptr,
6809  LoopVectorizationLegality *Legal,
6810  ScalarEvolution *SE,
6811  const Loop *TheLoop) {
6812  auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6813  if (!Gep)
6814  return nullptr;
6815 
6816  // We are looking for a gep with all loop invariant indices except for one
6817  // which should be an induction variable.
6818  unsigned NumOperands = Gep->getNumOperands();
6819  for (unsigned i = 1; i < NumOperands; ++i) {
6820  Value *Opd = Gep->getOperand(i);
6821  if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6822  !Legal->isInductionVariable(Opd))
6823  return nullptr;
6824  }
6825 
6826  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6827  return SE->getSCEV(Ptr);
6828 }
6829 
6830 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6831  return Legal->hasStride(I->getOperand(0)) ||
6832  Legal->hasStride(I->getOperand(1));
6833 }
6834 
6835 LoopVectorizationCostModel::VectorizationCostTy
6836 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
6837  // If we know that this instruction will remain uniform, check the cost of
6838  // the scalar version.
6839  if (Legal->isUniformAfterVectorization(I))
6840  VF = 1;
6841 
6842  if (VF > 1 && isProfitableToScalarize(I, VF))
6843  return VectorizationCostTy(InstsToScalarize[VF][I], false);
6844 
6845  Type *VectorTy;
6846  unsigned C = getInstructionCost(I, VF, VectorTy);
6847 
6848  bool TypeNotScalarized =
6849  VF > 1 && !VectorTy->isVoidTy() && TTI.getNumberOfParts(VectorTy) < VF;
6850  return VectorizationCostTy(C, TypeNotScalarized);
6851 }
6852 
6853 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6854  unsigned VF,
6855  Type *&VectorTy) {
6856  Type *RetTy = I->getType();
6857  if (canTruncateToMinimalBitwidth(I, VF))
6858  RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6859  VectorTy = ToVectorTy(RetTy, VF);
6860  auto SE = PSE.getSE();
6861 
6862  // TODO: We need to estimate the cost of intrinsic calls.
6863  switch (I->getOpcode()) {
6864  case Instruction::GetElementPtr:
6865  // We mark this instruction as zero-cost because the cost of GEPs in
6866  // vectorized code depends on whether the corresponding memory instruction
6867  // is scalarized or not. Therefore, we handle GEPs with the memory
6868  // instruction cost.
6869  return 0;
6870  case Instruction::Br: {
6871  return TTI.getCFInstrCost(I->getOpcode());
6872  }
6873  case Instruction::PHI: {
6874  auto *Phi = cast<PHINode>(I);
6875 
6876  // First-order recurrences are replaced by vector shuffles inside the loop.
6877  if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6878  return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6879  VectorTy, VF - 1, VectorTy);
6880 
6881  // TODO: IF-converted IFs become selects.
6882  return 0;
6883  }
6884  case Instruction::UDiv:
6885  case Instruction::SDiv:
6886  case Instruction::URem:
6887  case Instruction::SRem:
6888  // If we have a predicated instruction, it may not be executed for each
6889  // vector lane. Get the scalarization cost and scale this amount by the
6890  // probability of executing the predicated block. If the instruction is not
6891  // predicated, we fall through to the next case.
6892  if (VF > 1 && Legal->isScalarWithPredication(I)) {
6893  unsigned Cost = 0;
6894 
6895  // These instructions have a non-void type, so account for the phi nodes
6896  // that we will create. This cost is likely to be zero. The phi node
6897  // cost, if any, should be scaled by the block probability because it
6898  // models a copy at the end of each predicated block.
6899  Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6900 
6901  // The cost of the non-predicated instruction.
6902  Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6903 
6904  // The cost of insertelement and extractelement instructions needed for
6905  // scalarization.
6906  Cost += getScalarizationOverhead(I, VF, TTI);
6907 
6908  // Scale the cost by the probability of executing the predicated blocks.
6909  // This assumes the predicated block for each vector lane is equally
6910  // likely.
6911  return Cost / getReciprocalPredBlockProb();
6912  }
6913  case Instruction::Add:
6914  case Instruction::FAdd:
6915  case Instruction::Sub:
6916  case Instruction::FSub:
6917  case Instruction::Mul:
6918  case Instruction::FMul:
6919  case Instruction::FDiv:
6920  case Instruction::FRem:
6921  case Instruction::Shl:
6922  case Instruction::LShr:
6923  case Instruction::AShr:
6924  case Instruction::And:
6925  case Instruction::Or:
6926  case Instruction::Xor: {
6927  // Since we will replace the stride by 1 the multiplication should go away.
6928  if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6929  return 0;
6930  // Certain instructions can be cheaper to vectorize if they have a constant
6931  // second vector operand. One example of this is shifts on x86.
6932  TargetTransformInfo::OperandValueKind Op1VK =
6933  TargetTransformInfo::OK_AnyValue;
6934  TargetTransformInfo::OperandValueKind Op2VK =
6935  TargetTransformInfo::OK_AnyValue;
6936  TargetTransformInfo::OperandValueProperties Op1VP =
6937  TargetTransformInfo::OP_None;
6938  TargetTransformInfo::OperandValueProperties Op2VP =
6939  TargetTransformInfo::OP_None;
6940  Value *Op2 = I->getOperand(1);
6941 
6942  // Check for a splat or for a non-uniform vector of constants.
6943  if (isa<ConstantInt>(Op2)) {
6944  ConstantInt *CInt = cast<ConstantInt>(Op2);
6945  if (CInt && CInt->getValue().isPowerOf2())
6946  Op2VP = TargetTransformInfo::OP_PowerOf2;
6947  Op2VK = TargetTransformInfo::OK_UniformConstantValue;
6948  } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
6949  Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
6950  Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
6951  if (SplatValue) {
6952  ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
6953  if (CInt && CInt->getValue().isPowerOf2())
6954  Op2VP = TargetTransformInfo::OP_PowerOf2;
6955  Op2VK = TargetTransformInfo::OK_UniformConstantValue;
6956  }
6957  } else if (Legal->isUniform(Op2)) {
6958  Op2VK = TargetTransformInfo::OK_UniformValue;
6959  }
6960  SmallVector<const Value *, 4> Operands(I->operand_values());
6961  return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK,
6962  Op2VK, Op1VP, Op2VP, Operands);
6963  }
6964  case Instruction::Select: {
6965  SelectInst *SI = cast<SelectInst>(I);
6966  const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6967  bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6968  Type *CondTy = SI->getCondition()->getType();
6969  if (!ScalarCond)
6970  CondTy = VectorType::get(CondTy, VF);
6971 
6972  return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
6973  }
6974  case Instruction::ICmp:
6975  case Instruction::FCmp: {
6976  Type *ValTy = I->getOperand(0)->getType();
6977  Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6978  if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6979  ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6980  VectorTy = ToVectorTy(ValTy, VF);
6981  return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
6982  }
6983  case Instruction::Store:
6984  case Instruction::Load: {
6985  StoreInst *SI = dyn_cast<StoreInst>(I);
6986  LoadInst *LI = dyn_cast<LoadInst>(I);
6987  Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType());
6988  VectorTy = ToVectorTy(ValTy, VF);
6989 
6990  unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
6991  unsigned AS =
6992  SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace();
6993  Value *Ptr = getPointerOperand(I);
6994  // We add the cost of address computation here instead of with the gep
6995  // instruction because only here do we know whether the operation is
6996  // scalarized.
6997  if (VF == 1)
6998  return TTI.getAddressComputationCost(VectorTy) +
6999  TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
7000 
7001  if (LI && Legal->isUniform(Ptr)) {
7002  // Scalar load + broadcast
7003  unsigned Cost = TTI.getAddressComputationCost(ValTy->getScalarType());
7004  Cost += TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
7005  Alignment, AS);
7006  return Cost +
7007  TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, ValTy);
7008  }
7009 
7010  // For an interleaved access, calculate the total cost of the whole
7011  // interleave group.
7012  if (Legal->isAccessInterleaved(I)) {
7013  auto Group = Legal->getInterleavedAccessGroup(I);
7014  assert(Group && "Fail to get an interleaved access group.");
7015 
7016  // Only calculate the cost once at the insert position.
7017  if (Group->getInsertPos() != I)
7018  return 0;
7019 
7020  unsigned InterleaveFactor = Group->getFactor();
7021  Type *WideVecTy =
7022  VectorType::get(VectorTy->getVectorElementType(),
7023  VectorTy->getVectorNumElements() * InterleaveFactor);
7024 
7025  // Holds the indices of existing members in an interleaved load group.
7026  // An interleaved store group doesn't need this as it doesn't allow gaps.
7027  SmallVector<unsigned, 4> Indices;
7028  if (LI) {
7029  for (unsigned i = 0; i < InterleaveFactor; i++)
7030  if (Group->getMember(i))
7031  Indices.push_back(i);
7032  }
7033 
7034  // Calculate the cost of the whole interleaved group.
7035  unsigned Cost = TTI.getInterleavedMemoryOpCost(
7036  I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
7037  Group->getAlignment(), AS);
7038 
7039  if (Group->isReverse())
7040  Cost +=
7041  Group->getNumMembers() *
7042  TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
7043 
7044  // FIXME: An interleaved load group with a huge gap could be even more
7045  // expensive than scalar operations. Then we could ignore such a group and
7046  // use scalar operations instead.
7047  return Cost;
7048  }
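// Illustrative note (hypothetical example): for an interleaved load group
// with factor 2, VF = 4 and element type i32, WideVecTy is <8 x i32>. If both
// members are present, Indices = {0, 1}, the whole group is costed exactly
// once (at the insert position) as one interleaved memory operation, and a
// reversed group additionally pays one reverse shuffle per member.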
7049 
7050  // Check if the memory instruction will be scalarized.
7051  if (Legal->memoryInstructionMustBeScalarized(I, VF)) {
7052  unsigned Cost = 0;
7053  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
7054 
7055  // Figure out whether the access is strided and get the stride value
7056  // if it's known at compile time.
7057  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop);
7058 
7059  // Get the cost of the scalar memory instruction and address computation.
7060  Cost += VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
7061  Cost += VF *
7062  TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
7063  Alignment, AS);
7064 
7065  // Get the overhead of the extractelement and insertelement instructions
7066  // we might create due to scalarization.
7067  Cost += getScalarizationOverhead(I, VF, TTI);
7068 
7069  // If we have a predicated store, it may not be executed for each vector
7070  // lane. Scale the cost by the probability of executing the predicated
7071  // block.
7072  if (Legal->isScalarWithPredication(I))
7073  Cost /= getReciprocalPredBlockProb();
7074 
7075  return Cost;
7076  }
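// Illustrative note (hypothetical numbers): a strided i32 load that must be
// scalarized with VF = 4 is costed roughly as
//   4 * AddressComputationCost + 4 * scalar MemoryOpCost + extract/insert overhead,
// and if the load lives in a predicated block the sum is divided by the
// reciprocal block probability, mirroring the division/remainder case above.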
7077 
7078  // Determine if the pointer operand of the access is either consecutive or
7079  // reverse consecutive.
7080  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
7081  bool Reverse = ConsecutiveStride < 0;
7082 
7083  // Determine if either a gather or scatter operation is legal.
7084  bool UseGatherOrScatter =
7085  !ConsecutiveStride && Legal->isLegalGatherOrScatter(I);
7086 
7087  unsigned Cost = TTI.getAddressComputationCost(VectorTy);
7088  if (UseGatherOrScatter) {
7089  assert(ConsecutiveStride == 0 &&
7090  "Gather/Scatter are not used for consecutive stride");
7091  return Cost +
7092  TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
7093  Legal->isMaskRequired(I), Alignment);
7094  }
7095  // Wide load/stores.
7096  if (Legal->isMaskRequired(I))
7097  Cost +=
7098  TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
7099  else
7100  Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
7101 
7102  if (Reverse)
7103  Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
7104  return Cost;
7105  }
7106  case Instruction::ZExt:
7107  case Instruction::SExt:
7108  case Instruction::FPToUI:
7109  case Instruction::FPToSI:
7110  case Instruction::FPExt:
7111  case Instruction::PtrToInt:
7112  case Instruction::IntToPtr:
7113  case Instruction::SIToFP:
7114  case Instruction::UIToFP:
7115  case Instruction::Trunc:
7116  case Instruction::FPTrunc:
7117  case Instruction::BitCast: {
7118  // We optimize the truncation of induction variables.
7119  // The cost of these is the same as the scalar operation.
7120  if (I->getOpcode() == Instruction::Trunc &&
7121  Legal->isInductionVariable(I->getOperand(0)))
7122  return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
7123  I->getOperand(0)->getType());
7124 
7125  Type *SrcScalarTy = I->getOperand(0)->getType();
7126  Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF);
7127  if (canTruncateToMinimalBitwidth(I, VF)) {
7128  // This cast is going to be shrunk. This may remove the cast or it might
7129  // turn it into a slightly different cast. For example, if MinBW == 16,
7130  // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7131  //
7132  // Calculate the modified src and dest types.
7133  Type *MinVecTy = VectorTy;
7134  if (I->getOpcode() == Instruction::Trunc) {
7135  SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7136  VectorTy =
7137  largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7138  } else if (I->getOpcode() == Instruction::ZExt ||
7139  I->getOpcode() == Instruction::SExt) {
7140  SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7141  VectorTy =
7142  smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7143  }
7144  }
7145 
7146  return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
7147  }
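// Illustrative note (example IR, not from this file): if MinBWs[I] == 16, the
// chain
//
//   %e = zext i8 %x to i32
//   %a = add i32 %e, %y
//
// is costed as if it operated on <VF x i16> rather than <VF x i32>, so the
// zext is priced as a conversion to the narrower destination vector type
// chosen by smallestIntegerVectorType() above.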
7148  case Instruction::Call: {
7149  bool NeedToScalarize;
7150  CallInst *CI = cast<CallInst>(I);
7151  unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
7152  if (getVectorIntrinsicIDForCall(CI, TLI))
7153  return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
7154  return CallCost;
7155  }
7156  default:
7157  // The cost of executing VF copies of the scalar instruction. This opcode
7158  // is unknown. Assume that it is the same as 'mul'.
7159  return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
7160  getScalarizationOverhead(I, VF, TTI);
7161  } // end of switch.
7162 }
7163 
7164 char LoopVectorize::ID = 0;
7165 static const char lv_name[] = "Loop Vectorization";
7166 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7175 INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
7177 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
7181 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7182 
7183 namespace llvm {
7184 Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
7185  return new LoopVectorize(NoUnrolling, AlwaysVectorize);
7186 }
7187 }
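// Usage sketch (assumed client code, not part of this file): with the legacy
// pass manager the vectorizer can be scheduled explicitly, e.g.
//
//   llvm::legacy::PassManager PM;
//   PM.add(llvm::createLoopVectorizePass(/*NoUnrolling=*/false,
//                                        /*AlwaysVectorize=*/true));
//   PM.run(M);  // M is an llvm::Module
//
// The standard -O2/-O3 pipelines normally add the pass for you, so this is
// only needed when building a custom pipeline.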
7188 
7189 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7190 
7191  // Check if the pointer operand of a load or store instruction is
7192  // consecutive.
7193  if (auto *Ptr = getPointerOperand(Inst))
7194  return Legal->isConsecutivePtr(Ptr);
7195  return false;
7196 }
7197 
7198 void LoopVectorizationCostModel::collectValuesToIgnore() {
7199  // Ignore ephemeral values.
7200  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7201 
7202  // Ignore type-promoting instructions we identified during reduction
7203  // detection.
7204  for (auto &Reduction : *Legal->getReductionVars()) {
7205  RecurrenceDescriptor &RedDes = Reduction.second;
7206  SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7207  VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7208  }
7209 
7210  // Insert values known to be scalar into VecValuesToIgnore. This is a
7211  // conservative estimation of the values that will later be scalarized.
7212  //
7213  // FIXME: Even though an instruction is not scalar-after-vectorization, it may
7214  // still be scalarized. For example, we may find an instruction to be
7215  // more profitable for a given vectorization factor if it were to be
7216  // scalarized. But at this point, we haven't yet computed the
7217  // vectorization factor.
7218  for (auto *BB : TheLoop->getBlocks())
7219  for (auto &I : *BB)
7220  if (Legal->isScalarAfterVectorization(&I))
7221  VecValuesToIgnore.insert(&I);
7222 }
7223 
7224 void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
7225  bool IfPredicateInstr) {
7226  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
7227  // Holds vector parameters or scalars, in case of uniform vals.
7228  SmallVector<VectorParts, 4> Params;
7229 
7230  setDebugLocFromInst(Builder, Instr);
7231 
7232  // Does this instruction return a value?
7233  bool IsVoidRetTy = Instr->getType()->isVoidTy();
7234 
7235  // Initialize a new scalar map entry.
7236  ScalarParts Entry(UF);
7237 
7238  VectorParts Cond;
7239  if (IfPredicateInstr)
7240  Cond = createBlockInMask(Instr->getParent());
7241 
7242  // For each vector unroll 'part':
7243  for (unsigned Part = 0; Part < UF; ++Part) {
7244  Entry[Part].resize(1);
7245  // For each scalar that we create:
7246 
7247  // Start an "if (pred) a[i] = ..." block.
7248  Value *Cmp = nullptr;
7249  if (IfPredicateInstr) {
7250  if (Cond[Part]->getType()->isVectorTy())
7251  Cond[Part] =
7252  Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
7253  Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
7254  ConstantInt::get(Cond[Part]->getType(), 1));
7255  }
7256 
7257  Instruction *Cloned = Instr->clone();
7258  if (!IsVoidRetTy)
7259  Cloned->setName(Instr->getName() + ".cloned");
7260 
7261  // Replace the operands of the cloned instructions with their scalar
7262  // equivalents in the new loop.
7263  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
7264  auto *NewOp = getScalarValue(Instr->getOperand(op), Part, 0);
7265  Cloned->setOperand(op, NewOp);
7266  }
7267 
7268  // Place the cloned scalar in the new loop.
7269  Builder.Insert(Cloned);
7270 
7271  // Add the cloned scalar to the scalar map entry.
7272  Entry[Part][0] = Cloned;
7273 
7274  // If we just cloned a new assumption, add it to the assumption cache.
7275  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
7276  if (II->getIntrinsicID() == Intrinsic::assume)
7277  AC->registerAssumption(II);
7278 
7279  // End if-block.
7280  if (IfPredicateInstr)
7281  PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
7282  }
7283  VectorLoopValueMap.initScalar(Instr, Entry);
7284 }
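// Illustrative note (assumed shape of the generated code): for UF = 2, a
// predicated scalar store "if (p) a[i] = x" is cloned once per unroll part,
// and each clone is recorded in PredicatedInstructions with its own scalar
// guard, roughly:
//
//   %cmp.0 = icmp eq i1 %p.0, true   ; guards the part-0 clone
//   %cmp.1 = icmp eq i1 %p.1, true   ; guards the part-1 clone
//
// A later step moves each recorded clone into its own conditional block so it
// only executes when its guard holds.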
7285 
7286 void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
7287  auto *SI = dyn_cast<StoreInst>(Instr);
7288  bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent()));
7289 
7290  return scalarizeInstruction(Instr, IfPredicateInstr);
7291 }
7292 
7293 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7294 
7295 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7296 
7297 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7298  Instruction::BinaryOps BinOp) {
7299  // When unrolling and the VF is 1, we only need to add a simple scalar.
7300  Type *Ty = Val->getType();
7301  assert(!Ty->isVectorTy() && "Val must be a scalar");
7302 
7303  if (Ty->isFloatingPointTy()) {
7304  Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7305 
7306  // Floating point operations had to be 'fast' to enable the unrolling.
7307  Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7308  return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7309  }
7310  Constant *C = ConstantInt::get(Ty, StartIdx);
7311  return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7312 }
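// Illustrative note: in the unroller VF is 1, so the "step vector" for unroll
// part P degenerates to the scalar Val + P * Step. For example, assuming an
// i64 induction with Val = %iv, Step = 4 and StartIdx = 1, the integer path
// above emits
//
//   %induction = add i64 %iv, 4    ; i.e. %iv + 1 * 4
//
// while the floating-point path forms Val + (double)StartIdx * Step with
// fast-math flags attached.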
7313 
7314 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7315  SmallVector<Metadata *, 4> MDs;
7316  // Reserve first location for self reference to the LoopID metadata node.
7317  MDs.push_back(nullptr);
7318  bool IsUnrollMetadata = false;
7319  MDNode *LoopID = L->getLoopID();
7320  if (LoopID) {
7321  // First find existing loop unrolling disable metadata.
7322  for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7323  auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7324  if (MD) {
7325  const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7326  IsUnrollMetadata =
7327  S && S->getString().startswith("llvm.loop.unroll.disable");
7328  }
7329  MDs.push_back(LoopID->getOperand(i));
7330  }
7331  }
7332 
7333  if (!IsUnrollMetadata) {
7334  // Add runtime unroll disable metadata.
7335  LLVMContext &Context = L->getHeader()->getContext();
7336  SmallVector<Metadata *, 1> DisableOperands;
7337  DisableOperands.push_back(
7338  MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7339  MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7340  MDs.push_back(DisableNode);
7341  MDNode *NewLoopID = MDNode::get(Context, MDs);
7342  // Set operand 0 to refer to the loop id itself.
7343  NewLoopID->replaceOperandWith(0, NewLoopID);
7344  L->setLoopID(NewLoopID);
7345  }
7346 }
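// Illustrative note (assumed IR shape): when the loop has no existing unroll
// directive, the function above leaves the scalar loop with a self-referential
// loop ID carrying the disable hint, roughly:
//
//   br i1 %cond, label %loop, label %exit, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}
//
// which tells the unroller not to emit a runtime-unrolled remainder for the
// rarely executed scalar loop.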
7347 
7348 bool LoopVectorizePass::processLoop(Loop *L) {
7349  assert(L->empty() && "Only process inner loops.");
7350 
7351 #ifndef NDEBUG
7352  const std::string DebugLocStr = getDebugLocString(L);
7353 #endif /* NDEBUG */
7354 
7355  DEBUG(dbgs() << "\nLV: Checking a loop in \""
7356  << L->getHeader()->getParent()->getName() << "\" from "
7357  << DebugLocStr << "\n");
7358 
7359  LoopVectorizeHints Hints(L, DisableUnrolling, *ORE);
7360 
7361  DEBUG(dbgs() << "LV: Loop hints:"
7362  << " force="
7363  << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7364  ? "disabled"
7365  : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7366  ? "enabled"
7367  : "?"))
7368  << " width=" << Hints.getWidth()
7369  << " unroll=" << Hints.getInterleave() << "\n");
7370 
7371  // Function containing loop
7372  Function *F = L->getHeader()->getParent();
7373 
7374  // Looking at the diagnostic output is the only way to determine if a loop
7375  // was vectorized (other than looking at the IR or machine code), so it
7376  // is important to generate an optimization remark for each loop. Most of
7377  // these messages are generated as OptimizationRemarkAnalysis. Remarks
7378  // generated as OptimizationRemark and OptimizationRemarkMissed are
7379  // less verbose, reporting vectorized loops and unvectorized loops that may
7380  // benefit from vectorization, respectively.
7381 
7382  if (!Hints.allowVectorization(F, L, AlwaysVectorize)) {
7383  DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7384  return false;
7385  }
7386 
7387  // Check the loop for a trip count threshold:
7388  // do not vectorize loops with a tiny trip count.
7389  const unsigned MaxTC = SE->getSmallConstantMaxTripCount(L);
7390  if (MaxTC > 0u && MaxTC < TinyTripCountVectorThreshold) {
7391  DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7392  << "This loop is not worth vectorizing.");
7393  if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7394  DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7395  else {
7396  DEBUG(dbgs() << "\n");
7397  ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(),
7398  "NotBeneficial", L)
7399  << "vectorization is not beneficial "
7400  "and is not explicitly forced");
7401  return false;
7402  }
7403  }
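// Illustrative note: TinyTripCountVectorThreshold defaults to 16 and is
// controlled by -vectorizer-min-trip-count, so a loop whose constant maximum
// trip count is, say, 8 is rejected here unless the user forced vectorization
// (e.g. with "#pragma clang loop vectorize(enable)").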
7404 
7405  PredicatedScalarEvolution PSE(*SE, *L);
7406 
7407  // Check if it is legal to vectorize the loop.
7408  LoopVectorizationRequirements Requirements(*ORE);
7409  LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE,
7410  &Requirements, &Hints);
7411  if (!LVL.canVectorize()) {
7412  DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7413  emitMissedWarning(F, L, Hints, ORE);
7414  return false;
7415  }
7416 
7417  // Use the cost model.
7418  LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
7419  &Hints);
7420  CM.collectValuesToIgnore();
7421 
7422  // Check the function attributes to find out if this function should be
7423  // optimized for size.
7424  bool OptForSize =
7425  Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
7426 
7427  // Compute the weighted frequency of this loop being executed and see if it
7428  // is less than 20% of the function entry baseline frequency. Note that we
7429  // always have a canonical loop here because we think we *can* vectorize.
7430  // FIXME: This is hidden behind a flag due to pervasive problems with
7431  // exactly what block frequency models.
7432  if (LoopVectorizeWithBlockFrequency) {
7433  BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
7434  if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7435  LoopEntryFreq < ColdEntryFreq)
7436  OptForSize = true;
7437  }
7438 
7439  // Check the function attributes to see if implicit floats are allowed.
7440  // FIXME: This check doesn't seem possibly correct -- what if the loop is
7441  // an integer loop and the vector instructions selected are purely integer
7442  // vector instructions?
7443  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7444  DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
7445  "attribute is used.\n");
7446  ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(),
7447  "NoImplicitFloat", L)
7448  << "loop not vectorized due to NoImplicitFloat attribute");
7449  emitMissedWarning(F, L, Hints, ORE);
7450  return false;
7451  }
7452 
7453  // Check if the target supports potentially unsafe FP vectorization.
7454  // FIXME: Add a check for the type of safety issue (denormal, signaling)
7455  // for the target we're vectorizing for, to make sure none of the
7456  // additional fp-math flags can help.
7457  if (Hints.isPotentiallyUnsafe() &&
7458  TTI->isFPVectorizationPotentiallyUnsafe()) {
7459  DEBUG(dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
7460  ORE->emit(
7461  createMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
7462  << "loop not vectorized due to unsafe FP support.");
7463  emitMissedWarning(F, L, Hints, ORE);
7464  return false;
7465  }
7466 
7467  // Select the optimal vectorization factor.
7468  const LoopVectorizationCostModel::VectorizationFactor VF =
7469  CM.selectVectorizationFactor(OptForSize);
7470 
7471  // Select the interleave count.
7472  unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
7473 
7474  // Get user interleave count.
7475  unsigned UserIC = Hints.getInterleave();
7476 
7477  // Identify the diagnostic messages that should be produced.
7478  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7479  bool VectorizeLoop = true, InterleaveLoop = true;
7480  if (Requirements.doesNotMeet(F, L, Hints)) {
7481  DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7482  "requirements.\n");
7483  emitMissedWarning(F, L, Hints, ORE);
7484  return false;
7485  }
7486 
7487  if (VF.Width == 1) {
7488  DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7489  VecDiagMsg = std::make_pair(
7490  "VectorizationNotBeneficial",
7491  "the cost-model indicates that vectorization is not beneficial");
7492  VectorizeLoop = false;
7493  }
7494 
7495  if (IC == 1 && UserIC <= 1) {
7496  // Tell the user interleaving is not beneficial.
7497  DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7498  IntDiagMsg = std::make_pair(
7499  "InterleavingNotBeneficial",
7500  "the cost-model indicates that interleaving is not beneficial");
7501  InterleaveLoop = false;
7502  if (UserIC == 1) {
7503  IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7504  IntDiagMsg.second +=
7505  " and is explicitly disabled or interleave count is set to 1";
7506  }
7507  } else if (IC > 1 && UserIC == 1) {
7508  // Tell the user interleaving is beneficial, but it is explicitly disabled.
7509  DEBUG(dbgs()
7510  << "LV: Interleaving is beneficial but is explicitly disabled.");
7511  IntDiagMsg = std::make_pair(
7512  "InterleavingBeneficialButDisabled",
7513  "the cost-model indicates that interleaving is beneficial "
7514  "but is explicitly disabled or interleave count is set to 1");
7515  InterleaveLoop = false;
7516  }
7517 
7518  // Override IC if user provided an interleave count.
7519  IC = UserIC > 0 ? UserIC : IC;
7520 
7521  // Emit diagnostic messages, if any.
7522  const char *VAPassName = Hints.vectorizeAnalysisPassName();
7523  if (!VectorizeLoop && !InterleaveLoop) {
7524  // Do not vectorize or interleave the loop.
7525  ORE->emit(OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7526  L->getStartLoc(), L->getHeader())
7527  << VecDiagMsg.second);
7528  ORE->emit(OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7529  L->getStartLoc(), L->getHeader())
7530  << IntDiagMsg.second);
7531  return false;
7532  } else if (!VectorizeLoop && InterleaveLoop) {
7533  DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7534  ORE->emit(OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7535  L->getStartLoc(), L->getHeader())
7536  << VecDiagMsg.second);
7537  } else if (VectorizeLoop && !InterleaveLoop) {
7538  DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
7539  << DebugLocStr << '\n');
7540  ORE->emit(OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7541  L->getStartLoc(), L->getHeader())
7542  << IntDiagMsg.second);
7543  } else if (VectorizeLoop && InterleaveLoop) {
7544  DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
7545  << DebugLocStr << '\n');
7546  DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7547  }
7548 
7549  using namespace ore;
7550  if (!VectorizeLoop) {
7551  assert(IC > 1 && "interleave count should not be 1 or 0");
7552  // If we decided that it is not profitable to vectorize the loop, then
7553  // interleave it.
7554  InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7555  &CM);
7556  Unroller.vectorize();
7557 
7558  ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7559  L->getHeader())
7560  << "interleaved loop (interleaved count: "
7561  << NV("InterleaveCount", IC) << ")");
7562  } else {
7563  // If we decided that it is profitable to vectorize the loop, then do it.
7564  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7565  &LVL, &CM);
7566  LB.vectorize();
7567  ++LoopsVectorized;
7568 
7569  // Add metadata to disable runtime unrolling a scalar loop when there are
7570  // no runtime checks about strides and memory. A scalar loop that is
7571  // rarely used is not worth unrolling.
7572  if (!LB.areSafetyChecksAdded())
7573  AddRuntimeUnrollDisableMetaData(L);
7574 
7575  // Report the vectorization decision.
7576  ORE->emit(OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7577  L->getHeader())
7578  << "vectorized loop (vectorization width: "
7579  << NV("VectorizationFactor", VF.Width)
7580  << ", interleaved count: " << NV("InterleaveCount", IC) << ")");
7581  }
7582 
7583  // Mark the loop as already vectorized to avoid vectorizing again.
7584  Hints.setAlreadyVectorized();
7585 
7586  DEBUG(verifyFunction(*L->getHeader()->getParent()));
7587  return true;
7588 }
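// Illustrative note (assumed metadata shape): Hints.setAlreadyVectorized()
// rewrites the loop ID so a later run of this pass leaves the loop alone, by
// pinning the hints to 1, roughly:
//
//   !0 = distinct !{!0, !1, !2}
//   !1 = !{!"llvm.loop.vectorize.width", i32 1}
//   !2 = !{!"llvm.loop.interleave.count", i32 1}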
7589 
7590 bool LoopVectorizePass::runImpl(
7591  Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7592  DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7593  DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7594  std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7595  OptimizationRemarkEmitter &ORE_) {
7596 
7597  SE = &SE_;
7598  LI = &LI_;
7599  TTI = &TTI_;
7600  DT = &DT_;
7601  BFI = &BFI_;
7602  TLI = TLI_;
7603  AA = &AA_;
7604  AC = &AC_;
7605  GetLAA = &GetLAA_;
7606  DB = &DB_;
7607  ORE = &ORE_;
7608 
7609  // Compute some weights outside of the loop over the loops. Compute this
7610  // using a BranchProbability to re-use its scaling math.
7611  const BranchProbability ColdProb(1, 5); // 20%
7612  ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
7613 
7614  // Don't attempt if
7615  // 1. the target claims to have no vector registers, and
7616  // 2. interleaving won't help ILP.
7617  //
7618  // The second condition is necessary because, even if the target has no
7619  // vector registers, loop vectorization may still enable scalar
7620  // interleaving.
7621  if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
7622  return false;
7623 
7624  // Build up a worklist of inner-loops to vectorize. This is necessary as
7625  // the act of vectorizing or partially unrolling a loop creates new loops
7626  // and can invalidate iterators across the loops.
7627  SmallVector<Loop *, 8> Worklist;
7628 
7629  for (Loop *L : *LI)
7630  addAcyclicInnerLoop(*L, Worklist);
7631 
7632  LoopsAnalyzed += Worklist.size();
7633 
7634  // Now walk the identified inner loops.
7635  bool Changed = false;
7636  while (!Worklist.empty())
7637  Changed |= processLoop(Worklist.pop_back_val());
7638 
7639  // Process each loop nest in the function.
7640  return Changed;
7641 
7642 }
7643 
7644 
7645 PreservedAnalyses LoopVectorizePass::run(Function &F,
7646  FunctionAnalysisManager &AM) {
7647  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
7648  auto &LI = AM.getResult<LoopAnalysis>(F);
7649  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
7650  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
7651  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
7652  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
7653  auto &AA = AM.getResult<AAManager>(F);
7654  auto &AC = AM.getResult<AssumptionAnalysis>(F);
7655  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
7656  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
7657 
7658  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
7659  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
7660  [&](Loop &L) -> const LoopAccessInfo & {
7661  LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI};
7662  return LAM.getResult<LoopAccessAnalysis>(L, AR);
7663  };
7664  bool Changed =
7665  runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE);
7666  if (!Changed)
7667  return PreservedAnalyses::all();
7668  PreservedAnalyses PA;
7669  PA.preserve<LoopAnalysis>();
7670  PA.preserve<DominatorTreeAnalysis>();
7671  PA.preserve<BasicAA>();
7672  PA.preserve<GlobalsAA>();
7673  return PA;
7674 }
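// Usage sketch (assumed client code): under the new pass manager the same
// entry point is reachable from opt as "opt -passes=loop-vectorize in.ll" or
// programmatically:
//
//   llvm::FunctionPassManager FPM;
//   FPM.addPass(llvm::LoopVectorizePass());
//   FPM.run(F, FAM);  // FAM: a FunctionAnalysisManager with the analyses
//                     // requested above already registered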
Legacy wrapper pass to provide the GlobalsAAResult object.
MachineLoop * L
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
Definition: LoopInfo.h:185
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:81
static unsigned RuntimeMemoryCheckThreshold
\brief When performing memory disambiguation checks at runtime do not make more than this number of c...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool enableInterleavedAccessVectorization() const
Enable matching of interleaved access groups.
static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract, const TargetTransformInfo &TTI)
Estimate the overhead of scalarizing a value based on its type.
Value * getValueOperand()
Definition: Instructions.h:391
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:76
This builds on the llvm/ADT/GraphTraits.h file to find the strongly connected components (SCCs) of a ...
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:102
void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction, which must be an operator which supports these flags.
void ReplaceInstWithInst(BasicBlock::InstListType &BIL, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:870
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, bool VariableMask, unsigned Alignment) const
bool isLegalMaskedStore(Type *DataType) const
Return true if the target supports masked load/store AVX2 and AVX-512 targets allow masks for consecu...
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE bool endswith(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:276
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
uint64_t getMaxSafeDepDistBytes() const
static IntegerType * getInt1Ty(LLVMContext &C)
Definition: Type.cpp:166
Diagnostic information for missed-optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function. ...
Definition: Function.cpp:226
LLVMContext & Context
DiagnosticInfoOptimizationBase::Argument NV
STATISTIC(NumFunctions,"Total number of functions")
size_t i
BasicBlock * SplitBlock(BasicBlock *Old, Instruction *SplitPt, DominatorTree *DT=nullptr, LoopInfo *LI=nullptr)
Split the specified block at the specified instruction - everything before SplitPt stays in Old and e...
ValueT lookup(const KeyT &Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:162
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
Instruction * getUnsafeAlgebraInst()
Returns first unsafe algebra instruction in the PHI node's use-chain.
Definition: LoopUtils.h:200
int getWidth()
Get the width of a number.
Definition: ScaledNumber.h:43
This is the interface for a simple mod/ref and alias analysis over globals.
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:52
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:819
static Constant * getSequentialMask(IRBuilder<> &Builder, unsigned NumInt, unsigned NumUndef)
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:414
const Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
Interval Class - An Interval is a set of nodes defined such that every node in the interval has all o...
Definition: Interval.h:37
reverse_iterator rend()
Definition: MapVector.h:60
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:307
Min/max implemented in terms of select(cmp()).
Definition: LoopUtils.h:74
This class represents zero extension of integer types.
unsigned getNumOperands() const
Definition: User.h:167
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const
static Value * ConcatenateVectors(IRBuilder<> &Builder, ArrayRef< Value * > InputList)
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1040
value_op_iterator value_op_begin()
Definition: User.h:231
static void AddRuntimeUnrollDisableMetaData(Loop *L)
The main scalar evolution driver.
iterator end() const
Definition: ArrayRef.h:130
This class represents a function call, abstracting a target machine's calling convention.
int64_t getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp, const ValueToValueMap &StridesMap=ValueToValueMap(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of its element size.
size_type count(PtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:380
An immutable pass that tracks lazily created AssumptionCache objects.
static Type * largestIntegerVectorType(Type *T1, Type *T2)
unsigned getMaxInterleaveFactor(unsigned VF) const
static bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes)
Returns true if Phi is a reduction in TheLoop.
Definition: LoopUtils.cpp:475
A cache of .assume calls within a function.
Analysis pass providing the TargetTransformInfo.
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Don't vectorize loops with a constant ""trip count that is smaller than this ""value."))
We don't vectorize loops with a known constant trip count below this number.
Externally visible function.
Definition: GlobalValue.h:49
void initializeLoopVectorizePass(PassRegistry &)
This class implements a map that also provides access to all stored values in a deterministic order...
Definition: MapVector.h:32
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
value_op_iterator value_op_end()
Definition: User.h:234
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:736
LoopT * getParentLoop() const
Definition: LoopInfo.h:103
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:100
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which ""will be determined by the smallest type in loop."))
unsigned getNumberOfParts(Type *Tp) const
A debug info location.
Definition: DebugLoc.h:34
const Instruction & front() const
Definition: BasicBlock.h:240
Metadata node.
Definition: Metadata.h:830
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.h:234
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:189
An instruction for reading from memory.
Definition: Instructions.h:164
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Return the function type for an intrinsic.
Definition: Function.cpp:905
Hexagon Common GEP
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:79
#define op(i)
Type * getElementType() const
Definition: DerivedTypes.h:462
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:93
bool erase(const T &V)
Definition: SmallSet.h:107
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:78
static Value * ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1, Value *V2)
op_iterator op_begin()
Definition: User.h:205
BlockT * getHeader() const
Definition: LoopInfo.h:102
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
uint64_t getTypeAllocSizeInBits(Type *Ty) const
Returns the offset in bits between successive objects of the specified type, including alignment padd...
Definition: DataLayout.h:418
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:195
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:191
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
Definition: VectorUtils.cpp:86
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
Definition: LoopInfoImpl.h:157
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:345
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:228
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:347
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:53
std::string str() const
Return the twine contents as a std::string.
Definition: Twine.cpp:17
bool hasUnsafeAlgebra()
Returns true if the recurrence has unsafe algebra which requires a relaxed floating-point model...
Definition: LoopUtils.h:197
static Value * getPointerOperand(Instruction &Inst)
bool empty() const
Definition: MapVector.h:63
This class represents the LLVM 'select' instruction.
bool isIdenticalTo(const Instruction *I) const
Return true if the specified instruction is exactly identical to the current one. ...
struct fuzzer::@269 Flags
static const unsigned MaxVectorWidth
Maximum SIMD width.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal)
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:578
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:143
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr it the function does no...
Definition: BasicBlock.cpp:116
bool isLegalMaskedGather(Type *DataType) const
A Use represents the edge between a Value definition and its users.
Definition: Use.h:56
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< unsigned > MaxInterleaveGroupFactor("max-interleave-group-factor", cl::Hidden, cl::desc("Maximum factor for an interleaved access group (default = 8)"), cl::init(8))
Maximum factor for an interleaved memory access.
Diagnostic information for optimization analysis remarks.
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:994
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:588
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:90
Legacy analysis pass which computes BlockFrequencyInfo.
size_type size() const
Definition: SmallSet.h:59
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:257
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:806
ConstantInt * getConstIntStepValue() const
Definition: LoopUtils.cpp:695
Type * getVectorElementType() const
Definition: Type.h:353
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following: ...
static void cse(BasicBlock *BB)
Perform cse of induction variable instructions.
bool remove(const value_type &X)
Remove an item from the set vector.
Definition: SetVector.h:152
static const unsigned TinyTripCountInterleaveThreshold
We don't interleave loops with a known constant trip count below this number.
void addPredicate(const SCEVPredicate &Pred)
Adds a new predicate.
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:60
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:154
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:399
bool isLegalMaskedScatter(Type *DataType) const
Return true if the target supports masked gather/scatter AVX-512 fully supports gather and scatter fo...
user_iterator_impl< User > user_iterator
Definition: Value.h:340
Class to represent function types.
Definition: DerivedTypes.h:102
static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI)
static unsigned getAlignment(GlobalVariable *GV)
#define F(x, y, z)
Definition: MD5.cpp:51
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:136
bool mayReadFromMemory() const
Return true if this instruction may read memory.
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE bool startswith(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:264
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
static bool isEqual(const Function &Caller, const Function &Callee)
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
Definition: LoopInfoImpl.h:188
Value * transform(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL) const
Compute the transformed value of Index at offset StartValue using step StepValue. ...
Definition: LoopUtils.cpp:701
void addChildLoop(LoopT *NewChild)
Add the specified loop to be a child of this loop.
Definition: LoopInfo.h:279
unsigned getRegisterBitWidth(bool Vector) const
Pass * createLoopVectorizePass(bool NoUnrolling=false, bool AlwaysVectorize=true)
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Definition: Type.cpp:646
Function Alias Analysis false
BasicBlock * getSuccessor(unsigned i) const
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:83
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const
static Type * smallestIntegerVectorType(Type *T1, Type *T2)
static FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
Definition: Type.cpp:291
static CmpInst * Create(OtherOps Op, Predicate predicate, Value *S1, Value *S2, const Twine &Name="", Instruction *InsertBefore=nullptr)
Construct a compare instruction, given the opcode, the predicate and the two operands.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Definition: APInt.h:33
bool isFloatingPointTy() const
Return true if this is one of the six floating-point types.
Definition: Type.h:160
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:949
Not an induction variable.
Definition: LoopUtils.h:265
static GCRegistry::Add< OcamlGC > B("ocaml","ocaml 3.10-compatible GC")
An instruction for storing to memory.
Definition: Instructions.h:300
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:151
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:401
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Reverse the order of the vector.
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for ""scalar loops."))
static Constant * getInterleavedMask(IRBuilder<> &Builder, unsigned VF, unsigned NumVec)
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:263
iterator find(const KeyT &Key)
Definition: MapVector.h:131
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:392
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:96
Type * getScalarType() const LLVM_READONLY
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.cpp:44
This class represents a truncation of integer types.
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:141
Maximum length of the test input libFuzzer tries to guess a good value based on the corpus and reports it always prefer smaller inputs during the corpus shuffle When libFuzzer itself reports a bug this exit code will be used If indicates the maximal total time in seconds to run the fuzzer minimizes the provided crash input Use with etc Experimental Use value profile to guide fuzzing Number of simultaneous worker processes to run the jobs If min(jobs, NumberOfCpuCores()/2)\" is used.") FUZZER_FLAG_INT(reload
bool isInBounds() const
Determine whether the GEP has the inbounds flag.
bool runImpl(Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, std::function< const LoopAccessInfo &(Loop &)> &GetLAA_, OptimizationRemarkEmitter &ORE)
Class to represent pointers.
Definition: DerivedTypes.h:443
static GCRegistry::Add< CoreCLRGC > E("coreclr","CoreCLR-compatible GC")
unsigned getNumIncomingValues() const
Return the number of incoming edges.
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:143
Pointer induction var. Step = C / sizeof(elem).
Definition: LoopUtils.h:267
static bool canIfConvertPHINodes(BasicBlock *BB)
Check whether it is safe to if-convert this phi node.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:196
Optimization analysis message produced during vectorization.
ExtractSubvector Index indicates start offset.
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef< Type * > Tys, FastMathFlags FMF) const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs ...
Definition: Instructions.h:830
const SCEV * getCouldNotCompute()
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
#define P(N)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:395
This instruction inserts a single (scalar) element into a VectorType value.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
Integer induction variable. Step = C.
Definition: LoopUtils.h:266
unsigned getAlignment() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:348
Wrapper pass for TargetTransformInfo.
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:107
* if(!EatIfPresent(lltok::kw_thread_local)) return false
ParseOptionalThreadLocal := /*empty.
void dump() const
Definition: LoopInfo.cpp:408
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:109
constexpr bool isPowerOf2_32(uint32_t Value)
isPowerOf2_32 - This function returns true if the argument is a power of two > 0. ...
Definition: MathExtras.h:399
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs...ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:653
LLVM Basic Block Representation.
Definition: BasicBlock.h:51
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar ""reduction in a nested loop."))
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:45
size_type size() const
Definition: SmallPtrSet.h:99
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:48
Type * getType() const
Return the LLVM type of this SCEV expression.
Conditional or Unconditional Branch instruction.
Min/max implemented in terms of select(cmp()).
Definition: LoopUtils.h:77
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:219
Value handle that tracks a Value across RAUW.
Definition: ValueHandle.h:275
static cl::opt< unsigned > PragmaVectorizeSCEVCheckThreshold("pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum number of SCEV checks allowed with a ""vectorize(enable) pragma"))
This is an important base class in LLVM.
Definition: Constant.h:42
This analysis provides dependence information for the memory accesses of a loop.
const Value * getCondition() const
Instruction * getUnsafeAlgebraInst()
Returns induction operator that does not have "fast-math" property and requires FP unsafe mode...
Definition: LoopUtils.h:332
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1321
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:36
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static Type * getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1)
static Value * addFastMathFlag(Value *V)
Adds a 'fast' flag to floating point operations.
#define H(x, y, z)
Definition: MD5.cpp:53
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
A manager for alias analyses.
APInt Or(const APInt &LHS, const APInt &RHS)
Bitwise OR function for APInt.
Definition: APInt.h:1947
char & LCSSAID
Definition: LCSSA.cpp:379
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:368
APInt Xor(const APInt &LHS, const APInt &RHS)
Bitwise XOR function for APInt.
Definition: APInt.h:1952
int getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys) const
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:587
static Constant * getStridedMask(IRBuilder<> &Builder, unsigned Start, unsigned Stride, unsigned VF)
Diagnostic information for applied optimization remarks.
Interval::pred_iterator pred_begin(Interval *I)
pred_begin/pred_end - define methods so that Intervals may be used just like BasicBlocks can with the...
Definition: Interval.h:116
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:259
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
Definition: LoopInfo.cpp:212
static cl::opt< unsigned > PragmaVectorizeMemoryCheckThreshold("pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks with a ""vectorize(enable) pragma."))
void setUnsafeAlgebra()
Definition: Operator.h:205
Represent the analysis usage information of a pass.
op_iterator op_end()
Definition: User.h:207
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:109
bool any_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:743
Analysis pass providing a never-invalidated alias analysis result.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return LHS-RHS. Minus is represented in SCEV as A+B*-1.
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE,"Assign register bank of generic virtual registers", false, false) RegBankSelect
static const unsigned End
iterator begin() const
Definition: ArrayRef.h:129
uint64_t getNumElements() const
Definition: DerivedTypes.h:335
static bool isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop, DominatorTree *DT)
Returns true if Phi is a first-order recurrence.
Definition: LoopUtils.cpp:524
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:298
bool enableAggressiveInterleaving(bool LoopHasReductions) const
Don't restrict interleaved unrolling to small loops.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:52
Value * getOperand(unsigned i) const
Definition: User.h:145
const SCEV * replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, const ValueToValueMap &PtrToStride, Value *Ptr, Value *OrigPtr=nullptr)
Return the SCEV corresponding to a pointer with the symbolic stride replaced with constant one...
Interval::pred_iterator pred_end(Interval *I)
Definition: Interval.h:119
op_range operands()
Definition: User.h:213
Value * getPointerOperand()
Definition: Instructions.h:270
unsigned getSmallConstantTripCount(Loop *L)
Returns the maximum trip count of the loop if it is a single-exit loop and we can compute a small max...
iterator begin() const
Definition: SmallPtrSet.h:398
std::pair< NoneType, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:80
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd)
Create a BitCast AddrSpaceCast, or a PtrToInt cast instruction.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:1629
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const
const APInt & getAPInt() const
void append(in_iter in_start, in_iter in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:392
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:213
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1337
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:113
int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index=0, Type *SubTp=nullptr) const
static unsigned getIncomingValueNumForOperand(unsigned i)
RecurrenceKind getRecurrenceKind()
Definition: LoopUtils.h:187
static unsigned getRecurrenceBinOp(RecurrenceKind Kind)
Returns the opcode of binary operation corresponding to the RecurrenceKind.
Definition: LoopUtils.cpp:589
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:391
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool mayWriteToMemory() const
Return true if this instruction may modify memory.
void emitLoopVectorizeWarning(LLVMContext &Ctx, const Function &Fn, const DebugLoc &DLoc, const Twine &Msg)
Emit a warning when loop vectorization is specified but fails.
Value * getStartValue() const
Definition: LoopUtils.h:292
static void DFS(BasicBlock *Root, SetVector< BasicBlock * > &Set)
char & LoopSimplifyID
OperandValueProperties
Additional properties of an operand's values.
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Definition: LoopUtils.h:63
bool isConditional() const
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:64
void setLoopID(MDNode *LoopID) const
Set the llvm.loop loop id metadata for this loop.
Definition: LoopInfo.cpp:246
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, ScalarEvolution *SE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
IntegerType * getIntPtrType(LLVMContext &C, unsigned AddressSpace=0) const
Returns an integer type with size at least as big as that of a pointer in the given address space...
Definition: DataLayout.cpp:709
A function analysis which provides an AssumptionCache.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
StringRef getString() const
Definition: Metadata.cpp:424
unsigned getABITypeAlignment(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:689
static Constant * getSplat(unsigned NumElts, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1034
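A minimal sketch of getSplat building a <VF x i32> constant; the helper name splatInt32 and its parameters are hypothetical.

#include "llvm/IR/Constants.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Hypothetical helper: build a vector constant with every one of VF lanes
// set to the same 32-bit value C.
static Constant *splatInt32(LLVMContext &Ctx, unsigned VF, uint32_t C) {
  Constant *Elt = ConstantInt::get(Type::getInt32Ty(Ctx), C);
  return ConstantVector::getSplat(VF, Elt);
}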
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:234
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1034
void emitLoopInterleaveWarning(LLVMContext &Ctx, const Function &Fn, const DebugLoc &DLoc, const Twine &Msg)
Emit a warning when loop interleaving is specified but fails.
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:292
Analysis pass which computes BlockFrequencyInfo.
Iterator for intrusive lists based on ilist_node.
static bool mayDivideByZero(Instruction &I)
A helper function for checking whether an integer division-related instruction may divide by zero (in...
See the file comment.
Definition: ValueMap.h:87
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity...
#define LV_NAME
This is the shared class of boolean and integer constants.
Definition: Constants.h:88
void emit(DiagnosticInfoOptimizationBase &OptDiag)
The new interface to emit remarks.
A struct for saving information about induction variables.
Definition: LoopUtils.h:261
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Value * getIncomingValue(unsigned i) const
Return incoming value number i.
uint64_t getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:408
iterator end()
Definition: BasicBlock.h:230
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type...
Definition: Type.cpp:123
AnalysisUsage & addRequiredID(const void *ID)
Definition: Pass.cpp:289
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:58
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:843
Module.h This file contains the declarations for the Module class.
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:230
static cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for ""an instruction to a single constant value. Mostly ""useful for getting consistent testing."))
Provides information about what library functions are available for the current target.
InductionKind getKind() const
Definition: LoopUtils.h:293
AddressSpace
Definition: NVPTXBaseInfo.h:22
static Type * convertPointerToIntegerType(const DataLayout &DL, Type *Ty)
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
TerminatorInst * SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore, bool Unreachable, MDNode *BranchWeights=nullptr, DominatorTree *DT=nullptr, LoopInfo *LI=nullptr)
Split the containing block at the specified instruction - everything before SplitBefore stays in the ...
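A minimal sketch of how SplitBlockAndInsertIfThen can guard a single instruction with a condition, in the spirit of predicating a scalarized memory access; the helper name predicateInstruction is hypothetical.

#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

using namespace llvm;

// Hypothetical helper: split the block before Inst, create an if (Cond)
// block, and move Inst into it so it only executes when Cond is true.
static void predicateInstruction(Instruction *Inst, Value *Cond) {
  TerminatorInst *ThenTerm =
      SplitBlockAndInsertIfThen(Cond, Inst, /*Unreachable=*/false);
  Inst->moveBefore(ThenTerm);
}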
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:50
LLVM_NODISCARD T pop_back_val()
Definition: SmallVector.h:382
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:307
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:1656
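A minimal sketch of a broadcast built from CreateShuffleVector: insert the scalar into lane 0 and shuffle with an all-zero mask so every lane reads element 0. The helper name broadcast is hypothetical.

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Hypothetical helper: widen a scalar to a VF-wide vector by shuffling
// lane 0 into every lane (the shuffle mask is a splat of the constant 0).
static Value *broadcast(IRBuilder<> &B, Value *Scalar, unsigned VF) {
  Type *VecTy = VectorType::get(Scalar->getType(), VF);
  Value *V = B.CreateInsertElement(UndefValue::get(VecTy), Scalar,
                                   B.getInt32(0));
  Constant *Zeros = ConstantVector::getSplat(VF, B.getInt32(0));
  return B.CreateShuffleVector(V, UndefValue::get(VecTy), Zeros, "broadcast");
}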
Drive the analysis of memory accesses in the loop.
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:558
Function * getCalledFunction() const
Return the function called, or null if this is an indirect function invocation.
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.cpp:572
static BranchInst * Create(BasicBlock *IfTrue, Instruction *InsertBefore=nullptr)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
reverse_iterator rbegin()
Definition: MapVector.h:58
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:198
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
pred_range predecessors(BasicBlock *BB)
Definition: IR/CFG.h:110
static Constant * get(Type *Ty, double V)
This returns a ConstantFP, or a vector containing a splat of a ConstantFP, for the specified value in...
Definition: Constants.cpp:623
bool hasVectorInstrinsicScalarOpd(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the intrinsic has a scalar operand.
Definition: VectorUtils.cpp:71
static GCRegistry::Add< ShadowStackGC > C("shadow-stack","Very portable GC for uncooperative code generators")
void setOperand(unsigned i, Value *Val)
Definition: User.h:150
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
size_type count(const KeyT &Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:122
static Constant * getRecurrenceIdentity(RecurrenceKind K, Type *Tp)
Returns identity corresponding to the RecurrenceKind.
Definition: LoopUtils.cpp:563
Store the result of a depth first search within basic blocks contained by a single loop...
Definition: LoopIterator.h:98
void clear()
Completely clear the SetVector.
Definition: SetVector.h:210
Class to represent vector types.
Definition: DerivedTypes.h:369
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:602
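A minimal sketch of hash_combine folding several fields into one hash_code, e.g. for use as a map key; the helper name hashKey and its parameters are hypothetical.

#include "llvm/ADT/Hashing.h"

// Hypothetical helper: mix three integral fields into a single hash_code.
static llvm::hash_code hashKey(unsigned Opcode, unsigned Width, bool Signed) {
  return llvm::hash_combine(Opcode, Width, Signed);
}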
Value * getIncomingValueForBlock(const BasicBlock *BB) const
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:195
iterator_range< user_iterator > users()
Definition: Value.h:370
BasicBlock * getSinglePredecessor()
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:226
static Value * createMinMaxOp(IRBuilder<> &Builder, MinMaxRecurrenceKind RK, Value *Left, Value *Right)
Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
Definition: LoopUtils.cpp:614
unsigned getVectorNumElements() const
Definition: DerivedTypes.h:438
static const char lv_name[]
This class uses information about analyzed scalars to rewrite expressions in canonical form...
bool isPredicated(MCInstrInfo const &MCII, MCInst const &MCI)
static const unsigned MaxInterleaveFactor
Maximum vectorization interleave count.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:480
Holds information about the memory runtime legality checks to verify that a group of pointers do not ...
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:130
loop Loop Strength Reduction
bool hasUnsafeAlgebra()
Returns true if the induction type is FP and the binary operator does not have the "fast-math" proper...
Definition: LoopUtils.h:325
APInt And(const APInt &LHS, const APInt &RHS)
Bitwise AND function for APInt.
Definition: APInt.h:1942
Analysis pass that exposes the ScalarEvolution for a function.
unsigned getComplexity() const override
We estimate the complexity of a union predicate as the number of predicates in the union...
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1132
Analysis pass providing a never-invalidated alias analysis result.
static cl::opt< bool > EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization."))
iterator end() const
Definition: SmallPtrSet.h:405
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEVUnionPredicate & getUnionPredicate() const
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:384
This analysis provides dependence information for the memory accesses of a loop.
Value * getCondition() const
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may affect Scalar...
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:205
bool isAggregateType() const
Return true if the type is an aggregate type.
Definition: Type.h:247
static const size_t npos
Definition: StringRef.h:51
static unsigned getVectorCallCost(CallInst *CI, unsigned VF, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, bool &NeedToScalarize)
bool isLegalMaskedLoad(Type *DataType) const
This class represents an analyzed expression in the program.
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:169
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:148
Floating point induction variable.
Definition: LoopUtils.h:268
Instruction * getLoopExitInstr()
Definition: LoopUtils.h:193
bool isFunctionVectorizable(StringRef F, unsigned VF) const
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:368
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
unsigned getAlignment() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:227
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, unsigned Alignment, unsigned AddressSpace) const
static bool runImpl(CallGraphSCC &SCC, CallGraph &CG, function_ref< AAResults &(Function &F)> AARGetter, unsigned MaxElements)
const NodeList & List
Definition: RDFGraph.cpp:205
static void emitAnalysis(const LoopAccessReport &Message, const Loop *TheLoop, const char *PassName, OptimizationRemarkEmitter &ORE)
Emit an analysis note for PassName with the debug location from the instruction in Message if availab...
static Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
#define I(x, y, z)
Definition: MD5.cpp:54
TerminatorInst * getTerminator()
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:124
LLVM_ATTRIBUTE_ALWAYS_INLINE size_type size() const
Definition: SmallVector.h:135
bool hasOneUse() const
Return true if there is exactly one user of this value.
Definition: Value.h:383
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1099
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:4430
static cl::opt< unsigned, true > VectorizationFactor("force-vector-width", cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect."), cl::location(VectorizerParams::VectorizationFactor))
iterator_range< value_op_iterator > operand_values()
Definition: User.h:237
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:287
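A minimal sketch of dyn_cast: the result is null when the value is not of the requested class, so it must be tested before use. The helper name getStoredPointer is hypothetical.

#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// Hypothetical helper: return the pointer operand if I is a store,
// nullptr otherwise.
static Value *getStoredPointer(Instruction *I) {
  if (auto *SI = dyn_cast<StoreInst>(I))
    return SI->getPointerOperand();
  return nullptr;
}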
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for ""vectorized loops."))
void preserve()
Mark an analysis as preserved.
Definition: PassManager.h:120
static volatile int Zero
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:374
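A minimal sketch of splitBasicBlock; the helper name splitBefore is hypothetical.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Hypothetical helper: split BB before Inst. Everything from Inst to the end
// moves into the returned block, and BB falls through to it with an
// unconditional branch.
static BasicBlock *splitBefore(BasicBlock *BB, Instruction *Inst) {
  return BB->splitBasicBlock(Inst->getIterator(), "split");
}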
uint64_t getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type...
Definition: DataLayout.h:391
Collection of parameters shared between the Loop Vectorizer and the Loop Access Analysis.
const SCEV * getStep() const
Definition: LoopUtils.h:294
static OptimizationRemarkAnalysis createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I=nullptr)
Create an analysis remark that explains why vectorization failed.
Analysis pass providing the TargetLibraryInfo.
const unsigned Kind
iterator_range< op_iterator > arg_operands()
Iteration adapter for range-for loops.
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:178
bool use_empty() const
Definition: Value.h:299
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef())
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO ""heuristics minimizing code growth in cold regions and being more ""aggressive in hot regions."))
SmallPtrSet< Instruction *, 8 > & getCastInsts()
Returns a reference to the instructions used for type-promoting the recurrence.
Definition: LoopUtils.h:235
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:33
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:463
unsigned getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:108
uint64_t PowerOf2Floor(uint64_t A)
Returns the power of two which is less than or equal to the given value.
Definition: MathExtras.h:631
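A minimal sketch of PowerOf2Floor rounding a candidate factor down to a power of two, e.g. 6 becomes 4 while 8 stays 8; the helper name clampToPowerOf2 is hypothetical.

#include "llvm/Support/MathExtras.h"
#include <cassert>

// Hypothetical helper: round a positive candidate factor down to a power of 2.
static unsigned clampToPowerOf2(unsigned Candidate) {
  assert(Candidate > 0 && "expected a positive factor");
  return static_cast<unsigned>(llvm::PowerOf2Floor(Candidate));
}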
LLVM Value Representation.
Definition: Value.h:71
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:111
static VectorType * get(Type *ElementType, unsigned NumElements)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:631
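A minimal sketch of VectorType::get widening a scalar type to its VF-wide form; the helper name toVectorTy is hypothetical.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Hypothetical helper: return a <VF x Scalar> type, or the scalar type
// itself when no widening is wanted.
static Type *toVectorTy(Type *Scalar, unsigned VF) {
  if (VF == 1 || Scalar->isVoidTy())
    return Scalar;
  return VectorType::get(Scalar, VF);
}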
The LoopVectorize Pass.
Definition: LoopVectorize.h:71
OptimizationRemarkEmitter legacy analysis pass.
std::underlying_type< E >::type Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:81
static bool blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, DominatorTree *DT)
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
Definition: Instruction.cpp:95
Broadcast element 0 to all other elements.
unsigned getNumberOfRegisters(bool Vector) const
uint64_t getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:533
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, SmallPtrSetImpl< Value * > &AllowedExit)
Check that the instruction has outside loop users and is not an identified reduction variable...
bool empty() const
Definition: LoopInfo.h:136
#define DEBUG(X)
Definition: Debug.h:100
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:831
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:678
print Print MemDeps of function
static unsigned VectorizationInterleave
Interleave factor as overridden by the user.
int getCFInstrCost(unsigned Opcode) const
Convenience struct for specifying and reasoning about fast-math flags.
Definition: Operator.h:168
OperandValueKind
Additional information about an operand's possible values.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:47
A single uniqued string.
Definition: Metadata.h:586
A container for analyses that lazily runs them and caches their results.
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index=-1) const
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, const Twine &N="", Module *M=nullptr)
Definition: Function.h:117
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:217
iterator getFirstInsertionPt()
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:209
int * Ptr
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object...
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop)...
Definition: CodeMetrics.cpp:73
void setIncomingValue(unsigned i, Value *V)
static bool isInterleaveForced()
True if force-vector-interleave was specified by the user.
op_range incoming_values()
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:479
static GCRegistry::Add< ErlangGC > A("erlang","erlang-compatible garbage collector")
Root of the metadata hierarchy.
Definition: Metadata.h:55
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
int getAddressComputationCost(Type *Ty, ScalarEvolution *SE=nullptr, const SCEV *Ptr=nullptr) const
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy=nullptr) const
The optimization diagnostic interface.
const BasicBlock * getParent() const
Definition: Instruction.h:62
static bool isInductionPHI(PHINode *Phi, const Loop *L, ScalarEvolution *SE, InductionDescriptor &D, const SCEV *Expr=nullptr)
Returns true if Phi is an induction in the loop L.
Definition: LoopUtils.cpp:854
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:206
int getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info=OK_AnyValue, OperandValueKind Opd2Info=OK_AnyValue, OperandValueProperties Opd1PropInfo=OP_None, OperandValueProperties Opd2PropInfo=OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >()) const
#define T1
static cl::opt< unsigned > VectorizeSCEVCheckThreshold("vectorize-scev-check-threshold", cl::init(16), cl::Hidden, cl::desc("The maximum number of SCEV checks allowed."))
RecurrenceKind
This enum represents the kinds of recurrences that we support.
Definition: LoopUtils.h:67
Enumerate the SCCs of a directed graph in reverse topological order of the SCC DAG.
Definition: SCCIterator.h:43
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal].
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
This class represents a constant integer value.
Legacy wrapper pass to provide the BasicAAResult object.
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:905
Type * getRecurrenceType()
Returns the type of the recurrence.
Definition: LoopUtils.h:231
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:783
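A minimal sketch of is_contained replacing the explicit std::find pattern; the helper name hasWidth is hypothetical.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

// Hypothetical helper: true if W appears anywhere in Widths, without spelling
// out std::find(Widths.begin(), Widths.end(), W) != Widths.end().
static bool hasWidth(const llvm::SmallVectorImpl<unsigned> &Widths,
                     unsigned W) {
  return llvm::is_contained(Widths, W);
}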
user_iterator user_end()
Definition: Value.h:354