//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
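//
// For example (a sketch, assuming a vectorization factor of 4 and a trip
// count that is a multiple of 4), a scalar loop such as
//
//   for (i = 0; i < n; ++i)
//     C[i] = A[i] + B[i];
//
// becomes a 'wide' loop in which each iteration operates on four elements:
//
//   for (i = 0; i < n; i += 4)
//     C[i:i+3] = A[i:i+3] + B[i:i+3]; // one <4 x i32> add in LLVM-IR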
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC - two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/AssumptionTracker.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/VectorUtils.h"
#include <algorithm>
#include <map>
#include <tuple>

using namespace llvm;
using namespace llvm::PatternMatch;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

static cl::opt<unsigned>
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
                    cl::desc("Sets the SIMD width. Zero is autoselect."));

static cl::opt<unsigned>
VectorizationInterleave("force-vector-interleave", cl::init(0), cl::Hidden,
                    cl::desc("Sets the vectorization interleave count. "
                             "Zero is autoselect."));

static cl::opt<bool>
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                   cl::desc("Enable if-conversion during vectorization."));

/// We don't vectorize loops with a known constant trip count below this number.
static cl::opt<unsigned>
TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
                             cl::Hidden,
                             cl::desc("Don't vectorize loops with a constant "
                                      "trip count that is smaller than this "
                                      "value."));

/// This enables versioning on the strides of symbolically striding memory
/// accesses in code like the following.
///   for (i = 0; i < N; ++i)
///     A[i * Stride1] += B[i * Stride2] ...
///
/// Will be roughly translated to
///    if (Stride1 == 1 && Stride2 == 1) {
///      for (i = 0; i < N; i+=4)
///       A[i:i+3] += ...
///    } else
///      ...
static cl::opt<bool> EnableMemAccessVersioning(
    "enable-mem-access-versioning", cl::init(true), cl::Hidden,
    cl::desc("Enable symbolic stride memory access versioning"));

/// We don't unroll loops with a known constant trip count below this number.
static const unsigned TinyTripCountUnrollThreshold = 128;

/// When performing memory disambiguation checks at runtime do not make more
/// than this number of comparisons.
static const unsigned RuntimeMemoryCheckThreshold = 8;

/// Maximum SIMD width.
static const unsigned MaxVectorWidth = 64;

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc("The cost of a loop that is considered 'small' by the unroller."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime unroll loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeUnroll(
    "enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden,
    cl::desc("Enable runtime unrolling until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when unrolling"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(false), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionUF(
    "max-nested-scalar-reduction-unroll", cl::init(2), cl::Hidden,
    cl::desc("The maximum unroll factor to use when unrolling a scalar "
             "reduction in a nested loop."));

namespace {

// Forward declarations.
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class LoopVectorizeHints;

/// Optimization analysis message produced during vectorization. Messages
/// inform the user why vectorization did not occur.
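/// A typical use (a sketch of the intended streaming style; 'Load' is a
/// hypothetical offending instruction) is:
///   emitAnalysis(Report(Load) << "read with a non-unit stride");
/// The constructor prefixes every message with "loop not vectorized: ".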
class Report {
  std::string Message;
  raw_string_ostream Out;
  Instruction *Instr;

public:
  Report(Instruction *I = nullptr) : Out(Message), Instr(I) {
    Out << "loop not vectorized: ";
  }

  template <typename A> Report &operator<<(const A &Value) {
    Out << Value;
    return *this;
  }

  Instruction *getInstr() { return Instr; }

  std::string &str() { return Out.str(); }
  operator Twine() { return Out.str(); }
};

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                      DominatorTree *DT, const DataLayout *DL,
                      const TargetLibraryInfo *TLI, unsigned VecWidth,
                      unsigned UnrollFactor)
      : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
        VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
        Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
        Legal(nullptr) {}

  // Perform the actual loop widening (vectorization).
  void vectorize(LoopVectorizationLegality *L) {
    Legal = L;
    // Create a new empty loop. Unlink the old loop and connect the new one.
    createEmptyLoop();
    // Widen each instruction in the old loop to a new one in the new loop.
    // Use the Legality module to find the induction and reduction variables.
    vectorizeLoop();
    // Register the new loop and update the analysis passes.
    updateAnalysis();
  }

  virtual ~InnerLoopVectorizer() {}

protected:
  /// A small list of PHINodes.
  typedef SmallVector<PHINode*, 4> PhiVector;
  /// When we unroll loops we have multiple vector values for each scalar.
  /// This data structure holds the unrolled and vectorized values that
  /// originated from one scalar instruction.
  typedef SmallVector<Value*, 2> VectorParts;

  // When we if-convert we need to create edge masks. We have to cache values
  // so that we don't end up with exponential recursion/IR growth.
  typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>,
                   VectorParts> EdgeMaskCache;
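  // For example (a sketch), when if-converting
  //   if (Cond[i]) A[i] = X;
  // the store executes under the mask of the edge that reaches its block;
  // caching the mask for each (Src, Dst) pair keeps repeated queries from
  // regenerating the same IR.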

  /// \brief Add code that checks at runtime if the accessed arrays overlap.
  ///
  /// Returns a pair of instructions: the first element is the first
  /// instruction generated in a possibly multi-instruction sequence, and the
  /// second is the final comparator value, or NULL if no check is needed.
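  ///
  /// A minimal sketch of the emitted check for two pointers A and B accessed
  /// over N bytes (the real code compares SCEV-computed start and end values
  /// for every pointer pair):
  ///   bool Conflict = !(A + N <= B || B + N <= A);
  ///   // Conflict ? branch to the scalar loop : branch to the vector loop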
  std::pair<Instruction *, Instruction *> addRuntimeCheck(Instruction *Loc);

  /// \brief Add checks for strides that were assumed to be 1.
  ///
  /// Returns the first and last check instructions as the pair
  /// (first, last).
  std::pair<Instruction *, Instruction *> addStrideCheck(Instruction *Loc);

  /// Create an empty loop, based on the loop ranges of the old loop.
  void createEmptyLoop();
  /// Copy and widen the instructions from the old loop.
  virtual void vectorizeLoop();

  /// \brief The Loop exit block may have single value PHI nodes where the
  /// incoming value is 'Undef'. While vectorizing we only handled real values
  /// that were defined inside the loop. Here we fix the 'undef case'.
  /// See PR14725.
  void fixLCSSAPHIs();

  /// A helper function that computes the predicate of the block BB, assuming
  /// that the header block of the loop is set to True. It returns the *entry*
  /// mask for the block BB.
  VectorParts createBlockInMask(BasicBlock *BB);
  /// A helper function that computes the predicate of the edge between SRC
  /// and DST.
  VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);

  /// A helper function to vectorize a single BB within the innermost loop.
  void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VectorParts &Entry,
                           unsigned UF, unsigned VF, PhiVector *PV);

  /// Insert the new loop to the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// This instruction is un-vectorizable. Implement it as a sequence
  /// of scalars. If \p IfPredicateStore is true we need to 'hide' each
  /// scalarized instruction behind an if block predicated on the control
  /// dependence of the instruction.
  virtual void scalarizeInstruction(Instruction *Instr,
                                    bool IfPredicateStore=false);

  /// Vectorize Load and Store instructions.
  virtual void vectorizeMemoryInstruction(Instruction *Instr);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1,
  /// ...; this is needed because each iteration in the loop corresponds to a
  /// SIMD element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + 1, ...) to each vector element
  /// of Val. If Negate is set then negative numbers are added instead, e.g.
  /// (StartIdx, StartIdx - 1, StartIdx - 2, ...).
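  ///
  /// For example (a sketch, with VF = 4): broadcasting the induction value
  /// gives <i, i, i, i>; adding the step vector <0, 1, 2, 3> (StartIdx = 0)
  /// yields <i, i+1, i+2, i+3>, and with Negate set it yields
  /// <i, i-1, i-2, i-3>.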
  virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);

  /// When we go over instructions in the basic block we rely on previous
  /// values within the current basic block or on loop invariant values.
  /// When we widen (vectorize) values we place them in the map. If the values
  /// are not within the map, they have to be loop invariant, so we simply
  /// broadcast them into a vector.
  VectorParts &getVectorValue(Value *V);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// This is a helper class that holds the vectorizer state. It maps scalar
  /// instructions to vector instructions. When the code is 'unrolled' then
  /// a single scalar value is mapped to multiple vector parts. The parts
  /// are stored in the VectorParts type.
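  /// For example (a sketch, with UF = 2): widening one scalar add produces
  /// two vector adds, stored as WidenMap.get(ScalarAdd)[0] and
  /// WidenMap.get(ScalarAdd)[1].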
  struct ValueMap {
    /// C'tor.  UnrollFactor controls the number of vectors ('parts') that
    /// are mapped.
    ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}

    /// \return True if 'Key' is saved in the Value Map.
    bool has(Value *Key) const { return MapStorage.count(Key); }

    /// Initializes a new entry in the map. Sets all of the vector parts to the
    /// same value in 'Val'.
    /// \return A reference to a vector with splat values.
    VectorParts &splat(Value *Key, Value *Val) {
      VectorParts &Entry = MapStorage[Key];
      Entry.assign(UF, Val);
      return Entry;
    }

    ///\return A reference to the value that is stored at 'Key'.
    VectorParts &get(Value *Key) {
      VectorParts &Entry = MapStorage[Key];
      if (Entry.empty())
        Entry.resize(UF);
      assert(Entry.size() == UF);
      return Entry;
    }

  private:
    /// The unroll factor. Each entry in the map stores this number of vector
    /// elements.
    unsigned UF;

    /// Map storage. We use std::map and not DenseMap because insertions into
    /// a dense map invalidate its iterators.
    std::map<Value *, VectorParts> MapStorage;
  };

  /// The original loop.
  Loop *OrigLoop;
  /// Scev analysis to use.
  ScalarEvolution *SE;
  /// Loop Info.
  LoopInfo *LI;
  /// Dominator Tree.
  DominatorTree *DT;
  /// Alias Analysis.
  AliasAnalysis *AA;
  /// Data Layout.
  const DataLayout *DL;
  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

protected:
  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;
  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;
  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;
  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;
  /// The vector loop body.
  SmallVector<BasicBlock *, 4> LoopVectorBody;
  /// The scalar loop body.
  BasicBlock *LoopScalarBody;
  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction;
  /// The induction variable of the old basic block.
  PHINode *OldInduction;
  /// Holds the extended (to the widest induction type) start index.
  Value *ExtendedIdx;
  /// Maps scalars to widened vectors.
  ValueMap WidenMap;
  EdgeMaskCache MaskCache;

  LoopVectorizationLegality *Legal;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                    DominatorTree *DT, const DataLayout *DL,
                    const TargetLibraryInfo *TLI, unsigned UnrollFactor) :
    InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { }

private:
  void scalarizeInstruction(Instruction *Instr,
                            bool IfPredicateStore = false) override;
  void vectorizeMemoryInstruction(Instruction *Instr) override;
  Value *getBroadcastInstrs(Value *V) override;
  Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate) override;
  Value *reverseVector(Value *Vec) override;
};

/// \brief Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

/// \brief Set the debug location in the builder using the debug location in
/// the instruction.
static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr))
    B.SetCurrentDebugLocation(Inst->getDebugLoc());
  else
    B.SetCurrentDebugLocation(DebugLoc());
}

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    const DebugLoc LoopDbgLoc = L->getStartLoc();
    if (!LoopDbgLoc.isUnknown())
      LoopDbgLoc.print(L->getHeader()->getContext(), OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

/// \brief Propagate known metadata from one instruction to another.
static void propagateMetadata(Instruction *To, const Instruction *From) {
  SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
  From->getAllMetadataOtherThanDebugLoc(Metadata);

  for (auto M : Metadata) {
    unsigned Kind = M.first;

    // These are safe to transfer (this is safe for TBAA, even when we
    // if-convert, because should that metadata have had a control dependency
    // on the condition, and thus actually aliased with some other
    // non-speculated memory access when the condition was false, this would be
    // caught by the runtime overlap checks).
    if (Kind != LLVMContext::MD_tbaa &&
        Kind != LLVMContext::MD_alias_scope &&
        Kind != LLVMContext::MD_noalias &&
        Kind != LLVMContext::MD_fpmath)
      continue;

    To->setMetadata(Kind, M.second);
  }
}

/// \brief Propagate known metadata from one instruction to a vector of others.
static void propagateMetadata(SmallVectorImpl<Value *> &To,
                              const Instruction *From) {
  for (Value *V : To)
    if (Instruction *I = dyn_cast<Instruction>(V))
      propagateMetadata(I, From);
}

/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
/// legality. This class has two main kinds of checks:
/// * Memory checks - The code in canVectorizeMemory checks if vectorization
///   will change the order of memory accesses in a way that will change the
///   correctness of the program.
/// * Scalar checks - The code in canVectorizeInstrs and canVectorizeMemory
///   checks for a number of different conditions, such as the availability of
///   a single induction variable, that all types are supported and
///   vectorize-able, etc.
/// This code reflects the capabilities of InnerLoopVectorizer.
/// This class is also used by InnerLoopVectorizer for identifying the
/// induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
  unsigned NumLoads;
  unsigned NumStores;
  unsigned NumPredStores;

  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
                            DominatorTree *DT, TargetLibraryInfo *TLI,
                            AliasAnalysis *AA, Function *F,
                            const TargetTransformInfo *TTI)
      : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
        DT(DT), TLI(TLI), AA(AA), TheFunction(F), TTI(TTI), Induction(nullptr),
        WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {
  }

  /// This enum represents the kinds of reductions that we support.
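  /// For example (a sketch), in the loop
  ///   for (i = 0; i < n; ++i) Sum += A[i];
  /// the PHI node for 'Sum' is a reduction variable of kind RK_IntegerAdd
  /// (RK_FloatAdd if 'Sum' is floating point and the no-NaNs requirements
  /// are met).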
  enum ReductionKind {
    RK_NoReduction, ///< Not a reduction.
    RK_IntegerAdd,  ///< Sum of integers.
    RK_IntegerMult, ///< Product of integers.
    RK_IntegerOr,   ///< Bitwise or logical OR of numbers.
    RK_IntegerAnd,  ///< Bitwise or logical AND of numbers.
    RK_IntegerXor,  ///< Bitwise or logical XOR of numbers.
    RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()).
    RK_FloatAdd,    ///< Sum of floats.
    RK_FloatMult,   ///< Product of floats.
    RK_FloatMinMax  ///< Min/max implemented in terms of select(cmp()).
  };

  /// This enum represents the kinds of inductions that we support.
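  /// For example (a sketch): 'i' in "for (i = 0; i < n; ++i)" is
  /// IK_IntInduction, and 'p' in "for (p = A; p != A + n; ++p)" is
  /// IK_PtrInduction.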
  enum InductionKind {
    IK_NoInduction,         ///< Not an induction variable.
    IK_IntInduction,        ///< Integer induction variable. Step = 1.
    IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1.
    IK_PtrInduction,        ///< Pointer induction var. Step = sizeof(elem).
    IK_ReversePtrInduction  ///< Reverse ptr indvar. Step = - sizeof(elem).
  };

  /// This enum represents the kind of minmax reduction.
  enum MinMaxReductionKind {
    MRK_Invalid,
    MRK_UIntMin,
    MRK_UIntMax,
    MRK_SIntMin,
    MRK_SIntMax,
    MRK_FloatMin,
    MRK_FloatMax
  };

  /// This struct holds information about reduction variables.
  struct ReductionDescriptor {
    ReductionDescriptor() : StartValue(nullptr), LoopExitInstr(nullptr),
      Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}

    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K,
                        MinMaxReductionKind MK)
        : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {}

    // The starting value of the reduction.
    // It does not have to be zero!
    TrackingVH<Value> StartValue;
    // The instruction whose value is used outside the loop.
    Instruction *LoopExitInstr;
    // The kind of the reduction.
    ReductionKind Kind;
    // If this is a min/max reduction, the kind of reduction.
    MinMaxReductionKind MinMaxKind;
  };

  /// This POD struct holds information about a potential reduction operation.
  struct ReductionInstDesc {
    ReductionInstDesc(bool IsRedux, Instruction *I) :
      IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {}

    ReductionInstDesc(Instruction *I, MinMaxReductionKind K) :
      IsReduction(true), PatternLastInst(I), MinMaxKind(K) {}

    // Is this instruction a reduction candidate?
    bool IsReduction;
    // The last instruction in a min/max pattern (select of the select(icmp())
    // pattern), or the current reduction instruction otherwise.
    Instruction *PatternLastInst;
    // If this is a min/max pattern, the comparison predicate.
    MinMaxReductionKind MinMaxKind;
  };

  /// This struct holds information about the memory runtime legality
  /// check that a group of pointers do not overlap.
  struct RuntimePointerCheck {
    RuntimePointerCheck() : Need(false) {}

    /// Reset the state of the pointer runtime information.
    void reset() {
      Need = false;
      Pointers.clear();
      Starts.clear();
      Ends.clear();
      IsWritePtr.clear();
      DependencySetId.clear();
      AliasSetId.clear();
    }

    /// Insert a pointer and calculate the start and end SCEVs.
    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr,
                unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides);

    /// This flag indicates if we need to add the runtime check.
    bool Need;
    /// Holds the pointers that we need to check.
    SmallVector<TrackingVH<Value>, 2> Pointers;
    /// Holds the pointer value at the beginning of the loop.
    SmallVector<const SCEV*, 2> Starts;
    /// Holds the pointer value at the end of the loop.
    SmallVector<const SCEV*, 2> Ends;
    /// Holds the information if this pointer is used for writing to memory.
    SmallVector<bool, 2> IsWritePtr;
    /// Holds the id of the set of pointers that could be dependent because of
    /// a shared underlying object.
    SmallVector<unsigned, 2> DependencySetId;
    /// Holds the id of the disjoint alias set to which this pointer belongs.
    SmallVector<unsigned, 2> AliasSetId;
  };

  /// A struct for saving information about induction variables.
  struct InductionInfo {
    InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
    InductionInfo() : StartValue(nullptr), IK(IK_NoInduction) {}
    /// Start value.
    TrackingVH<Value> StartValue;
    /// Induction kind.
    InductionKind IK;
  };

  /// ReductionList contains the reduction descriptors for all
  /// of the reductions that were found in the loop.
  typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;

  /// InductionList saves induction variables and maps them to the
  /// induction descriptor.
  typedef MapVector<PHINode*, InductionInfo> InductionList;

  /// Returns true if it is legal to vectorize this loop.
  /// This does not mean that it is profitable to vectorize this
  /// loop, only that it is legal to do so.
  bool canVectorize();

  /// Returns the Induction variable.
  PHINode *getInduction() { return Induction; }

  /// Returns the reduction variables found in the loop.
  ReductionList *getReductionVars() { return &Reductions; }

  /// Returns the induction variables found in the loop.
  InductionList *getInductionVars() { return &Inductions; }

  /// Returns the widest induction type.
  Type *getWidestInductionType() { return WidestIndTy; }

  /// Returns true if V is an induction variable in this loop.
  bool isInductionVariable(const Value *V);

  /// Return true if the block BB needs to be predicated in order for the loop
  /// to be vectorized.
  bool blockNeedsPredication(BasicBlock *BB);

  /// Check if this pointer is consecutive when vectorizing. This happens
  /// when the last index of the GEP is the induction variable, or when the
  /// pointer itself is an induction variable.
  /// This check allows us to vectorize A[idx] into a wide load/store.
  /// Returns:
  /// 0 - Stride is unknown or non-consecutive.
  /// 1 - Address is consecutive.
  /// -1 - Address is consecutive, and decreasing.
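  /// For example (a sketch): A[i] yields 1, A[n - i] yields -1, and A[2 * i]
  /// yields 0 because the accesses are not consecutive.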
  int isConsecutivePtr(Value *Ptr);

  /// Returns true if the value V is uniform within the loop.
  bool isUniform(Value *V);

  /// Returns true if this instruction will remain scalar after vectorization.
  bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }

  /// Returns the information that we collected about runtime memory check.
  RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }

  /// This function returns the identity element (or neutral element) for
  /// the operation K.
  static Constant *getReductionIdentity(ReductionKind K, Type *Tp);

  unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }

  bool hasStride(Value *V) { return StrideSet.count(V); }
  bool mustCheckStrides() { return !StrideSet.empty(); }
  SmallPtrSet<Value *, 8>::iterator strides_begin() {
    return StrideSet.begin();
  }
  SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
    return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr));
  }
  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
    return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr));
  }
  /// Returns true if the vector representation of the instruction \p I
  /// requires a mask.
  bool isMaskRequired(const Instruction* I) {
    return (MaskedOp.count(I) != 0);
  }
private:
  /// Check if a single basic block loop is vectorizable.
  /// At this point we know that this is a loop with a constant trip count
  /// and we only need to check individual instructions.
  bool canVectorizeInstrs();

  /// When we vectorize loops we may change the order in which
  /// we read and write from memory. This method checks if it is
  /// legal to vectorize the code, considering only memory constraints.
  /// Returns true if the loop is vectorizable.
  bool canVectorizeMemory();

  /// Return true if we can vectorize this loop using the IF-conversion
  /// transformation.
  bool canVectorizeWithIfConvert();

  /// Collect the variables that need to stay uniform after vectorization.
  void collectLoopUniforms();

  /// Return true if all of the instructions in the block can be speculatively
  /// executed. \p SafePtrs is a list of addresses that are known to be legal
  /// and that we know we can read from without faulting.
  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);

  /// Returns true if 'Phi' is a reduction variable of the kind 'Kind'. If it
  /// is, the variable is added to ReductionList.
  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
  /// Returns a struct describing if the instruction 'I' can be a reduction
  /// variable of type 'Kind'. If the reduction is a min/max pattern of
  /// select(icmp()) this function advances the instruction pointer 'I' from the
  /// compare instruction to the select instruction and stores this pointer in
  /// the 'PatternLastInst' member of the returned struct.
  ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind,
                                     ReductionInstDesc &Desc);
  /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
  /// pattern corresponding to a min(X, Y) or max(X, Y).
  static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I,
                                                    ReductionInstDesc &Prev);
  /// Returns the induction kind of Phi. This function may return
  /// IK_NoInduction if the PHI is not an induction variable.
  InductionKind isInductionVariable(PHINode *Phi);

  /// \brief Collect memory accesses with loop-invariant strides.
  ///
  /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop
  /// invariant.
  void collectStridedAcccess(Value *LoadOrStoreInst);

  /// Report an analysis message to assist the user in diagnosing loops that
  /// are not vectorized.
  void emitAnalysis(Report &Message) {
    DebugLoc DL = TheLoop->getStartLoc();
    if (Instruction *I = Message.getInstr())
      DL = I->getDebugLoc();
    emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE,
                                   *TheFunction, DL, Message.str());
  }

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// DataLayout analysis.
  const DataLayout *DL;
  /// Dominators.
  DominatorTree *DT;
  /// Target Library Info.
  TargetLibraryInfo *TLI;
  /// Alias analysis.
  AliasAnalysis *AA;
  /// Parent function.
  Function *TheFunction;
  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  //  ---  vectorization state --- //

  /// Holds the integer induction variable. This is the counter of the
  /// loop.
  PHINode *Induction;
  /// Holds the reduction variables.
  ReductionList Reductions;
  /// Holds all of the induction variables that we found in the loop.
  /// Notice that inductions don't need to start at zero and that induction
  /// variables can be pointers.
  InductionList Inductions;
  /// Holds the widest induction type encountered.
  Type *WidestIndTy;

  /// Allowed outside users. This holds the reduction
  /// vars which can be accessed from outside the loop.
  SmallPtrSet<Value*, 4> AllowedExit;
  /// This set holds the variables which are known to be uniform after
  /// vectorization.
  SmallPtrSet<Instruction*, 4> Uniforms;
  /// We need to check that all of the pointers in this list are disjoint
  /// at runtime.
  RuntimePointerCheck PtrRtCheck;
  /// Can we assume the absence of NaNs.
  bool HasFunNoNaNAttr;

  unsigned MaxSafeDepDistBytes;

  ValueToValueMap Strides;
  SmallPtrSet<Value *, 8> StrideSet;

  /// While vectorizing these instructions we have to generate a
  /// call to the appropriate masked intrinsic.
  SmallPtrSet<const Instruction*, 8> MaskedOp;
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const DataLayout *DL, const TargetLibraryInfo *TLI,
                             AssumptionTracker *AT, const Function *F,
                             const LoopVectorizeHints *Hints)
      : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI),
        TheFunction(F), Hints(Hints) {
    CodeMetrics::collectEphemeralValues(L, AT, EphValues);
  }

  /// Information about vectorization costs.
  struct VectorizationFactor {
    unsigned Width; // Vector width with best cost.
    unsigned Cost; // Cost of the loop with that width.
  };
  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to VF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(bool OptForSize);

  /// \return The size (in bits) of the widest type in the code that
  /// needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  unsigned getWidestType();

  /// \return The most profitable unroll factor.
  /// If UserUF is non-zero then this method finds the best unroll-factor
  /// based on register pressure and other parameters.
  /// VF and LoopCost are the selected vectorization factor and the cost of the
  /// selected VF.
  unsigned selectUnrollFactor(bool OptForSize, unsigned VF, unsigned LoopCost);

  /// \brief A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    unsigned LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    unsigned MaxLocalUsers;
    /// Holds the number of instructions in the loop.
    unsigned NumInstructions;
  };

  /// \return information about the register usage of the loop.
  RegisterUsage calculateRegisterUsage();

private:
  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  unsigned expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  unsigned getInstructionCost(Instruction *I, unsigned VF);

  /// A helper function for converting Scalar types to vector types.
  /// If the incoming type is void, we return void. If the VF is 1, we return
  /// the scalar type.
  static Type* ToVectorTy(Type *Scalar, unsigned VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Report an analysis message to assist the user in diagnosing loops that
  /// are not vectorized.
  void emitAnalysis(Report &Message) {
    DebugLoc DL = TheLoop->getStartLoc();
    if (Instruction *I = Message.getInstr())
      DL = I->getDebugLoc();
    emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE,
                                   *TheFunction, DL, Message.str());
  }

  /// Values used only by @llvm.assume calls.
  SmallPtrSet<const Value *, 32> EphValues;

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// Loop Info analysis.
  LoopInfo *LI;
  /// Vectorization legality.
  LoopVectorizationLegality *Legal;
  /// Vector target information.
  const TargetTransformInfo &TTI;
  /// Target data layout information.
  const DataLayout *DL;
  /// Target Library Info.
  const TargetLibraryInfo *TLI;
  const Function *TheFunction;
  // Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;
};

/// Utility class for getting and setting loop vectorizer hints in the form
/// of loop metadata.
/// This class keeps a number of loop annotations locally (as member variables)
/// and can, upon request, write them back as metadata on the loop. It will
/// initially scan the loop for existing metadata, and will update the local
/// values based on information in the loop.
/// We cannot write all values to metadata, as the mere presence of some info,
/// for example 'force', means a decision has been made. So, we need to be
/// careful NOT to add them if the user hasn't specifically asked for it.
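///
/// For example (a sketch), a loop annotated with width and interleave hints
/// carries metadata of this form on its latch branch:
///   br i1 %cond, label %loop, label %exit, !llvm.loop !0
///   !0 = !{!0, !1, !2}
///   !1 = !{!"llvm.loop.vectorize.width", i32 4}
///   !2 = !{!"llvm.loop.interleave.count", i32 2}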
01016 class LoopVectorizeHints {
01017   enum HintKind {
01018     HK_WIDTH,
01019     HK_UNROLL,
01020     HK_FORCE
01021   };
01022 
01023   /// Hint - associates name and validation with the hint value.
01024   struct Hint {
01025     const char * Name;
01026     unsigned Value; // This may have to change for non-numeric values.
01027     HintKind Kind;
01028 
01029     Hint(const char * Name, unsigned Value, HintKind Kind)
01030       : Name(Name), Value(Value), Kind(Kind) { }
01031 
01032     bool validate(unsigned Val) {
01033       switch (Kind) {
01034       case HK_WIDTH:
01035         return isPowerOf2_32(Val) && Val <= MaxVectorWidth;
01036       case HK_UNROLL:
01037         return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
01038       case HK_FORCE:
01039         return (Val <= 1);
01040       }
01041       return false;
01042     }
01043   };
01044 
01045   /// Vectorization width.
01046   Hint Width;
01047   /// Vectorization interleave factor.
01048   Hint Interleave;
01049   /// Vectorization forced
01050   Hint Force;
01051 
01052   /// Return the loop metadata prefix.
01053   static StringRef Prefix() { return "llvm.loop."; }
01054 
01055 public:
01056   enum ForceKind {
01057     FK_Undefined = -1, ///< Not selected.
01058     FK_Disabled = 0,   ///< Forcing disabled.
01059     FK_Enabled = 1,    ///< Forcing enabled.
01060   };
01061 
01062   LoopVectorizeHints(const Loop *L, bool DisableInterleaving)
01063       : Width("vectorize.width", VectorizationFactor, HK_WIDTH),
01064         Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
01065         Force("vectorize.enable", FK_Undefined, HK_FORCE),
01066         TheLoop(L) {
01067     // Populate values with existing loop metadata.
01068     getHintsFromMetadata();
01069 
01070     // force-vector-interleave overrides DisableInterleaving.
01071     if (VectorizationInterleave.getNumOccurrences() > 0)
01072       Interleave.Value = VectorizationInterleave;
01073 
01074     DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
01075           << "LV: Interleaving disabled by the pass manager\n");
01076   }
01077 
01078   /// Mark the loop L as already vectorized by setting the width to 1.
01079   void setAlreadyVectorized() {
01080     Width.Value = Interleave.Value = 1;
01081     Hint Hints[] = {Width, Interleave};
01082     writeHintsToMetadata(Hints);
01083   }
01084 
01085   /// Dumps all the hint information.
01086   std::string emitRemark() const {
01087     Report R;
01088     if (Force.Value == LoopVectorizeHints::FK_Disabled)
01089       R << "vectorization is explicitly disabled";
01090     else {
01091       R << "use -Rpass-analysis=loop-vectorize for more info";
01092       if (Force.Value == LoopVectorizeHints::FK_Enabled) {
01093         R << " (Force=true";
01094         if (Width.Value != 0)
01095           R << ", Vector Width=" << Width.Value;
01096         if (Interleave.Value != 0)
01097           R << ", Interleave Count=" << Interleave.Value;
01098         R << ")";
01099       }
01100     }
01101 
01102     return R.str();
01103   }
01104 
01105   unsigned getWidth() const { return Width.Value; }
01106   unsigned getInterleave() const { return Interleave.Value; }
01107   enum ForceKind getForce() const { return (ForceKind)Force.Value; }
01108 
01109 private:
01110   /// Find hints specified in the loop metadata and update local values.
01111   void getHintsFromMetadata() {
01112     MDNode *LoopID = TheLoop->getLoopID();
01113     if (!LoopID)
01114       return;
01115 
01116     // First operand should refer to the loop id itself.
01117     assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
01118     assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
01119 
01120     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
01121       const MDString *S = nullptr;
01122       SmallVector<Metadata *, 4> Args;
01123 
01124       // The expected hint is either a MDString or a MDNode with the first
01125       // operand a MDString.
01126       if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
01127         if (!MD || MD->getNumOperands() == 0)
01128           continue;
01129         S = dyn_cast<MDString>(MD->getOperand(0));
01130         for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
01131           Args.push_back(MD->getOperand(i));
01132       } else {
01133         S = dyn_cast<MDString>(LoopID->getOperand(i));
01134         assert(Args.size() == 0 && "too many arguments for MDString");
01135       }
01136 
01137       if (!S)
01138         continue;
01139 
01140       // Check if the hint starts with the loop metadata prefix.
01141       StringRef Name = S->getString();
01142       if (Args.size() == 1)
01143         setHint(Name, Args[0]);
01144     }
01145   }
01146 
01147   /// Checks string hint with one operand and set value if valid.
01148   void setHint(StringRef Name, Metadata *Arg) {
01149     if (!Name.startswith(Prefix()))
01150       return;
01151     Name = Name.substr(Prefix().size(), StringRef::npos);
01152 
01153     const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
01154     if (!C) return;
01155     unsigned Val = C->getZExtValue();
01156 
01157     Hint *Hints[] = {&Width, &Interleave, &Force};
01158     for (auto H : Hints) {
01159       if (Name == H->Name) {
01160         if (H->validate(Val))
01161           H->Value = Val;
01162         else
01163           DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
01164         break;
01165       }
01166     }
01167   }
01168 
01169   /// Create a new hint from name / value pair.
01170   MDNode *createHintMetadata(StringRef Name, unsigned V) const {
01171     LLVMContext &Context = TheLoop->getHeader()->getContext();
01172     Metadata *MDs[] = {MDString::get(Context, Name),
01173                        ConstantAsMetadata::get(
01174                            ConstantInt::get(Type::getInt32Ty(Context), V))};
01175     return MDNode::get(Context, MDs);
01176   }
01177 
01178   /// Matches metadata with hint name.
01179   bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
01180     MDString* Name = dyn_cast<MDString>(Node->getOperand(0));
01181     if (!Name)
01182       return false;
01183 
01184     for (auto H : HintTypes)
01185       if (Name->getString().endswith(H.Name))
01186         return true;
01187     return false;
01188   }
01189 
01190   /// Sets current hints into loop metadata, keeping other values intact.
01191   void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
01192     if (HintTypes.size() == 0)
01193       return;
01194 
01195     // Reserve the first element to LoopID (see below).
01196     SmallVector<Metadata *, 4> MDs(1);
01197     // If the loop already has metadata, then ignore the existing operands.
01198     MDNode *LoopID = TheLoop->getLoopID();
01199     if (LoopID) {
01200       for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
01201         MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
01202         // If node in update list, ignore old value.
01203         if (!matchesHintMetadataName(Node, HintTypes))
01204           MDs.push_back(Node);
01205       }
01206     }
01207 
01208     // Now, add the missing hints.
01209     for (auto H : HintTypes)
01210       MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
01211 
01212     // Replace current metadata node with new one.
01213     LLVMContext &Context = TheLoop->getHeader()->getContext();
01214     MDNode *NewLoopID = MDNode::get(Context, MDs);
01215     // Set operand 0 to refer to the loop id itself.
01216     NewLoopID->replaceOperandWith(0, NewLoopID);
01217 
01218     TheLoop->setLoopID(NewLoopID);
01219     LoopID = NewLoopID;
01220   }
01221 
01222   /// The loop these hints belong to.
01223   const Loop *TheLoop;
01224 };
01225 
01226 static void emitMissedWarning(Function *F, Loop *L,
01227                               const LoopVectorizeHints &LH) {
01228   emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F,
01229                                L->getStartLoc(), LH.emitRemark());
01230 
01231   if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
01232     if (LH.getWidth() != 1)
01233       emitLoopVectorizeWarning(
01234           F->getContext(), *F, L->getStartLoc(),
01235           "failed explicitly specified loop vectorization");
01236     else if (LH.getInterleave() != 1)
01237       emitLoopInterleaveWarning(
01238           F->getContext(), *F, L->getStartLoc(),
01239           "failed explicitly specified loop interleaving");
01240   }
01241 }
01242 
01243 static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
01244   if (L.empty())
01245     return V.push_back(&L);
01246 
01247   for (Loop *InnerL : L)
01248     addInnerLoop(*InnerL, V);
01249 }
01250 
01251 /// The LoopVectorize Pass.
01252 struct LoopVectorize : public FunctionPass {
01253   /// Pass identification, replacement for typeid
01254   static char ID;
01255 
01256   explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
01257     : FunctionPass(ID),
01258       DisableUnrolling(NoUnrolling),
01259       AlwaysVectorize(AlwaysVectorize) {
01260     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
01261   }
01262 
01263   ScalarEvolution *SE;
01264   const DataLayout *DL;
01265   LoopInfo *LI;
01266   TargetTransformInfo *TTI;
01267   DominatorTree *DT;
01268   BlockFrequencyInfo *BFI;
01269   TargetLibraryInfo *TLI;
01270   AliasAnalysis *AA;
01271   AssumptionTracker *AT;
01272   bool DisableUnrolling;
01273   bool AlwaysVectorize;
01274 
01275   BlockFrequency ColdEntryFreq;
01276 
01277   bool runOnFunction(Function &F) override {
01278     SE = &getAnalysis<ScalarEvolution>();
01279     DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
01280     DL = DLP ? &DLP->getDataLayout() : nullptr;
01281     LI = &getAnalysis<LoopInfo>();
01282     TTI = &getAnalysis<TargetTransformInfo>();
01283     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
01284     BFI = &getAnalysis<BlockFrequencyInfo>();
01285     TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
01286     AA = &getAnalysis<AliasAnalysis>();
01287     AT = &getAnalysis<AssumptionTracker>();
01288 
01289     // Compute some weights once, outside of the loop over the loops, using a
01290     // BranchProbability to re-use its scaling math.
01291     const BranchProbability ColdProb(1, 5); // 20%
01292     ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
01293 
01294     // If the target claims to have no vector registers don't attempt
01295     // vectorization.
01296     if (!TTI->getNumberOfRegisters(true))
01297       return false;
01298 
01299     if (!DL) {
01300       DEBUG(dbgs() << "\nLV: Not vectorizing " << F.getName()
01301                    << ": Missing data layout\n");
01302       return false;
01303     }
01304 
01305     // Build up a worklist of inner-loops to vectorize. This is necessary as
01306     // the act of vectorizing or partially unrolling a loop creates new loops
01307     // and can invalidate iterators across the loops.
01308     SmallVector<Loop *, 8> Worklist;
01309 
01310     for (Loop *L : *LI)
01311       addInnerLoop(*L, Worklist);
01312 
01313     LoopsAnalyzed += Worklist.size();
01314 
01315     // Now walk the identified inner loops.
01316     bool Changed = false;
01317     while (!Worklist.empty())
01318       Changed |= processLoop(Worklist.pop_back_val());
01319 
01320     // Return true if any loop was modified.
01321     return Changed;
01322   }
01323 
01324   bool processLoop(Loop *L) {
01325     assert(L->empty() && "Only process inner loops.");
01326 
01327 #ifndef NDEBUG
01328     const std::string DebugLocStr = getDebugLocString(L);
01329 #endif /* NDEBUG */
01330 
01331     DEBUG(dbgs() << "\nLV: Checking a loop in \""
01332                  << L->getHeader()->getParent()->getName() << "\" from "
01333                  << DebugLocStr << "\n");
01334 
01335     LoopVectorizeHints Hints(L, DisableUnrolling);
01336 
01337     DEBUG(dbgs() << "LV: Loop hints:"
01338                  << " force="
01339                  << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
01340                          ? "disabled"
01341                          : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
01342                                 ? "enabled"
01343                                 : "?")) << " width=" << Hints.getWidth()
01344                  << " unroll=" << Hints.getInterleave() << "\n");
01345 
01346     // Function containing loop
01347     Function *F = L->getHeader()->getParent();
01348 
01349     // Looking at the diagnostic output is the only way to determine if a loop
01350     // was vectorized (other than looking at the IR or machine code), so it
01351     // is important to generate an optimization remark for each loop. Most of
01352     // these messages are generated by emitOptimizationRemarkAnalysis. Remarks
01353     // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are
01354     // less verbose; they report vectorized loops and unvectorized loops that
01355     // may benefit from vectorization, respectively.
01356 
01357     if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) {
01358       DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
01359       emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
01360                                      L->getStartLoc(), Hints.emitRemark());
01361       return false;
01362     }
01363 
01364     if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) {
01365       DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
01366       emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
01367                                      L->getStartLoc(), Hints.emitRemark());
01368       return false;
01369     }
01370 
01371     if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) {
01372       DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
01373       emitOptimizationRemarkAnalysis(
01374           F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
01375           "loop not vectorized: vector width and interleave count are "
01376           "explicitly set to 1");
01377       return false;
01378     }
01379 
01380     // Check the loop for a trip count threshold:
01381     // do not vectorize loops with a tiny trip count.
01382     const unsigned TC = SE->getSmallConstantTripCount(L);
01383     if (TC > 0u && TC < TinyTripCountVectorThreshold) {
01384       DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
01385                    << "This loop is not worth vectorizing.");
01386       if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
01387         DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
01388       else {
01389         DEBUG(dbgs() << "\n");
01390         emitOptimizationRemarkAnalysis(
01391             F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
01392             "vectorization is not beneficial and is not explicitly forced");
01393         return false;
01394       }
01395     }
01396 
01397     // Check if it is legal to vectorize the loop.
01398     LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F, TTI);
01399     if (!LVL.canVectorize()) {
01400       DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
01401       emitMissedWarning(F, L, Hints);
01402       return false;
01403     }
01404 
01405     // Use the cost model.
01406     LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, AT, F,
01407                                   &Hints);
01408 
01409     // Check the function attributes to find out if this function should be
01410     // optimized for size.
01411     bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
01412                       F->hasFnAttribute(Attribute::OptimizeForSize);
01413 
01414     // Compute the weighted frequency of this loop being executed and see if it
01415     // is less than 20% of the function entry baseline frequency. Note that we
01416     // always have a canonical loop here because we think we *can* vectorize.
01417     // FIXME: This is hidden behind a flag due to pervasive problems with
01418     // exactly what block frequency models.
01419     if (LoopVectorizeWithBlockFrequency) {
01420       BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
01421       if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
01422           LoopEntryFreq < ColdEntryFreq)
01423         OptForSize = true;
01424     }
01425 
01426     // Check the function attributes to see if implicit floats are allowed.
01427     // FIXME: This check doesn't seem possibly correct -- what if the loop is
01428     // an integer loop and the vector instructions selected are purely integer
01429     // vector instructions?
01430     if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
01431       DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
01432             "attribute is used.\n");
01433       emitOptimizationRemarkAnalysis(
01434           F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
01435           "loop not vectorized due to NoImplicitFloat attribute");
01436       emitMissedWarning(F, L, Hints);
01437       return false;
01438     }
01439 
01440     // Select the optimal vectorization factor.
01441     const LoopVectorizationCostModel::VectorizationFactor VF =
01442         CM.selectVectorizationFactor(OptForSize);
01443 
01444     // Select the unroll factor.
01445     const unsigned UF =
01446         CM.selectUnrollFactor(OptForSize, VF.Width, VF.Cost);
01447 
01448     DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
01449                  << DebugLocStr << '\n');
01450     DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');
01451 
01452     if (VF.Width == 1) {
01453       DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n");
01454 
01455       if (UF == 1) {
01456         emitOptimizationRemarkAnalysis(
01457             F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
01458             "not beneficial to vectorize and user disabled interleaving");
01459         return false;
01460       }
01461       DEBUG(dbgs() << "LV: Trying to at least unroll the loop.\n");
01462 
01463       // Report the unrolling decision.
01464       emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
01465                              Twine("unrolled with interleaving factor " +
01466                                    Twine(UF) +
01467                                    " (vectorization not beneficial)"));
01468 
01469       // We decided not to vectorize, but we may want to unroll.
01470 
01471       InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
01472       Unroller.vectorize(&LVL);
01473     } else {
01474       // If we decided that it is *legal* to vectorize the loop then do it.
01475       InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
01476       LB.vectorize(&LVL);
01477       ++LoopsVectorized;
01478 
01479       // Report the vectorization decision.
01480       emitOptimizationRemark(
01481           F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
01482           Twine("vectorized loop (vectorization factor: ") + Twine(VF.Width) +
01483               ", unrolling interleave factor: " + Twine(UF) + ")");
01484     }
01485 
01486     // Mark the loop as already vectorized to avoid vectorizing again.
01487     Hints.setAlreadyVectorized();
01488 
01489     DEBUG(verifyFunction(*L->getHeader()->getParent()));
01490     return true;
01491   }
01492 
01493   void getAnalysisUsage(AnalysisUsage &AU) const override {
01494     AU.addRequired<AssumptionTracker>();
01495     AU.addRequiredID(LoopSimplifyID);
01496     AU.addRequiredID(LCSSAID);
01497     AU.addRequired<BlockFrequencyInfo>();
01498     AU.addRequired<DominatorTreeWrapperPass>();
01499     AU.addRequired<LoopInfo>();
01500     AU.addRequired<ScalarEvolution>();
01501     AU.addRequired<TargetTransformInfo>();
01502     AU.addRequired<AliasAnalysis>();
01503     AU.addPreserved<LoopInfo>();
01504     AU.addPreserved<DominatorTreeWrapperPass>();
01505     AU.addPreserved<AliasAnalysis>();
01506   }
01507 
01508 };
01509 
01510 } // end anonymous namespace
01511 
01512 //===----------------------------------------------------------------------===//
01513 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
01514 // LoopVectorizationCostModel.
01515 //===----------------------------------------------------------------------===//
01516 
01517 static Value *stripIntegerCast(Value *V) {
01518   if (CastInst *CI = dyn_cast<CastInst>(V))
01519     if (CI->getOperand(0)->getType()->isIntegerTy())
01520       return CI->getOperand(0);
01521   return V;
01522 }
01523 
01524 ///\brief Replaces the symbolic stride in a pointer SCEV expression by one.
01525 ///
01526 /// If \p OrigPtr is not null, use it to look up the stride value instead of
01527 /// \p Ptr.
01528 static const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE,
01529                                              ValueToValueMap &PtrToStride,
01530                                              Value *Ptr, Value *OrigPtr = nullptr) {
01531 
01532   const SCEV *OrigSCEV = SE->getSCEV(Ptr);
01533 
01534   // If there is an entry in the map return the SCEV of the pointer with the
01535   // symbolic stride replaced by one.
01536   ValueToValueMap::iterator SI = PtrToStride.find(OrigPtr ? OrigPtr : Ptr);
01537   if (SI != PtrToStride.end()) {
01538     Value *StrideVal = SI->second;
01539 
01540     // Strip casts.
01541     StrideVal = stripIntegerCast(StrideVal);
01542 
01543     // Replace symbolic stride by one.
01544     Value *One = ConstantInt::get(StrideVal->getType(), 1);
01545     ValueToValueMap RewriteMap;
01546     RewriteMap[StrideVal] = One;
01547 
01548     const SCEV *ByOne =
01549         SCEVParameterRewriter::rewrite(OrigSCEV, *SE, RewriteMap, true);
01550     DEBUG(dbgs() << "LV: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOne
01551                  << "\n");
01552     return ByOne;
01553   }
01554 
01555   // Otherwise, just return the SCEV of the original pointer.
01556   return SE->getSCEV(Ptr);
01557 }
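
// Illustrative sketch (not part of the original source): for an access such
// as A[i * Stride], whose pointer SCEV is {%A,+,(4 * %Stride)}, and a
// PtrToStride entry mapping the pointer to %Stride, the rewrite above yields
// {%A,+,4}, i.e. the same recurrence with the symbolic stride replaced by one.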
01558 
01559 void LoopVectorizationLegality::RuntimePointerCheck::insert(
01560     ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId,
01561     unsigned ASId, ValueToValueMap &Strides) {
01562   // Get the stride replaced scev.
01563   const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
01564   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
01565   assert(AR && "Invalid addrec expression");
01566   const SCEV *Ex = SE->getBackedgeTakenCount(Lp);
01567   const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
01568   Pointers.push_back(Ptr);
01569   Starts.push_back(AR->getStart());
01570   Ends.push_back(ScEnd);
01571   IsWritePtr.push_back(WritePtr);
01572   DependencySetId.push_back(DepSetId);
01573   AliasSetId.push_back(ASId);
01574 }
01575 
01576 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
01577   // We need to place the broadcast of invariant variables outside the loop.
01578   Instruction *Instr = dyn_cast<Instruction>(V);
01579   bool NewInstr =
01580       (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(),
01581                           Instr->getParent()) != LoopVectorBody.end());
01582   bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
01583 
01584   // Place the code for broadcasting invariant variables in the new preheader.
01585   IRBuilder<>::InsertPointGuard Guard(Builder);
01586   if (Invariant)
01587     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
01588 
01589   // Broadcast the scalar into all locations in the vector.
01590   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
01591 
01592   return Shuf;
01593 }
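
// Illustrative sketch (not from the original source): for VF = 4 and an
// invariant i32 %x, the CreateVectorSplat call above expands to an
// insertelement followed by a zero-mask shufflevector:
//
//   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> undef, <4 x i32> zeroinitializer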
01594 
01595 Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx,
01596                                                  bool Negate) {
01597   assert(Val->getType()->isVectorTy() && "Must be a vector");
01598   assert(Val->getType()->getScalarType()->isIntegerTy() &&
01599          "Elem must be an integer");
01600   // Create the types.
01601   Type *ITy = Val->getType()->getScalarType();
01602   VectorType *Ty = cast<VectorType>(Val->getType());
01603   int VLen = Ty->getNumElements();
01604   SmallVector<Constant*, 8> Indices;
01605 
01606   // Create a vector of VLen consecutive indices starting at StartIdx.
01607   for (int i = 0; i < VLen; ++i) {
01608     int64_t Idx = Negate ? (-i) : i;
01609     Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate));
01610   }
01611 
01612   // Add the consecutive indices to the vector value.
01613   Constant *Cv = ConstantVector::get(Indices);
01614   assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
01615   return Builder.CreateAdd(Val, Cv, "induction");
01616 }
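
// Illustrative example (not from the original source): for VF = 4,
// StartIdx = 0 and Negate = false, the constant vector added above is
// <0, 1, 2, 3>; with Negate = true it is <0, -1, -2, -3>, producing a
// reversed induction sequence.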
01617 
01618 /// \brief Find the operand of the GEP that should be checked for consecutive
01619 /// stores. This ignores trailing indices that have no effect on the final
01620 /// pointer.
01621 static unsigned getGEPInductionOperand(const DataLayout *DL,
01622                                        const GetElementPtrInst *Gep) {
01623   unsigned LastOperand = Gep->getNumOperands() - 1;
01624   unsigned GEPAllocSize = DL->getTypeAllocSize(
01625       cast<PointerType>(Gep->getType()->getScalarType())->getElementType());
01626 
01627   // Walk backwards and try to peel off zeros.
01628   while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
01629     // Find the type we're currently indexing into.
01630     gep_type_iterator GEPTI = gep_type_begin(Gep);
01631     std::advance(GEPTI, LastOperand - 1);
01632 
01633     // If it's a type with the same allocation size as the result of the GEP we
01634     // can peel off the zero index.
01635     if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize)
01636       break;
01637     --LastOperand;
01638   }
01639 
01640   return LastOperand;
01641 }
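
// Illustrative example (not from the original source): given
//
//   %gep = getelementptr inbounds [1 x i32]* %base, i64 %iv, i64 0
//
// the trailing zero index does not change the final pointer ([1 x i32] and
// i32 have the same allocation size), so it is peeled off and operand 1,
// %iv, is reported as the induction operand.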
01642 
01643 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
01644   assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
01645   // Make sure that the pointer does not point to structs.
01646   if (Ptr->getType()->getPointerElementType()->isAggregateType())
01647     return 0;
01648 
01649   // If this value is a pointer induction variable we know it is consecutive.
01650   PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
01651   if (Phi && Inductions.count(Phi)) {
01652     InductionInfo II = Inductions[Phi];
01653     if (IK_PtrInduction == II.IK)
01654       return 1;
01655     else if (IK_ReversePtrInduction == II.IK)
01656       return -1;
01657   }
01658 
01659   GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
01660   if (!Gep)
01661     return 0;
01662 
01663   unsigned NumOperands = Gep->getNumOperands();
01664   Value *GpPtr = Gep->getPointerOperand();
01665   // If this GEP value is a consecutive pointer induction variable and all of
01666   // the indices are loop invariant then we know it is consecutive.
01667   Phi = dyn_cast<PHINode>(GpPtr);
01668   if (Phi && Inductions.count(Phi)) {
01669 
01670     // Make sure that the pointer does not point to structs.
01671     PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
01672     if (GepPtrType->getElementType()->isAggregateType())
01673       return 0;
01674 
01675     // Make sure that all of the index operands are loop invariant.
01676     for (unsigned i = 1; i < NumOperands; ++i)
01677       if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
01678         return 0;
01679 
01680     InductionInfo II = Inductions[Phi];
01681     if (IK_PtrInduction == II.IK)
01682       return 1;
01683     else if (IK_ReversePtrInduction == II.IK)
01684       return -1;
01685   }
01686 
01687   unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
01688 
01689   // Check that all of the gep indices are uniform except for our induction
01690   // operand.
01691   for (unsigned i = 0; i != NumOperands; ++i)
01692     if (i != InductionOperand &&
01693         !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
01694       return 0;
01695 
01696   // We can emit wide load/stores only if the last non-zero index is the
01697   // induction variable.
01698   const SCEV *Last = nullptr;
01699   if (!Strides.count(Gep))
01700     Last = SE->getSCEV(Gep->getOperand(InductionOperand));
01701   else {
01702     // Because of the multiplication by a stride we can have a s/zext cast.
01703     // We are going to replace this stride by 1 so the cast is safe to ignore.
01704     //
01705     //  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
01706     //  %0 = trunc i64 %indvars.iv to i32
01707     //  %mul = mul i32 %0, %Stride1
01708     //  %idxprom = zext i32 %mul to i64  << Safe cast.
01709     //  %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom
01710     //
01711     Last = replaceSymbolicStrideSCEV(SE, Strides,
01712                                      Gep->getOperand(InductionOperand), Gep);
01713     if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last))
01714       Last =
01715           (C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend)
01716               ? C->getOperand()
01717               : Last;
01718   }
01719   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
01720     const SCEV *Step = AR->getStepRecurrence(*SE);
01721 
01722     // The memory is consecutive because the last index is consecutive
01723     // and all other indices are loop invariant.
01724     if (Step->isOne())
01725       return 1;
01726     if (Step->isAllOnesValue())
01727       return -1;
01728   }
01729 
01730   return 0;
01731 }
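
// Illustrative summary (not part of the original source) of the contract
// above, for a canonical induction variable i:
//
//   &A[i]     -> returns  1 (consecutive, forward)
//   &A[n - i] -> returns -1 (consecutive, reverse)
//   &A[2 * i] -> returns  0 (strided, not consecutive)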
01732 
01733 bool LoopVectorizationLegality::isUniform(Value *V) {
01734   return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
01735 }
01736 
01737 InnerLoopVectorizer::VectorParts&
01738 InnerLoopVectorizer::getVectorValue(Value *V) {
01739   assert(V != Induction && "The new induction variable should not be used.");
01740   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
01741 
01742   // If we have a stride that is replaced by one, do it here.
01743   if (Legal->hasStride(V))
01744     V = ConstantInt::get(V->getType(), 1);
01745 
01746   // If we have this scalar in the map, return it.
01747   if (WidenMap.has(V))
01748     return WidenMap.get(V);
01749 
01750   // If this scalar is unknown, assume that it is a constant or that it is
01751   // loop invariant. Broadcast V and save the value for future uses.
01752   Value *B = getBroadcastInstrs(V);
01753   return WidenMap.splat(V, B);
01754 }
01755 
01756 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
01757   assert(Vec->getType()->isVectorTy() && "Invalid type");
01758   SmallVector<Constant*, 8> ShuffleMask;
01759   for (unsigned i = 0; i < VF; ++i)
01760     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
01761 
01762   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
01763                                      ConstantVector::get(ShuffleMask),
01764                                      "reverse");
01765 }
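
// Illustrative sketch (not from the original source): for VF = 4 the mask
// built above is <3, 2, 1, 0>, so an input <a, b, c, d> becomes <d, c, b, a>:
//
//   %reverse = shufflevector <4 x i32> %vec, <4 x i32> undef,
//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>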
01766 
01767 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
01768   // Attempt to issue a wide load.
01769   LoadInst *LI = dyn_cast<LoadInst>(Instr);
01770   StoreInst *SI = dyn_cast<StoreInst>(Instr);
01771 
01772   assert((LI || SI) && "Invalid Load/Store instruction");
01773 
01774   Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
01775   Type *DataTy = VectorType::get(ScalarDataTy, VF);
01776   Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
01777   unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
01778   // An alignment of 0 means target ABI alignment. We need to use the scalar's
01779   // target ABI alignment in such a case.
01780   if (!Alignment)
01781     Alignment = DL->getABITypeAlignment(ScalarDataTy);
01782   unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
01783   unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
01784   unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
01785 
01786   if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
01787       !Legal->isMaskRequired(SI))
01788     return scalarizeInstruction(Instr, true);
01789 
01790   if (ScalarAllocatedSize != VectorElementSize)
01791     return scalarizeInstruction(Instr);
01792 
01793   // If the pointer is loop invariant or if it is non-consecutive,
01794   // scalarize the access.
01795   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
01796   bool Reverse = ConsecutiveStride < 0;
01797   bool UniformLoad = LI && Legal->isUniform(Ptr);
01798   if (!ConsecutiveStride || UniformLoad)
01799     return scalarizeInstruction(Instr);
01800 
01801   Constant *Zero = Builder.getInt32(0);
01802   VectorParts &Entry = WidenMap.get(Instr);
01803 
01804   // Handle consecutive loads/stores.
01805   GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
01806   if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
01807     setDebugLocFromInst(Builder, Gep);
01808     Value *PtrOperand = Gep->getPointerOperand();
01809     Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
01810     FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
01811 
01812     // Create the new GEP with the new induction variable.
01813     GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
01814     Gep2->setOperand(0, FirstBasePtr);
01815     Gep2->setName("gep.indvar.base");
01816     Ptr = Builder.Insert(Gep2);
01817   } else if (Gep) {
01818     setDebugLocFromInst(Builder, Gep);
01819     assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
01820                                OrigLoop) && "Base ptr must be invariant");
01821 
01822     // The last index does not have to be the induction variable. It can be
01823     // consecutive and a function of the index, for example A[i + 1].
01824     unsigned NumOperands = Gep->getNumOperands();
01825     unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
01826     // Create the new GEP with the new induction variable.
01827     GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
01828 
01829     for (unsigned i = 0; i < NumOperands; ++i) {
01830       Value *GepOperand = Gep->getOperand(i);
01831       Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);
01832 
01833       // Update the last index or any loop-invariant instruction in the loop.
01834       if (i == InductionOperand ||
01835           (GepOperandInst && OrigLoop->contains(GepOperandInst))) {
01836         assert((i == InductionOperand ||
01837                SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&
01838                "Must be last index or loop invariant");
01839 
01840         VectorParts &GEPParts = getVectorValue(GepOperand);
01841         Value *Index = GEPParts[0];
01842         Index = Builder.CreateExtractElement(Index, Zero);
01843         Gep2->setOperand(i, Index);
01844         Gep2->setName("gep.indvar.idx");
01845       }
01846     }
01847     Ptr = Builder.Insert(Gep2);
01848   } else {
01849     // Use the induction element ptr.
01850     assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
01851     setDebugLocFromInst(Builder, Ptr);
01852     VectorParts &PtrVal = getVectorValue(Ptr);
01853     Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
01854   }
01855 
01856   // Handle Stores:
01857   if (SI) {
01858     assert(!Legal->isUniform(SI->getPointerOperand()) &&
01859            "We do not allow storing to uniform addresses");
01860     setDebugLocFromInst(Builder, SI);
01861     // We don't want to update the value in the map as it might be used in
01862     // another expression. So don't use a reference type for "StoredVal".
01863     VectorParts StoredVal = getVectorValue(SI->getValueOperand());
01864 
01865     for (unsigned Part = 0; Part < UF; ++Part) {
01866       // Calculate the pointer for the specific unroll-part.
01867       Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
01868 
01869       if (Reverse) {
01870         // If we store to reverse consecutive memory locations then we need
01871         // to reverse the order of elements in the stored value.
01872         StoredVal[Part] = reverseVector(StoredVal[Part]);
01873         // If the address is consecutive but reversed, then the
01874         // wide store needs to start at the last vector element.
01875         PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
01876         PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
01877       }
01878 
01879       Value *VecPtr = Builder.CreateBitCast(PartPtr,
01880                                             DataTy->getPointerTo(AddressSpace));
01881 
01882       Instruction *NewSI;
01883       if (Legal->isMaskRequired(SI)) {
01884         Type *I8PtrTy =
01885             Builder.getInt8PtrTy(PartPtr->getType()->getPointerAddressSpace());
01886 
01887         Value *I8Ptr = Builder.CreateBitCast(PartPtr, I8PtrTy);
01888 
01889         VectorParts Cond = createBlockInMask(SI->getParent());
01890         SmallVector<Value *, 8> Ops;
01891         Ops.push_back(I8Ptr);
01892         Ops.push_back(StoredVal[Part]);
01893         Ops.push_back(Builder.getInt32(Alignment));
01894         Ops.push_back(Cond[Part]);
01895         NewSI = Builder.CreateMaskedStore(Ops);
01896       }
01897       else
01898         NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
01899       propagateMetadata(NewSI, SI);
01900     }
01901     return;
01902   }
01903 
01904   // Handle loads.
01905   assert(LI && "Must have a load instruction");
01906   setDebugLocFromInst(Builder, LI);
01907   for (unsigned Part = 0; Part < UF; ++Part) {
01908     // Calculate the pointer for the specific unroll-part.
01909     Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
01910 
01911     if (Reverse) {
01912       // If the address is consecutive but reversed, then the
01913       // wide load needs to start at the last vector element.
01914       PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
01915       PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
01916     }
01917 
01918     Instruction* NewLI;
01919     if (Legal->isMaskRequired(LI)) {
01920       Type *I8PtrTy =
01921         Builder.getInt8PtrTy(PartPtr->getType()->getPointerAddressSpace());
01922 
01923       Value *I8Ptr = Builder.CreateBitCast(PartPtr, I8PtrTy);
01924 
01925       VectorParts SrcMask = createBlockInMask(LI->getParent());
01926       SmallVector<Value *, 8> Ops;
01927       Ops.push_back(I8Ptr);
01928       Ops.push_back(UndefValue::get(DataTy));
01929       Ops.push_back(Builder.getInt32(Alignment));
01930       Ops.push_back(SrcMask[Part]);
01931       NewLI = Builder.CreateMaskedLoad(Ops);
01932     }
01933     else {
01934       Value *VecPtr = Builder.CreateBitCast(PartPtr,
01935                                             DataTy->getPointerTo(AddressSpace));
01936       NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
01937     }
01938     propagateMetadata(NewLI, LI);
01939     Entry[Part] = Reverse ? reverseVector(NewLI) :  NewLI;
01940   }
01941 }
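
// Illustrative sketch (not part of the original source): for a consecutive
// float load with VF = 4 and UF = 1, the code above emits roughly
//
//   %vec.ptr   = bitcast float* %ptr to <4 x float>*
//   %wide.load = load <4 x float>* %vec.ptr, align 4
//
// while the Reverse case first offsets the pointer by (1 - VF) elements and
// reverses the loaded vector afterwards.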
01942 
01943 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) {
01944   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
01945   // Holds vector parameters or scalars, in case of uniform vals.
01946   SmallVector<VectorParts, 4> Params;
01947 
01948   setDebugLocFromInst(Builder, Instr);
01949 
01950   // Find all of the vectorized parameters.
01951   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
01952     Value *SrcOp = Instr->getOperand(op);
01953 
01954     // If we are accessing the old induction variable, use the new one.
01955     if (SrcOp == OldInduction) {
01956       Params.push_back(getVectorValue(SrcOp));
01957       continue;
01958     }
01959 
01960     // Try using previously calculated values.
01961     Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
01962 
01963     // If the src is an instruction that appeared earlier in the basic block
01964     // then it should already be vectorized.
01965     if (SrcInst && OrigLoop->contains(SrcInst)) {
01966       assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
01967       // The parameter is a vector value from earlier.
01968       Params.push_back(WidenMap.get(SrcInst));
01969     } else {
01970       // The parameter is a scalar from outside the loop. Maybe even a constant.
01971       VectorParts Scalars;
01972       Scalars.append(UF, SrcOp);
01973       Params.push_back(Scalars);
01974     }
01975   }
01976 
01977   assert(Params.size() == Instr->getNumOperands() &&
01978          "Invalid number of operands");
01979 
01980   // Does this instruction return a value?
01981   bool IsVoidRetTy = Instr->getType()->isVoidTy();
01982 
01983   Value *UndefVec = IsVoidRetTy ? nullptr :
01984     UndefValue::get(VectorType::get(Instr->getType(), VF));
01985   // Create a new entry in the WidenMap and initialize it to Undef or Null.
01986   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
01987 
01988   Instruction *InsertPt = Builder.GetInsertPoint();
01989   BasicBlock *IfBlock = Builder.GetInsertBlock();
01990   BasicBlock *CondBlock = nullptr;
01991 
01992   VectorParts Cond;
01993   Loop *VectorLp = nullptr;
01994   if (IfPredicateStore) {
01995     assert(Instr->getParent()->getSinglePredecessor() &&
01996            "Only support single predecessor blocks");
01997     Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
01998                           Instr->getParent());
01999     VectorLp = LI->getLoopFor(IfBlock);
02000     assert(VectorLp && "Must have a loop for this block");
02001   }
02002 
02003   // For each vector unroll 'part':
02004   for (unsigned Part = 0; Part < UF; ++Part) {
02005     // For each scalar that we create:
02006     for (unsigned Width = 0; Width < VF; ++Width) {
02007 
02008       // Start if-block.
02009       Value *Cmp = nullptr;
02010       if (IfPredicateStore) {
02011         Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
02012         Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1));
02013         CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
02014         LoopVectorBody.push_back(CondBlock);
02015         VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
02016         // Update Builder with newly created basic block.
02017         Builder.SetInsertPoint(InsertPt);
02018       }
02019 
02020       Instruction *Cloned = Instr->clone();
02021       if (!IsVoidRetTy)
02022         Cloned->setName(Instr->getName() + ".cloned");
02023       // Replace the operands of the cloned instruction with extracted scalars.
02024       for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
02025         Value *Op = Params[op][Part];
02026         // Param is a vector. Need to extract the right lane.
02027         if (Op->getType()->isVectorTy())
02028           Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
02029         Cloned->setOperand(op, Op);
02030       }
02031 
02032       // Place the cloned scalar in the new loop.
02033       Builder.Insert(Cloned);
02034 
02035       // If the original scalar returns a value we need to place it in a vector
02036       // so that future users will be able to use it.
02037       if (!IsVoidRetTy)
02038         VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
02039                                                        Builder.getInt32(Width));
02040       // End if-block.
02041       if (IfPredicateStore) {
02042          BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
02043          LoopVectorBody.push_back(NewIfBlock);
02044          VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
02045          Builder.SetInsertPoint(InsertPt);
02046          Instruction *OldBr = IfBlock->getTerminator();
02047          BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
02048          OldBr->eraseFromParent();
02049          IfBlock = NewIfBlock;
02050       }
02051     }
02052   }
02053 }
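
// Illustrative sketch (not from the original source): with IfPredicateStore
// set, each scalarized lane is wrapped in its own conditional block, so the
// emitted control flow looks like
//
//   if (mask[0]) { clone of Instr for lane 0 }
//   if (mask[1]) { clone of Instr for lane 1 }
//   ...
//
// where each test extracts one lane of the edge mask computed above.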
02054 
02055 static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
02056                                  Instruction *Loc) {
02057   if (FirstInst)
02058     return FirstInst;
02059   if (Instruction *I = dyn_cast<Instruction>(V))
02060     return I->getParent() == Loc->getParent() ? I : nullptr;
02061   return nullptr;
02062 }
02063 
02064 std::pair<Instruction *, Instruction *>
02065 InnerLoopVectorizer::addStrideCheck(Instruction *Loc) {
02066   Instruction *tnullptr = nullptr;
02067   if (!Legal->mustCheckStrides())
02068     return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);
02069 
02070   IRBuilder<> ChkBuilder(Loc);
02071 
02072   // Emit checks.
02073   Value *Check = nullptr;
02074   Instruction *FirstInst = nullptr;
02075   for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(),
02076                                          SE = Legal->strides_end();
02077        SI != SE; ++SI) {
02078     Value *Ptr = stripIntegerCast(*SI);
02079     Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1),
02080                                        "stride.chk");
02081     // Store the first instruction we create.
02082     FirstInst = getFirstInst(FirstInst, C, Loc);
02083     if (Check)
02084       Check = ChkBuilder.CreateOr(Check, C);
02085     else
02086       Check = C;
02087   }
02088 
02089   // We have to do this trickery because the IRBuilder might fold the check to
02090   // a constant expression, in which case there is no Instruction anchored in
02091   // the block.
02092   LLVMContext &Ctx = Loc->getContext();
02093   Instruction *TheCheck =
02094       BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx));
02095   ChkBuilder.Insert(TheCheck, "stride.not.one");
02096   FirstInst = getFirstInst(FirstInst, TheCheck, Loc);
02097 
02098   return std::make_pair(FirstInst, TheCheck);
02099 }
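
// Illustrative sketch (not part of the original source): for two symbolic
// strides %s1 and %s2, the emitted bypass condition has the form
//
//   %stride.chk     = icmp ne i64 %s1, 1
//   %stride.chk1    = icmp ne i64 %s2, 1
//   %0              = or i1 %stride.chk, %stride.chk1
//   %stride.not.one = and i1 %0, true
//
// so the vector loop is bypassed whenever any assumed-unit stride is not 1.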
02100 
02101 std::pair<Instruction *, Instruction *>
02102 InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) {
02103   LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
02104   Legal->getRuntimePointerCheck();
02105 
02106   Instruction *tnullptr = nullptr;
02107   if (!PtrRtCheck->Need)
02108     return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);
02109 
02110   unsigned NumPointers = PtrRtCheck->Pointers.size();
02111   SmallVector<TrackingVH<Value> , 2> Starts;
02112   SmallVector<TrackingVH<Value> , 2> Ends;
02113 
02114   LLVMContext &Ctx = Loc->getContext();
02115   SCEVExpander Exp(*SE, "induction");
02116   Instruction *FirstInst = nullptr;
02117 
02118   for (unsigned i = 0; i < NumPointers; ++i) {
02119     Value *Ptr = PtrRtCheck->Pointers[i];
02120     const SCEV *Sc = SE->getSCEV(Ptr);
02121 
02122     if (SE->isLoopInvariant(Sc, OrigLoop)) {
02123       DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
02124             *Ptr <<"\n");
02125       Starts.push_back(Ptr);
02126       Ends.push_back(Ptr);
02127     } else {
02128       DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n');
02129       unsigned AS = Ptr->getType()->getPointerAddressSpace();
02130 
02131       // Use this type for pointer arithmetic.
02132       Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
02133 
02134       Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
02135       Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
02136       Starts.push_back(Start);
02137       Ends.push_back(End);
02138     }
02139   }
02140 
02141   IRBuilder<> ChkBuilder(Loc);
02142   // Our instructions might fold to a constant.
02143   Value *MemoryRuntimeCheck = nullptr;
02144   for (unsigned i = 0; i < NumPointers; ++i) {
02145     for (unsigned j = i+1; j < NumPointers; ++j) {
02146       // No need to check if two readonly pointers intersect.
02147       if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j])
02148         continue;
02149 
02150       // Only need to check pointers between two different dependency sets.
02151       if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j])
02152         continue;
02153       // Only need to check pointers in the same alias set.
02154       if (PtrRtCheck->AliasSetId[i] != PtrRtCheck->AliasSetId[j])
02155         continue;
02156 
02157       unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace();
02158       unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace();
02159 
02160       assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) &&
02161              (AS1 == Ends[i]->getType()->getPointerAddressSpace()) &&
02162              "Trying to bounds check pointers with different address spaces");
02163 
02164       Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
02165       Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
02166 
02167       Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc");
02168       Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc");
02169       Value *End0 =   ChkBuilder.CreateBitCast(Ends[i],   PtrArithTy1, "bc");
02170       Value *End1 =   ChkBuilder.CreateBitCast(Ends[j],   PtrArithTy0, "bc");
02171 
02172       Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
02173       FirstInst = getFirstInst(FirstInst, Cmp0, Loc);
02174       Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
02175       FirstInst = getFirstInst(FirstInst, Cmp1, Loc);
02176       Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
02177       FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
02178       if (MemoryRuntimeCheck) {
02179         IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
02180                                          "conflict.rdx");
02181         FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
02182       }
02183       MemoryRuntimeCheck = IsConflict;
02184     }
02185   }
02186 
02187   // We have to do this trickery because the IRBuilder might fold the check to
02188   // a constant expression, in which case there is no Instruction anchored in
02189   // the block.
02190   Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
02191                                                  ConstantInt::getTrue(Ctx));
02192   ChkBuilder.Insert(Check, "memcheck.conflict");
02193   FirstInst = getFirstInst(FirstInst, Check, Loc);
02194   return std::make_pair(FirstInst, Check);
02195 }
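
// Illustrative sketch (not from the original source): for one write pointer
// A and one read pointer B, the emitted overlap test has the form
//
//   %bound0         = icmp ule i8* %A.start, %B.end
//   %bound1         = icmp ule i8* %B.start, %A.end
//   %found.conflict = and i1 %bound0, %bound1
//
// i.e. the two ranges conflict iff each one begins no later than the other
// one ends.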
02196 
02197 void InnerLoopVectorizer::createEmptyLoop() {
02198   /*
02199    In this function we generate a new loop. The new loop will contain
02200    the vectorized instructions while the old loop will continue to run the
02201    scalar remainder.
02202 
02203        [ ] <-- Back-edge taken count overflow check.
02204     /   |
02205    /    v
02206   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
02207   |  /  |
02208   | /   v
02209   ||   [ ]     <-- vector pre header.
02210   ||    |
02211   ||    v
02212   ||   [  ] \
02213   ||   [  ]_|   <-- vector loop.
02214   ||    |
02215   | \   v
02216   |   >[ ]   <--- middle-block.
02217   |  /  |
02218   | /   v
02219   -|- >[ ]     <--- new preheader.
02220    |    |
02221    |    v
02222    |   [ ] \
02223    |   [ ]_|   <-- old scalar loop to handle remainder.
02224     \   |
02225      \  v
02226       >[ ]     <-- exit block.
02227    ...
02228    */
02229 
02230   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
02231   BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
02232   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
02233   assert(BypassBlock && "Invalid loop structure");
02234   assert(ExitBlock && "Must have an exit block");
02235 
02236   // Some loops have a single integer induction variable, while others don't.
02237   // One example is C++ iterator loops, which often have multiple pointer
02238   // induction variables. The code below also supports the case where there is
02239   // no single induction variable.
02240   OldInduction = Legal->getInduction();
02241   Type *IdxTy = Legal->getWidestInductionType();
02242 
02243   // Find the loop boundaries.
02244   const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);
02245   assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
02246 
02247   // The exit count might have the type of i64 while the phi is i32. This can
02248   // happen if we have an induction variable that is sign extended before the
02249   // compare. The only way we can get a backedge-taken count in that case is if
02250   // the induction variable was signed, and as such it will not overflow, so
02251   // the truncation is legal.
02252   if (ExitCount->getType()->getPrimitiveSizeInBits() >
02253       IdxTy->getPrimitiveSizeInBits())
02254     ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);
02255 
02256   const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
02257   // Get the total trip count from the count by adding 1.
02258   ExitCount = SE->getAddExpr(BackedgeTakeCount,
02259                              SE->getConstant(BackedgeTakeCount->getType(), 1));
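  // Illustrative example (not from the original source): for a loop
  //   for (i = 0; i != n; ++i) ...
  // with n = 4, the backedge is taken 3 times while the body executes 4
  // times, so the trip count is the backedge-taken count plus one.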
02260 
02261   // Expand the trip count and place the new instructions in the preheader.
02262   // Notice that the pre-header does not change, only the loop body.
02263   SCEVExpander Exp(*SE, "induction");
02264 
02265   // We need to test whether the backedge-taken count is the maximum unsigned
02266   // value of its type, since adding one to it would overflow and yield an
02267   // incorrect trip count in the vector body. In case of overflow we want to
02268   // jump directly to the scalar remainder loop.
02269   Value *BackedgeCount =
02270       Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(),
02271                         BypassBlock->getTerminator());
02272   if (BackedgeCount->getType()->isPointerTy())
02273     BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy,
02274                                                 "backedge.ptrcnt.to.int",
02275                                                 BypassBlock->getTerminator());
02276   Instruction *CheckBCOverflow =
02277       CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount,
02278                       Constant::getAllOnesValue(BackedgeCount->getType()),
02279                       "backedge.overflow", BypassBlock->getTerminator());
02280 
02281   // The loop index does not have to start at Zero. Find the original start
02282   // value from the induction PHI node. If we don't have an induction variable
02283   // then we know that it starts at zero.
02284   Builder.SetInsertPoint(BypassBlock->getTerminator());
02285   Value *StartIdx = ExtendedIdx = OldInduction ?
02286     Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock),
02287                        IdxTy):
02288     ConstantInt::get(IdxTy, 0);
02289 
02290   // We need an instruction to anchor the overflow check on. StartIdx needs to
02291   // be defined before the overflow check branch because the scalar preheader
02292   // is going to merge the start index, so the overflow branch block needs to
02293   // contain a definition of the start index.
02294   Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd(
02295       StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor",
02296       BypassBlock->getTerminator());
02297 
02298   // Count holds the overall loop count (N).
02299   Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
02300                                    BypassBlock->getTerminator());
02301 
02302   LoopBypassBlocks.push_back(BypassBlock);
02303 
02304   // Split the single block loop into the two loop structure described above.
02305   BasicBlock *VectorPH =
02306   BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
02307   BasicBlock *VecBody =
02308   VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
02309   BasicBlock *MiddleBlock =
02310   VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
02311   BasicBlock *ScalarPH =
02312   MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
02313 
02314   // Create and register the new vector loop.
02315   Loop* Lp = new Loop();
02316   Loop *ParentLoop = OrigLoop->getParentLoop();
02317 
02318   // Insert the new loop into the loop nest and register the new basic blocks
02319   // before calling any utilities such as SCEV that require valid LoopInfo.
02320   if (ParentLoop) {
02321     ParentLoop->addChildLoop(Lp);
02322     ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
02323     ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
02324     ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
02325   } else {
02326     LI->addTopLevelLoop(Lp);
02327   }
02328   Lp->addBasicBlockToLoop(VecBody, LI->getBase());
02329 
02330   // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
02331   // inside the loop.
02332   Builder.SetInsertPoint(VecBody->getFirstNonPHI());
02333 
02334   // Generate the induction variable.
02335   setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
02336   Induction = Builder.CreatePHI(IdxTy, 2, "index");
02337   // The loop step is equal to the vectorization factor (num of SIMD elements)
02338   // times the unroll factor (num of SIMD instructions).
02339   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
02340 
02341   // This is the IR builder that we use to add all of the logic for bypassing
02342   // the new vector loop.
02343   IRBuilder<> BypassBuilder(BypassBlock->getTerminator());
02344   setDebugLocFromInst(BypassBuilder,
02345                       getDebugLocFromInstOrOperands(OldInduction));
02346 
02347   // We may need to extend the index in case there is a type mismatch.
02348   // We know that the count starts at zero and does not overflow.
02349   if (Count->getType() != IdxTy) {
02350     // The exit count can be of pointer type. Convert it to the correct
02351     // integer type.
02352     if (ExitCount->getType()->isPointerTy())
02353       Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
02354     else
02355       Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
02356   }
02357 
02358   // Add the start index to the loop count to get the new end index.
02359   Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");
02360 
02361   // Now we need to generate the expression for N - (N % VF), which is
02362   // the part that the vectorized body will execute.
02363   Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
02364   Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
02365   Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
02366                                                      "end.idx.rnd.down");
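  // Worked example (not from the original source): with Count = 10, VF = 4
  // and UF = 1, Step is 4, so R = 10 % 4 = 2 and CountRoundDown = 8; the
  // vector body covers the first 8 iterations and the scalar remainder
  // loop executes the final 2.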
02367 
02368   // Now, compare the new count to zero. If it is zero skip the vector loop and
02369   // jump to the scalar loop.
02370   Value *Cmp =
02371       BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero");
02372 
02373   BasicBlock *LastBypassBlock = BypassBlock;
02374 
02375   // Generate code to check that the loop's trip count, which we computed by
02376   // adding one to the backedge-taken count, will not overflow.
02377   {
02378     auto PastOverflowCheck =
02379         std::next(BasicBlock::iterator(OverflowCheckAnchor));
02380     BasicBlock *CheckBlock =
02381       LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked");
02382     if (ParentLoop)
02383       ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
02384     LoopBypassBlocks.push_back(CheckBlock);
02385     Instruction *OldTerm = LastBypassBlock->getTerminator();
02386     BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm);
02387     OldTerm->eraseFromParent();
02388     LastBypassBlock = CheckBlock;
02389   }
02390 
02391   // Generate the code to check that the strides we assumed to be one are really
02392   // one. We want the new basic block to start at the first instruction in a
02393   // sequence of instructions that form a check.
02394   Instruction *StrideCheck;
02395   Instruction *FirstCheckInst;
02396   std::tie(FirstCheckInst, StrideCheck) =
02397       addStrideCheck(LastBypassBlock->getTerminator());
02398   if (StrideCheck) {
02399     // Create a new block containing the stride check.
02400     BasicBlock *CheckBlock =
02401         LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
02402     if (ParentLoop)
02403       ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
02404     LoopBypassBlocks.push_back(CheckBlock);
02405 
02406     // Replace the branch into the stride check block with a conditional branch
02407     // for the "few elements case".
02408     Instruction *OldTerm = LastBypassBlock->getTerminator();
02409     BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
02410     OldTerm->eraseFromParent();
02411 
02412     Cmp = StrideCheck;
02413     LastBypassBlock = CheckBlock;
02414   }
02415 
02416   // Generate the code that checks in runtime if arrays overlap. We put the
02417   // checks into a separate block to make the more common case of few elements
02418   // faster.
02419   Instruction *MemRuntimeCheck;
02420   std::tie(FirstCheckInst, MemRuntimeCheck) =
02421       addRuntimeCheck(LastBypassBlock->getTerminator());
02422   if (MemRuntimeCheck) {
02423     // Create a new block containing the memory check.
02424     BasicBlock *CheckBlock =
02425         LastBypassBlock->splitBasicBlock(MemRuntimeCheck, "vector.memcheck");
02426     if (ParentLoop)
02427       ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
02428     LoopBypassBlocks.push_back(CheckBlock);
02429 
02430     // Replace the branch into the memory check block with a conditional branch
02431     // for the "few elements case".
02432     Instruction *OldTerm = LastBypassBlock->getTerminator();
02433     BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
02434     OldTerm->eraseFromParent();
02435 
02436     Cmp = MemRuntimeCheck;
02437     LastBypassBlock = CheckBlock;
02438   }
02439 
02440   LastBypassBlock->getTerminator()->eraseFromParent();
02441   BranchInst::Create(MiddleBlock, VectorPH, Cmp,
02442                      LastBypassBlock);
02443 
02444   // We are going to resume the execution of the scalar loop.
02445   // Go over all of the induction variables that we found and fix the
02446   // PHIs that are left in the scalar version of the loop.
02447   // The starting values of PHI nodes depend on the counter of the last
02448   // iteration in the vectorized loop.
02449   // If we come from a bypass edge then we need to start from the original
02450   // start value.
02451 
02452   // This variable saves the new starting index for the scalar loop.
02453   PHINode *ResumeIndex = nullptr;
02454   LoopVectorizationLegality::InductionList::iterator I, E;
02455   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
02456   // Set builder to point to last bypass block.
02457   BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
02458   for (I = List->begin(), E = List->end(); I != E; ++I) {
02459     PHINode *OrigPhi = I->first;
02460     LoopVectorizationLegality::InductionInfo II = I->second;
02461 
02462     Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType();
02463     PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val",
02464                                          MiddleBlock->getTerminator());
02465     // We might have extended the type of the induction variable but we need a
02466     // truncated version for the scalar loop.
02467     PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?
02468       PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
02469                       MiddleBlock->getTerminator()) : nullptr;
02470 
02471     // Create phi nodes to merge from the backedge-taken check block.
02472     PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val",
02473                                            ScalarPH->getTerminator());
02474     BCResumeVal->addIncoming(ResumeVal, MiddleBlock);
02475 
02476     PHINode *BCTruncResumeVal = nullptr;
02477     if (OrigPhi == OldInduction) {
02478       BCTruncResumeVal =
02479           PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val",
02480                           ScalarPH->getTerminator());
02481       BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock);
02482     }
02483 
02484     Value *EndValue = nullptr;
02485     switch (II.IK) {
02486     case LoopVectorizationLegality::IK_NoInduction:
02487       llvm_unreachable("Unknown induction");
02488     case LoopVectorizationLegality::IK_IntInduction: {
02489       // Handle the integer induction counter.
02490       assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
02491 
02492       // We have the canonical induction variable.
02493       if (OrigPhi == OldInduction) {
02494         // Create a truncated version of the resume value for the scalar loop,
02495         // we might have promoted the type to a larger width.
02496         EndValue =
02497           BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
02498         // The new PHI merges the original incoming value, in case of a bypass,
02499         // or the value at the end of the vectorized loop.
02500         for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
02501           TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
02502         TruncResumeVal->addIncoming(EndValue, VecBody);
02503 
02504         BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
02505 
02506         // We know what the end value is.
02507         EndValue = IdxEndRoundDown;
02508         // We also know which PHI node holds it.
02509         ResumeIndex = ResumeVal;
02510         break;
02511       }
02512 
02513       // Not the canonical induction variable - add the vector loop count to the
02514       // start value.
02515       Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
02516                                                    II.StartValue->getType(),
02517                                                    "cast.crd");
02518       EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end");
02519       break;
02520     }
02521     case LoopVectorizationLegality::IK_ReverseIntInduction: {
02522       // Convert the CountRoundDown variable to the PHI size.
02523       Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
02524                                                    II.StartValue->getType(),
02525                                                    "cast.crd");
02526       // Handle reverse integer induction counter.
02527       EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end");
02528       break;
02529     }
02530     case LoopVectorizationLegality::IK_PtrInduction: {
02531       // For pointer induction variables, calculate the offset using
02532       // the end index.
02533       EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown,
02534                                          "ptr.ind.end");
02535       break;
02536     }
02537     case LoopVectorizationLegality::IK_ReversePtrInduction: {
02538       // The value at the end of the loop for the reverse pointer is calculated
02539       // by creating a GEP with a negative index starting from the start value.
02540       Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0);
02541       Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown,
02542                                               "rev.ind.end");
02543       EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx,
02544                                          "rev.ptr.ind.end");
02545       break;
02546     }
02547     } // end of switch
02548 
02549     // The new PHI merges the original incoming value, in case of a bypass,
02550     // or the value at the end of the vectorized loop.
02551     for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) {
02552       if (OrigPhi == OldInduction)
02553         ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
02554       else
02555         ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
02556     }
02557     ResumeVal->addIncoming(EndValue, VecBody);
02558 
02559     // Fix the scalar body counter (PHI node).
02560     unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
02561 
02562     // The old induction's phi node in the scalar body needs the truncated
02563     // value.
02564     if (OrigPhi == OldInduction) {
02565       BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]);
02566       OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal);
02567     } else {
02568       BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
02569       OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
02570     }
02571   }
02572 
02573   // If we are generating a new induction variable then we also need to
02574   // generate the code that calculates the exit value. This value is not
02575   // simply the end of the counter because we may skip the vectorized body
02576   // in case of a runtime check.
02577   if (!OldInduction) {
02578     assert(!ResumeIndex && "Unexpected resume value found");
02579     ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
02580                                   MiddleBlock->getTerminator());
02581     for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
02582       ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
02583     ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
02584   }
02585 
02586   // Make sure that we found the index where the scalar loop needs to continue.
02587   assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
02588          "Invalid resume Index");
02589 
02590   // Add a check in the middle block to see if we have completed
02591   // all of the iterations in the first vector loop.
02592   // If (N - N%VF) == N, then we *don't* need to run the remainder.
02593   Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
02594                                 ResumeIndex, "cmp.n",
02595                                 MiddleBlock->getTerminator());
02596 
02597   BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
02598   // Remove the old terminator.
02599   MiddleBlock->getTerminator()->eraseFromParent();
02600 
02601   // Create i+1 and fill the PHINode.
02602   Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
02603   Induction->addIncoming(StartIdx, VectorPH);
02604   Induction->addIncoming(NextIdx, VecBody);
02605   // Create the compare.
02606   Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
02607   Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
02608 
02609   // Now we have two terminators. Remove the old one from the block.
02610   VecBody->getTerminator()->eraseFromParent();
02611 
02612   // Get ready to start creating new instructions into the vectorized body.
02613   Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
02614 
02615   // Save the state.
02616   LoopVectorPreHeader = VectorPH;
02617   LoopScalarPreHeader = ScalarPH;
02618   LoopMiddleBlock = MiddleBlock;
02619   LoopExitBlock = ExitBlock;
02620   LoopVectorBody.push_back(VecBody);
02621   LoopScalarBody = OldBasicBlock;
02622 
02623   LoopVectorizeHints Hints(Lp, true);
02624   Hints.setAlreadyVectorized();
02625 }
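// A rough sketch (for orientation only; block names follow the code above) of
// the CFG this function leaves behind:
//
//   [ bypass check blocks ] --(trip-count/runtime checks fail)-----+
//            |                                                     |
//      [ vector.ph ] --> [ vector body ] --> [ middle block ]      |
//                                               |         |        v
//                                          (N%VF == 0)  [ scalar preheader ]
//                                               |                  |
//                                            [ exit ] <----- [ scalar loop ]
//
// The resume PHIs built above live in the middle block and the scalar
// preheader, and hand the scalar remainder loop its continuation index.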
02626 
02627 /// This function returns the identity element (or neutral element) for
02628 /// the operation K.
02629 Constant*
02630 LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) {
02631   switch (K) {
02632   case RK_IntegerXor:
02633   case RK_IntegerAdd:
02634   case RK_IntegerOr:
02635     // Adding, XORing, or ORing zero to a number does not change it.
02636     return ConstantInt::get(Tp, 0);
02637   case RK_IntegerMult:
02638     // Multiplying a number by 1 does not change it.
02639     return ConstantInt::get(Tp, 1);
02640   case RK_IntegerAnd:
02641     // ANDing a number with an all-ones value does not change it.
02642     return ConstantInt::get(Tp, -1, true);
02643   case  RK_FloatMult:
02644     // Multiplying a number by 1 does not change it.
02645     return ConstantFP::get(Tp, 1.0L);
02646   case  RK_FloatAdd:
02647     // Adding zero to a number does not change it.
02648     return ConstantFP::get(Tp, 0.0L);
02649   default:
02650     llvm_unreachable("Unknown reduction kind");
02651   }
02652 }
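// As an illustration (not code emitted by this function): seeding a
// vectorized integer add reduction with its identity gives IR along these
// lines, where only lane 0 carries the incoming scalar start value:
//
//   %identity  = <4 x i32> zeroinitializer
//   %vec.start = insertelement <4 x i32> %identity, i32 %start, i32 0
//
// Every other lane contributes the neutral element, so the horizontal
// reduction after the loop still produces the correct result.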
02653 
02654 /// This function translates the reduction kind to an LLVM binary operator.
02655 static unsigned
02656 getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
02657   switch (Kind) {
02658     case LoopVectorizationLegality::RK_IntegerAdd:
02659       return Instruction::Add;
02660     case LoopVectorizationLegality::RK_IntegerMult:
02661       return Instruction::Mul;
02662     case LoopVectorizationLegality::RK_IntegerOr:
02663       return Instruction::Or;
02664     case LoopVectorizationLegality::RK_IntegerAnd:
02665       return Instruction::And;
02666     case LoopVectorizationLegality::RK_IntegerXor:
02667       return Instruction::Xor;
02668     case LoopVectorizationLegality::RK_FloatMult:
02669       return Instruction::FMul;
02670     case LoopVectorizationLegality::RK_FloatAdd:
02671       return Instruction::FAdd;
02672     case LoopVectorizationLegality::RK_IntegerMinMax:
02673       return Instruction::ICmp;
02674     case LoopVectorizationLegality::RK_FloatMinMax:
02675       return Instruction::FCmp;
02676     default:
02677       llvm_unreachable("Unknown reduction operation");
02678   }
02679 }
02680 
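/// This function generates a compare and select sequence that implements the
/// given min/max reduction kind.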
02681 Value *createMinMaxOp(IRBuilder<> &Builder,
02682                       LoopVectorizationLegality::MinMaxReductionKind RK,
02683                       Value *Left,
02684                       Value *Right) {
02685   CmpInst::Predicate P = CmpInst::ICMP_NE;
02686   switch (RK) {
02687   default:
02688     llvm_unreachable("Unknown min/max reduction kind");
02689   case LoopVectorizationLegality::MRK_UIntMin:
02690     P = CmpInst::ICMP_ULT;
02691     break;
02692   case LoopVectorizationLegality::MRK_UIntMax:
02693     P = CmpInst::ICMP_UGT;
02694     break;
02695   case LoopVectorizationLegality::MRK_SIntMin:
02696     P = CmpInst::ICMP_SLT;
02697     break;
02698   case LoopVectorizationLegality::MRK_SIntMax:
02699     P = CmpInst::ICMP_SGT;
02700     break;
02701   case LoopVectorizationLegality::MRK_FloatMin:
02702     P = CmpInst::FCMP_OLT;
02703     break;
02704   case LoopVectorizationLegality::MRK_FloatMax:
02705     P = CmpInst::FCMP_OGT;
02706     break;
02707   }
02708 
02709   Value *Cmp;
02710   if (RK == LoopVectorizationLegality::MRK_FloatMin ||
02711       RK == LoopVectorizationLegality::MRK_FloatMax)
02712     Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp");
02713   else
02714     Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");
02715 
02716   Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
02717   return Select;
02718 }
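// For example (illustrative value names), for MRK_SIntMax and VF = 4 this
// helper emits IR of the form:
//
//   %rdx.minmax.cmp    = icmp sgt <4 x i32> %left, %right
//   %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp,
//                               <4 x i32> %left, <4 x i32> %right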
02719 
02720 namespace {
02721 struct CSEDenseMapInfo {
02722   static bool canHandle(Instruction *I) {
02723     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
02724            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
02725   }
02726   static inline Instruction *getEmptyKey() {
02727     return DenseMapInfo<Instruction *>::getEmptyKey();
02728   }
02729   static inline Instruction *getTombstoneKey() {
02730     return DenseMapInfo<Instruction *>::getTombstoneKey();
02731   }
02732   static unsigned getHashValue(Instruction *I) {
02733     assert(canHandle(I) && "Unknown instruction!");
02734     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
02735                                                            I->value_op_end()));
02736   }
02737   static bool isEqual(Instruction *LHS, Instruction *RHS) {
02738     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
02739         LHS == getTombstoneKey() || RHS == getTombstoneKey())
02740       return LHS == RHS;
02741     return LHS->isIdenticalTo(RHS);
02742   }
02743 };
02744 }
02745 
02746 /// \brief Check whether this block is a predicated block.
02747 /// Due to if predication of stores we might create a sequence of "if(pred) a[i]
02748 /// = ...;  " blocks. We start with one vectorized basic block. For every
02749 /// conditional block we split this vectorized block. Therefore, every second
02750 /// block will be a predicated one.
02751 static bool isPredicatedBlock(unsigned BlockNum) {
02752   return BlockNum % 2;
02753 }
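// For example, after predicating two stores the vector body is laid out as:
//
//   LoopVectorBody[0]  straight-line code before the first store
//   LoopVectorBody[1]  if (pred) a[i] = ...   <- predicated
//   LoopVectorBody[2]  straight-line code between the stores
//   LoopVectorBody[3]  if (pred) b[i] = ...   <- predicated
//   LoopVectorBody[4]  the rest of the body
//
// which is why the odd block numbers are the predicated ones.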
02754 
02755 /// \brief Perform CSE of induction variable instructions.
02756 static void cse(SmallVector<BasicBlock *, 4> &BBs) {
02757   // Perform simple CSE.
02758   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
02759   for (unsigned i = 0, e = BBs.size(); i != e; ++i) {
02760     BasicBlock *BB = BBs[i];
02761     for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
02762       Instruction *In = I++;
02763 
02764       if (!CSEDenseMapInfo::canHandle(In))
02765         continue;
02766 
02767       // Check if we can replace this instruction with any of the
02768       // visited instructions.
02769       if (Instruction *V = CSEMap.lookup(In)) {
02770         In->replaceAllUsesWith(V);
02771         In->eraseFromParent();
02772         continue;
02773       }
02774       // Ignore instructions in conditional blocks. We create "if (pred) a[i] =
02775       // ...;" blocks for predicated stores. Every second block is a predicated
02776       // block.
02777       if (isPredicatedBlock(i))
02778         continue;
02779 
02780       CSEMap[In] = In;
02781     }
02782   }
02783 }
02784 
02785 /// \brief Adds a 'fast' flag to floating point operations.
02786 static Value *addFastMathFlag(Value *V) {
02787   if (isa<FPMathOperator>(V)) {
02788     FastMathFlags Flags;
02789     Flags.setUnsafeAlgebra();
02790     cast<Instruction>(V)->setFastMathFlags(Flags);
02791   }
02792   return V;
02793 }
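// For instance, an FAdd created while combining the unrolled reduction parts
// below ends up tagged like this (illustrative IR):
//
//   %bin.rdx = fadd fast <4 x float> %rdx.part1, %rdx.part0
//
// Without the 'fast' flag the reassociation performed by the horizontal
// reduction would not be legal for floating point.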
02794 
02795 void InnerLoopVectorizer::vectorizeLoop() {
02796   //===------------------------------------------------===//
02797   //
02798   // Notice: any optimization or new instruction that goes
02799   // into the code below should also be implemented in
02800   // the cost-model.
02801   //
02802   //===------------------------------------------------===//
02803   Constant *Zero = Builder.getInt32(0);
02804 
02805   // In order to support reduction variables we need to be able to vectorize
02806   // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
02807   // stages. First, we create a new vector PHI node with no incoming edges.
02808   // We use this value when we vectorize all of the instructions that use the
02809   // PHI. Next, after all of the instructions in the block are complete we
02810   // add the new incoming edges to the PHI. At this point all of the
02811   // instructions in the basic block are vectorized, so we can use them to
02812   // construct the PHI.
02813   PhiVector RdxPHIsToFix;
02814 
02815   // Scan the loop in a topological order to ensure that defs are vectorized
02816   // before users.
02817   LoopBlocksDFS DFS(OrigLoop);
02818   DFS.perform(LI);
02819 
02820   // Vectorize all of the blocks in the original loop.
02821   for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
02822        be = DFS.endRPO(); bb != be; ++bb)
02823     vectorizeBlockInLoop(*bb, &RdxPHIsToFix);
02824 
02825   // At this point every instruction in the original loop is widened to
02826   // a vector form. We are almost done. Now, we need to fix the PHI nodes
02827   // that we vectorized. The PHI nodes are currently empty because we did
02828   // not want to introduce cycles. Notice that the remaining PHI nodes
02829   // that we need to fix are reduction variables.
02830 
02831   // Create the 'reduced' values for each of the reduction vars.
02832   // The reduced values are the vector values that we scalarize and combine
02833   // after the loop is finished.
02834   for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
02835        it != e; ++it) {
02836     PHINode *RdxPhi = *it;
02837     assert(RdxPhi && "Unable to recover vectorized PHI");
02838 
02839     // Find the reduction variable descriptor.
02840     assert(Legal->getReductionVars()->count(RdxPhi) &&
02841            "Unable to find the reduction variable");
02842     LoopVectorizationLegality::ReductionDescriptor RdxDesc =
02843         (*Legal->getReductionVars())[RdxPhi];
02844 
02845     setDebugLocFromInst(Builder, RdxDesc.StartValue);
02846 
02847     // We need to generate a reduction vector from the incoming scalar.
02848     // To do so, we need to generate the 'identity' vector and override
02849     // one of the elements with the incoming scalar reduction. We need
02850     // to do it in the vector-loop preheader.
02851     Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
02852 
02853     // This is the vector-clone of the value that leaves the loop.
02854     VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
02855     Type *VecTy = VectorExit[0]->getType();
02856 
02857     // Find the reduction identity variable. Zero for addition, OR and XOR;
02858     // one for multiplication; -1 for AND.
02859     Value *Identity;
02860     Value *VectorStart;
02861     if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax ||
02862         RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) {
02863       // MinMax reductions have the start value as their identity.
02864       if (VF == 1) {
02865         VectorStart = Identity = RdxDesc.StartValue;
02866       } else {
02867         VectorStart = Identity = Builder.CreateVectorSplat(VF,
02868                                                            RdxDesc.StartValue,
02869                                                            "minmax.ident");
02870       }
02871     } else {
02872       // Handle other reduction kinds:
02873       Constant *Iden =
02874       LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
02875                                                       VecTy->getScalarType());
02876       if (VF == 1) {
02877         Identity = Iden;
02878         // This vector is the Identity vector where the first element is the
02879         // incoming scalar reduction.
02880         VectorStart = RdxDesc.StartValue;
02881       } else {
02882         Identity = ConstantVector::getSplat(VF, Iden);
02883 
02884         // This vector is the Identity vector where the first element is the
02885         // incoming scalar reduction.
02886         VectorStart = Builder.CreateInsertElement(Identity,
02887                                                   RdxDesc.StartValue, Zero);
02888       }
02889     }
02890 
02891     // Fix the vector-loop phi.
02892 
02893     // Reductions do not have to start at zero. They can start with
02894     // any loop invariant values.
02895     VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
02896     BasicBlock *Latch = OrigLoop->getLoopLatch();
02897     Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
02898     VectorParts &Val = getVectorValue(LoopVal);
02899     for (unsigned part = 0; part < UF; ++part) {
02900       // Make sure to add the reduction start value only to the
02901       // first unroll part.
02902       Value *StartVal = (part == 0) ? VectorStart : Identity;
02903       cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal,
02904                                                   LoopVectorPreHeader);
02905       cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
02906                                                   LoopVectorBody.back());
02907     }
02908 
02909     // Before each round, move the insertion point right between
02910     // the PHIs and the values we are going to write.
02911     // This allows us to write both PHINodes and the extractelement
02912     // instructions.
02913     Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
02914 
02915     VectorParts RdxParts;
02916     setDebugLocFromInst(Builder, RdxDesc.LoopExitInstr);
02917     for (unsigned part = 0; part < UF; ++part) {
02918       // This PHINode contains the vectorized reduction variable, or
02919       // the initial value vector, if we bypass the vector loop.
02920       VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
02921       PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
02922       Value *StartVal = (part == 0) ? VectorStart : Identity;
02923       for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
02924         NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
02925       NewPhi->addIncoming(RdxExitVal[part],
02926                           LoopVectorBody.back());
02927       RdxParts.push_back(NewPhi);
02928     }
02929 
02930     // Reduce all of the unrolled parts into a single vector.
02931     Value *ReducedPartRdx = RdxParts[0];
02932     unsigned Op = getReductionBinOp(RdxDesc.Kind);
02933     setDebugLocFromInst(Builder, ReducedPartRdx);
02934     for (unsigned part = 1; part < UF; ++part) {
02935       if (Op != Instruction::ICmp && Op != Instruction::FCmp)
02936         // Floating point operations had to be 'fast' to enable the reduction.
02937         ReducedPartRdx = addFastMathFlag(
02938             Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
02939                                 ReducedPartRdx, "bin.rdx"));
02940       else
02941         ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind,
02942                                         ReducedPartRdx, RdxParts[part]);
02943     }
02944 
02945     if (VF > 1) {
02946       // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
02947       // and vector ops, reducing the set of values being computed by half each
02948       // round.
02949       assert(isPowerOf2_32(VF) &&
02950              "Reduction emission only supported for pow2 vectors!");
02951       Value *TmpVec = ReducedPartRdx;
02952       SmallVector<Constant*, 32> ShuffleMask(VF, nullptr);
02953       for (unsigned i = VF; i != 1; i >>= 1) {
02954         // Move the upper half of the vector to the lower half.
02955         for (unsigned j = 0; j != i/2; ++j)
02956           ShuffleMask[j] = Builder.getInt32(i/2 + j);
02957 
02958         // Fill the rest of the mask with undef.
02959         std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
02960                   UndefValue::get(Builder.getInt32Ty()));
02961 
02962         Value *Shuf =
02963             Builder.CreateShuffleVector(TmpVec,
02964                                         UndefValue::get(TmpVec->getType()),
02965                                         ConstantVector::get(ShuffleMask),
02966                                         "rdx.shuf");
02967 
02968         if (Op != Instruction::ICmp && Op != Instruction::FCmp)
02969           // Floating point operations had to be 'fast' to enable the reduction.
02970           TmpVec = addFastMathFlag(Builder.CreateBinOp(
02971               (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
02972         else
02973           TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
02974       }
02975 
02976       // The result is in the first element of the vector.
02977       ReducedPartRdx = Builder.CreateExtractElement(TmpVec,
02978                                                     Builder.getInt32(0));
02979     }
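    // A worked example (illustrative names) of the log2(VF) reduction above,
    // for an integer add with VF = 4:
    //
    //   %rdx.shuf  = shufflevector <4 x i32> %tmp, <4 x i32> undef,
    //                    <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    //   %bin.rdx   = add <4 x i32> %tmp, %rdx.shuf
    //   %rdx.shuf2 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef,
    //                    <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    //   %bin.rdx2  = add <4 x i32> %bin.rdx, %rdx.shuf2
    //   %sum       = extractelement <4 x i32> %bin.rdx2, i32 0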
02980 
02981     // Create a phi node that merges control-flow from the backedge-taken check
02982     // block and the middle block.
02983     PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
02984                                           LoopScalarPreHeader->getTerminator());
02985     BCBlockPhi->addIncoming(RdxDesc.StartValue, LoopBypassBlocks[0]);
02986     BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
02987 
02988     // Now, we need to fix the users of the reduction variable
02989     // inside and outside of the scalar remainder loop.
02990     // We know that the loop is in LCSSA form. We need to update the
02991     // PHI nodes in the exit blocks.
02992     for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
02993          LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
02994       PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
02995       if (!LCSSAPhi) break;
02996 
02997       // All PHINodes need to have a single entry edge, or two if
02998       // we already fixed them.
02999       assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
03000 
03001       // We found our reduction value exit-PHI. Update it with the
03002       // incoming bypass edge.
03003       if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
03004         // Add an edge coming from the bypass.
03005         LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
03006         break;
03007       }
03008     }// end of the LCSSA phi scan.
03009 
03010     // Fix the scalar loop reduction variable with the incoming reduction sum
03011     // from the vector body and from the backedge value.
03012     int IncomingEdgeBlockIdx =
03013         RdxPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
03014     assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
03015     // Pick the other block.
03016     int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
03017     RdxPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
03018     RdxPhi->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
03019   }// end of for each reduction variable.
03020 
03021   fixLCSSAPHIs();
03022 
03023   // Remove redundant induction instructions.
03024   cse(LoopVectorBody);
03025 }
03026 
03027 void InnerLoopVectorizer::fixLCSSAPHIs() {
03028   for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
03029        LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
03030     PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
03031     if (!LCSSAPhi) break;
03032     if (LCSSAPhi->getNumIncomingValues() == 1)
03033       LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
03034                             LoopMiddleBlock);
03035   }
03036 }
03037 
03038 InnerLoopVectorizer::VectorParts
03039 InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
03040   assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
03041          "Invalid edge");
03042 
03043   // Look for cached value.
03044   std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst);
03045   EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge);
03046   if (ECEntryIt != MaskCache.end())
03047     return ECEntryIt->second;
03048 
03049   VectorParts SrcMask = createBlockInMask(Src);
03050 
03051   // The terminator has to be a branch inst!
03052   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
03053   assert(BI && "Unexpected terminator found");
03054 
03055   if (BI->isConditional()) {
03056     VectorParts EdgeMask = getVectorValue(BI->getCondition());
03057 
03058     if (BI->getSuccessor(0) != Dst)
03059       for (unsigned part = 0; part < UF; ++part)
03060         EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
03061 
03062     for (unsigned part = 0; part < UF; ++part)
03063       EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
03064 
03065     MaskCache[Edge] = EdgeMask;
03066     return EdgeMask;
03067   }
03068 
03069   MaskCache[Edge] = SrcMask;
03070   return SrcMask;
03071 }
03072 
03073 InnerLoopVectorizer::VectorParts
03074 InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
03075   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
03076 
03077   // Loop incoming mask is all-one.
03078   if (OrigLoop->getHeader() == BB) {
03079     Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
03080     return getVectorValue(C);
03081   }
03082 
03083   // This is the block mask. We OR all incoming edge masks, starting from zero.
03084   Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
03085   VectorParts BlockMask = getVectorValue(Zero);
03086 
03087   // For each pred:
03088   for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
03089     VectorParts EM = createEdgeMask(*it, BB);
03090     for (unsigned part = 0; part < UF; ++part)
03091       BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
03092   }
03093 
03094   return BlockMask;
03095 }
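// As a small example, for the diamond
//
//   header: br i1 %c, label %then, label %else
//
// the mask of %then is (header.mask & %c), the mask of %else is
// (header.mask & ~%c), and the join block ORs the two edge masks back
// together, recovering the header mask.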
03096 
03097 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
03098                                               InnerLoopVectorizer::VectorParts &Entry,
03099                                               unsigned UF, unsigned VF, PhiVector *PV) {
03100   PHINode* P = cast<PHINode>(PN);
03101   // Handle reduction variables:
03102   if (Legal->getReductionVars()->count(P)) {
03103     for (unsigned part = 0; part < UF; ++part) {
03104       // This is phase one of vectorizing PHIs.
03105       Type *VecTy = (VF == 1) ? PN->getType() :
03106           VectorType::get(PN->getType(), VF);
03107       Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
03108                                     LoopVectorBody.back()->getFirstInsertionPt());
03109     }
03110     PV->push_back(P);
03111     return;
03112   }
03113 
03114   setDebugLocFromInst(Builder, P);
03115   // Check for PHI nodes that are lowered to vector selects.
03116   if (P->getParent() != OrigLoop->getHeader()) {
03117     // We know that all PHIs in non-header blocks are converted into
03118     // selects, so we don't have to worry about the insertion order and we
03119     // can just use the builder.
03120     // At this point we generate the predication tree. There may be
03121     // duplications since this is a simple recursive scan, but future
03122     // optimizations will clean it up.
03123 
03124     unsigned NumIncoming = P->getNumIncomingValues();
03125 
03126     // Generate a sequence of selects of the form:
03127     // SELECT(Mask3, In3,
03128     //      SELECT(Mask2, In2,
03129     //                   ( ...)))
03130     for (unsigned In = 0; In < NumIncoming; In++) {
03131       VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
03132                                         P->getParent());
03133       VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
03134 
03135       for (unsigned part = 0; part < UF; ++part) {
03136         // We might have single edge PHIs (blocks) - use an identity
03137         // 'select' for the first PHI operand.
03138         if (In == 0)
03139           Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
03140                                              In0[part]);
03141         else
03142           // Select between the current value and the previous incoming edge
03143           // based on the incoming mask.
03144           Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
03145                                              Entry[part], "predphi");
03146       }
03147     }
03148     return;
03149   }
03150 
03151   // This PHINode must be an induction variable.
03152   // Make sure that we know about it.
03153   assert(Legal->getInductionVars()->count(P) &&
03154          "Not an induction variable");
03155 
03156   LoopVectorizationLegality::InductionInfo II =
03157       Legal->getInductionVars()->lookup(P);
03158 
03159   switch (II.IK) {
03160     case LoopVectorizationLegality::IK_NoInduction:
03161       llvm_unreachable("Unknown induction");
03162     case LoopVectorizationLegality::IK_IntInduction: {
03163       assert(P->getType() == II.StartValue->getType() && "Types must match");
03164       Type *PhiTy = P->getType();
03165       Value *Broadcasted;
03166       if (P == OldInduction) {
03167         // Handle the canonical induction variable. We might have had to
03168         // extend the type.
03169         Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
03170       } else {
03171         // Handle other induction variables that are now based on the
03172         // canonical one.
03173         Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
03174                                                  "normalized.idx");
03175         NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
03176         Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx,
03177                                         "offset.idx");
03178       }
03179       Broadcasted = getBroadcastInstrs(Broadcasted);
03180       // After broadcasting the induction variable we need to make the vector
03181       // consecutive by adding 0, 1, 2, etc.
03182       for (unsigned part = 0; part < UF; ++part)
03183         Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
03184       return;
03185     }
03186     case LoopVectorizationLegality::IK_ReverseIntInduction:
03187     case LoopVectorizationLegality::IK_PtrInduction:
03188     case LoopVectorizationLegality::IK_ReversePtrInduction:
03189       // Handle reverse integer and pointer inductions.
03190       Value *StartIdx = ExtendedIdx;
03191       // This is the normalized GEP that starts counting at zero.
03192       Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
03193                                                "normalized.idx");
03194 
03195       // Handle the reverse integer induction variable case.
03196       if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
03197         IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
03198         Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
03199                                                "resize.norm.idx");
03200         Value *ReverseInd  = Builder.CreateSub(II.StartValue, CNI,
03201                                                "reverse.idx");
03202 
03203         // This is a new value so do not hoist it out.
03204         Value *Broadcasted = getBroadcastInstrs(ReverseInd);
03205         // After broadcasting the induction variable we need to make the
03206         // vector consecutive by adding ... -3, -2, -1, 0.
03207         for (unsigned part = 0; part < UF; ++part)
03208           Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
03209                                              true);
03210         return;
03211       }
03212 
03213       // Handle the pointer induction variable case.
03214       assert(P->getType()->isPointerTy() && "Unexpected type.");
03215 
03216       // Is this a reverse induction ptr or a consecutive induction ptr?
03217       bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
03218                       II.IK);
03219 
03220       // This is the vector of results. Notice that we don't generate
03221       // vector geps because scalar geps result in better code.
03222       for (unsigned part = 0; part < UF; ++part) {
03223         if (VF == 1) {
03224           int EltIndex = (part) * (Reverse ? -1 : 1);
03225           Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
03226           Value *GlobalIdx;
03227           if (Reverse)
03228             GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
03229           else
03230             GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
03231 
03232           Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
03233                                              "next.gep");
03234           Entry[part] = SclrGep;
03235           continue;
03236         }
03237 
03238         Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
03239         for (unsigned int i = 0; i < VF; ++i) {
03240           int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
03241           Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
03242           Value *GlobalIdx;
03243           if (!Reverse)
03244             GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
03245           else
03246             GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
03247 
03248           Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
03249                                              "next.gep");
03250           VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
03251                                                Builder.getInt32(i),
03252                                                "insert.gep");
03253         }
03254         Entry[part] = VecVal;
03255       }
03256       return;
03257   }
03258 }
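// To illustrate the IK_IntInduction path above (value names are made up):
// for VF = 4 the scalar induction value is broadcast and then made
// consecutive, so part 0 of the widened PHI is roughly
//
//   %broadcast = shufflevector <4 x i32> %splat, <4 x i32> undef,
//                              <4 x i32> zeroinitializer
//   %induction = add <4 x i32> %broadcast, <i32 0, i32 1, i32 2, i32 3>
//
// and part 1 starts VF elements later, adding <i32 4, i32 5, i32 6, i32 7>.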
03259 
03260 void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
03261   // For each instruction in the old loop.
03262   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
03263     VectorParts &Entry = WidenMap.get(it);
03264     switch (it->getOpcode()) {
03265     case Instruction::Br:
03266       // Nothing to do for PHIs and BR, since we already took care of the
03267       // loop control flow instructions.
03268       continue;
03269     case Instruction::PHI:{
03270       // Vectorize PHINodes.
03271       widenPHIInstruction(it, Entry, UF, VF, PV);
03272       continue;
03273     }// End of PHI.
03274 
03275     case Instruction::Add:
03276     case Instruction::FAdd:
03277     case Instruction::Sub:
03278     case Instruction::FSub:
03279     case Instruction::Mul:
03280     case Instruction::FMul:
03281     case Instruction::UDiv:
03282     case Instruction::SDiv:
03283     case Instruction::FDiv:
03284     case Instruction::URem:
03285     case Instruction::SRem:
03286     case Instruction::FRem:
03287     case Instruction::Shl:
03288     case Instruction::LShr:
03289     case Instruction::AShr:
03290     case Instruction::And:
03291     case Instruction::Or:
03292     case Instruction::Xor: {
03293       // Just widen binops.
03294       BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
03295       setDebugLocFromInst(Builder, BinOp);
03296       VectorParts &A = getVectorValue(it->getOperand(0));
03297       VectorParts &B = getVectorValue(it->getOperand(1));
03298 
03299       // Use this vector value for all users of the original instruction.
03300       for (unsigned Part = 0; Part < UF; ++Part) {
03301         Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
03302 
03303         if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
03304           VecOp->copyIRFlags(BinOp);
03305 
03306         Entry[Part] = V;
03307       }
03308 
03309       propagateMetadata(Entry, it);
03310       break;
03311     }
03312     case Instruction::Select: {
03313       // Widen selects.
03314       // If the selector is loop invariant we can create a select
03315       // instruction with a scalar condition. Otherwise, use vector-select.
03316       bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
03317                                                OrigLoop);
03318       setDebugLocFromInst(Builder, it);
03319 
03320       // The condition can be loop invariant but still defined inside the
03321       // loop. This means that we can't just use the original 'cond' value.
03322       // We have to take the 'vectorized' value and pick the first lane.
03323       // Instcombine will make this a no-op.
03324       VectorParts &Cond = getVectorValue(it->getOperand(0));
03325       VectorParts &Op0  = getVectorValue(it->getOperand(1));
03326       VectorParts &Op1  = getVectorValue(it->getOperand(2));
03327 
03328       Value *ScalarCond = (VF == 1) ? Cond[0] :
03329         Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
03330 
03331       for (unsigned Part = 0; Part < UF; ++Part) {
03332         Entry[Part] = Builder.CreateSelect(
03333           InvariantCond ? ScalarCond : Cond[Part],
03334           Op0[Part],
03335           Op1[Part]);
03336       }
03337 
03338       propagateMetadata(Entry, it);
03339       break;
03340     }
03341 
03342     case Instruction::ICmp:
03343     case Instruction::FCmp: {
03344       // Widen compares. Generate vector compares.
03345       bool FCmp = (it->getOpcode() == Instruction::FCmp);
03346       CmpInst *Cmp = dyn_cast<CmpInst>(it);
03347       setDebugLocFromInst(Builder, it);
03348       VectorParts &A = getVectorValue(it->getOperand(0));
03349       VectorParts &B = getVectorValue(it->getOperand(1));
03350       for (unsigned Part = 0; Part < UF; ++Part) {
03351         Value *C = nullptr;
03352         if (FCmp)
03353           C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
03354         else
03355           C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
03356         Entry[Part] = C;
03357       }
03358 
03359       propagateMetadata(Entry, it);
03360       break;
03361     }
03362 
03363     case Instruction::Store:
03364     case Instruction::Load:
03365       vectorizeMemoryInstruction(it);
03366       break;
03367     case Instruction::ZExt:
03368     case Instruction::SExt:
03369     case Instruction::FPToUI:
03370     case Instruction::FPToSI:
03371     case Instruction::FPExt:
03372     case Instruction::PtrToInt:
03373     case Instruction::IntToPtr:
03374     case Instruction::SIToFP:
03375     case Instruction::UIToFP:
03376     case Instruction::Trunc:
03377     case Instruction::FPTrunc:
03378     case Instruction::BitCast: {
03379       CastInst *CI = dyn_cast<CastInst>(it);
03380       setDebugLocFromInst(Builder, it);
03381       /// Optimize the special case where the source is the induction
03382       /// variable. Notice that we can only optimize the 'trunc' case
03383       /// because: a. FP conversions lose precision, b. sext/zext may wrap,
03384       /// c. other casts depend on pointer size.
03385       if (CI->getOperand(0) == OldInduction &&
03386           it->getOpcode() == Instruction::Trunc) {
03387         Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
03388                                                CI->getType());
03389         Value *Broadcasted = getBroadcastInstrs(ScalarCast);
03390         for (unsigned Part = 0; Part < UF; ++Part)
03391           Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false);
03392         propagateMetadata(Entry, it);
03393         break;
03394       }
03395       /// Vectorize casts.
03396       Type *DestTy = (VF == 1) ? CI->getType() :
03397                                  VectorType::get(CI->getType(), VF);
03398 
03399       VectorParts &A = getVectorValue(it->getOperand(0));
03400       for (unsigned Part = 0; Part < UF; ++Part)
03401         Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
03402       propagateMetadata(Entry, it);
03403       break;
03404     }
03405 
03406     case Instruction::Call: {
03407       // Ignore dbg intrinsics.
03408       if (isa<DbgInfoIntrinsic>(it))
03409         break;
03410       setDebugLocFromInst(Builder, it);
03411 
03412       Module *M = BB->getParent()->getParent();
03413       CallInst *CI = cast<CallInst>(it);
03414       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
03415       assert(ID && "Not an intrinsic call!");
03416       switch (ID) {
03417       case Intrinsic::assume:
03418       case Intrinsic::lifetime_end:
03419       case Intrinsic::lifetime_start:
03420         scalarizeInstruction(it);
03421         break;
03422       default:
03423         bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1);
03424         for (unsigned Part = 0; Part < UF; ++Part) {
03425           SmallVector<Value *, 4> Args;
03426           for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
03427             if (HasScalarOpd && i == 1) {
03428               Args.push_back(CI->getArgOperand(i));
03429               continue;
03430             }
03431             VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
03432             Args.push_back(Arg[Part]);
03433           }
03434           Type *Tys[] = {CI->getType()};
03435           if (VF > 1)
03436             Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF);
03437 
03438           Function *F = Intrinsic::getDeclaration(M, ID, Tys);
03439           Entry[Part] = Builder.CreateCall(F, Args);
03440         }
03441 
03442         propagateMetadata(Entry, it);
03443         break;
03444       }
03445       break;
03446     }
03447 
03448     default:
03449       // All other instructions are unsupported. Scalarize them.
03450       scalarizeInstruction(it);
03451       break;
03452     }// end of switch.
03453   }// end of for_each instr.
03454 }
03455 
03456 void InnerLoopVectorizer::updateAnalysis() {
03457   // Forget the original loop.
03458   SE->forgetLoop(OrigLoop);
03459 
03460   // Update the dominator tree information.
03461   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
03462          "Entry does not dominate exit.");
03463 
03464   for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
03465     DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
03466   DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
03467 
03468   // Due to if predication of stores we might create a sequence of "if(pred)
03469   // a[i] = ...;  " blocks.
03470   for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) {
03471     if (i == 0)
03472       DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
03473     else if (isPredicatedBlock(i)) {
03474       DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]);
03475     } else {
03476       DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]);
03477     }
03478   }
03479 
03480   DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);
03481   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
03482   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
03483   DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
03484 
03485   DEBUG(DT->verifyDomTree());
03486 }
03487 
03488 /// \brief Check whether it is safe to if-convert this phi node.
03489 ///
03490 /// Phi nodes with constant expressions that can trap are not safe to
03491 /// if-convert.
03492 static bool canIfConvertPHINodes(BasicBlock *BB) {
03493   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
03494     PHINode *Phi = dyn_cast<PHINode>(I);
03495     if (!Phi)
03496       return true;
03497     for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p)
03498       if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p)))
03499         if (C->canTrap())
03500           return false;
03501   }
03502   return true;
03503 }
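// For example, a block containing
//
//   %phi = phi i32 [ sdiv (i32 1, i32 0), %then ], [ %x, %else ]
//
// must not be if-converted: turning this PHI into a select would
// unconditionally evaluate the trapping divide-by-zero constant expression.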
03504 
03505 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
03506   if (!EnableIfConversion) {
03507     emitAnalysis(Report() << "if-conversion is disabled");
03508     return false;
03509   }
03510 
03511   assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
03512 
03513   // A list of pointers that we can safely read and write to.
03514   SmallPtrSet<Value *, 8> SafePointers;
03515 
03516   // Collect safe addresses.
03517   for (Loop::block_iterator BI = TheLoop->block_begin(),
03518          BE = TheLoop->block_end(); BI != BE; ++BI) {
03519     BasicBlock *BB = *BI;
03520 
03521     if (blockNeedsPredication(BB))
03522       continue;
03523 
03524     for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
03525       if (LoadInst *LI = dyn_cast<LoadInst>(I))
03526         SafePointers.insert(LI->getPointerOperand());
03527       else if (StoreInst *SI = dyn_cast<StoreInst>(I))
03528         SafePointers.insert(SI->getPointerOperand());
03529     }
03530   }
03531 
03532   // Collect the blocks that need predication.
03533   BasicBlock *Header = TheLoop->getHeader();
03534   for (Loop::block_iterator BI = TheLoop->block_begin(),
03535          BE = TheLoop->block_end(); BI != BE; ++BI) {
03536     BasicBlock *BB = *BI;
03537 
03538     // We don't support switch statements inside loops.
03539     if (!isa<BranchInst>(BB->getTerminator())) {
03540       emitAnalysis(Report(BB->getTerminator())
03541                    << "loop contains a switch statement");
03542       return false;
03543     }
03544 
03545     // We must be able to predicate all blocks that need to be predicated.
03546     if (blockNeedsPredication(BB)) {
03547       if (!blockCanBePredicated(BB, SafePointers)) {
03548         emitAnalysis(Report(BB->getTerminator())
03549                      << "control flow cannot be substituted for a select");
03550         return false;
03551       }
03552     } else if (BB != Header && !canIfConvertPHINodes(BB)) {
03553       emitAnalysis(Report(BB->getTerminator())
03554                    << "control flow cannot be substituted for a select");
03555       return false;
03556     }
03557   }
03558 
03559   // We can if-convert this loop.
03560   return true;
03561 }
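// A sketch (in C) of a loop that passes this check:
//
//   for (int i = 0; i < n; ++i)
//     if (a[i] > 0)        // the load of a[i] feeding the compare is
//       a[i] = b[i] * 2;   // unpredicated, so &a[i] is a safe pointer and
//                          // the guarded store can be predicated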
03562 
03563 bool LoopVectorizationLegality::canVectorize() {
03564   // We must have a loop in canonical form. Loops with indirectbr in them cannot
03565   // be canonicalized.
03566   if (!TheLoop->getLoopPreheader()) {
03567     emitAnalysis(
03568         Report() << "loop control flow is not understood by vectorizer");
03569     return false;
03570   }
03571 
03572   // We can only vectorize innermost loops.
03573   if (!TheLoop->getSubLoopsVector().empty()) {
03574     emitAnalysis(Report() << "loop is not the innermost loop");
03575     return false;
03576   }
03577 
03578   // We must have a single backedge.
03579   if (TheLoop->getNumBackEdges() != 1) {
03580     emitAnalysis(
03581         Report() << "loop control flow is not understood by vectorizer");
03582     return false;
03583   }
03584 
03585   // We must have a single exiting block.
03586   if (!TheLoop->getExitingBlock()) {
03587     emitAnalysis(
03588         Report() << "loop control flow is not understood by vectorizer");
03589     return false;
03590   }
03591 
03592   // We only handle bottom-tested loops, i.e. loops in which the condition is
03593   // checked at the end of each iteration. With that we can assume that all
03594   // instructions in the loop are executed the same number of times.
03595   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
03596     emitAnalysis(
03597         Report() << "loop control flow is not understood by vectorizer");
03598     return false;
03599   }
03600 
03601   // We need to have a loop header.
03602   DEBUG(dbgs() << "LV: Found a loop: " <<
03603         TheLoop->getHeader()->getName() << '\n');
03604 
03605   // Check if we can if-convert non-single-bb loops.
03606   unsigned NumBlocks = TheLoop->getNumBlocks();
03607   if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
03608     DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
03609     return false;
03610   }
03611 
03612   // ScalarEvolution needs to be able to find the exit count.
03613   const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
03614   if (ExitCount == SE->getCouldNotCompute()) {
03615     emitAnalysis(Report() << "could not determine number of loop iterations");
03616     DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
03617     return false;
03618   }
03619 
03620   // Check if we can vectorize the instructions and CFG in this loop.
03621   if (!canVectorizeInstrs()) {
03622     DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
03623     return false;
03624   }
03625 
03626   // Go over each instruction and look at memory deps.
03627   if (!canVectorizeMemory()) {
03628     DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
03629     return false;
03630   }
03631 
03632   // Collect all of the variables that remain uniform after vectorization.
03633   collectLoopUniforms();
03634 
03635   DEBUG(dbgs() << "LV: We can vectorize this loop" <<
03636         (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
03637         <<"!\n");
03638 
03639   // Okay! We can vectorize. At this point we don't have any other mem analysis
03640   // which may limit our maximum vectorization factor, so just return true with
03641   // no restrictions.
03642   return true;
03643 }
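// Taken together, the shape this function accepts looks like (C sketch):
//
//   for (i = 0; i < n; ++i)    // single preheader, single backedge,
//     a[i] = b[i] + c[i];      // the bottom-tested latch is the only exit,
//                              // and SCEV can compute the trip count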
03644 
03645 static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
03646   if (Ty->isPointerTy())
03647     return DL.getIntPtrType(Ty);
03648 
03649   // It is possible that chars or shorts overflow when we ask for the loop's
03650   // trip count; work around this by widening the type.
03651   if (Ty->getScalarSizeInBits() < 32)
03652     return Type::getInt32Ty(Ty->getContext());
03653 
03654   return Ty;
03655 }
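// For example, an i16 induction type is widened to i32 here so that larger
// trip counts do not overflow the narrow counter, and an i8* pointer type
// maps to the target's pointer-sized integer (e.g. i64 on a 64-bit target).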
03656 
03657 static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
03658   Ty0 = convertPointerToIntegerType(DL, Ty0);
03659   Ty1 = convertPointerToIntegerType(DL, Ty1);
03660   if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
03661     return Ty0;
03662   return Ty1;
03663 }
03664 
03665 /// \brief Check that the instruction has outside loop users and is not an
03666 /// identified reduction variable.
03667 static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
03668                                SmallPtrSetImpl<Value *> &Reductions) {
03669   // Reduction instructions are allowed to have exit users. All other
03670   // instructions must not have external users.
03671   if (!Reductions.count(Inst))
03672     // Check that all of the users of the instruction are inside the loop.
03673     for (User *U : Inst->users()) {
03674       Instruction *UI = cast<Instruction>(U);
03675       // This user may be a reduction exit value.
03676       if (!TheLoop->contains(UI)) {
03677         DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
03678         return true;
03679       }
03680     }
03681   return false;
03682 }
03683 
03684 bool LoopVectorizationLegality::canVectorizeInstrs() {
03685   BasicBlock *PreHeader = TheLoop->getLoopPreheader();
03686   BasicBlock *Header = TheLoop->getHeader();
03687 
03688   // Look for the attribute signaling the absence of NaNs.
03689   Function &F = *Header->getParent();
03690   if (F.hasFnAttribute("no-nans-fp-math"))
03691     HasFunNoNaNAttr = F.getAttributes().getAttribute(
03692       AttributeSet::FunctionIndex,
03693       "no-nans-fp-math").getValueAsString() == "true";
03694 
03695   // For each block in the loop.
03696   for (Loop::block_iterator bb = TheLoop->block_begin(),
03697        be = TheLoop->block_end(); bb != be; ++bb) {
03698 
03699     // Scan the instructions in the block and look for hazards.
03700     for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
03701          ++it) {
03702 
03703       if (PHINode *Phi = dyn_cast<PHINode>(it)) {
03704         Type *PhiTy = Phi->getType();
03705         // Check that this PHI type is allowed.
03706         if (!PhiTy->isIntegerTy() &&
03707             !PhiTy->isFloatingPointTy() &&
03708             !PhiTy->isPointerTy()) {
03709           emitAnalysis(Report(it)
03710                        << "loop control flow is not understood by vectorizer");
03711           DEBUG(dbgs() << "LV: Found a non-int non-pointer PHI.\n");
03712           return false;
03713         }
03714 
03715         // If this PHINode is not in the header block, then we know that we
03716         // can convert it to select during if-conversion. No need to check if
03717         // the PHIs in this block are induction or reduction variables.
03718         if (*bb != Header) {
03719           // Check that this instruction has no outside users or is an
03720           // identified reduction value with an outside user.
03721           if (!hasOutsideLoopUser(TheLoop, it, AllowedExit))
03722             continue;
03723           emitAnalysis(Report(it) << "value could not be identified as "
03724                                      "an induction or reduction variable");
03725           return false;
03726         }
03727 
03728         // We only allow if-converted PHIs with exactly two incoming values.
03729         if (Phi->getNumIncomingValues() != 2) {
03730           emitAnalysis(Report(it)
03731                        << "control flow not understood by vectorizer");
03732           DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
03733           return false;
03734         }
03735 
03736         // This is the value coming from the preheader.
03737         Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
03738         // Check if this is an induction variable.
03739         InductionKind IK = isInductionVariable(Phi);
03740 
03741         if (IK_NoInduction != IK) {
03742           // Get the widest type.
03743           if (!WidestIndTy)
03744             WidestIndTy = convertPointerToIntegerType(*DL, PhiTy);
03745           else
03746             WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy);
03747 
03748           // Int inductions are special because we only allow one IV.
03749           if (IK == IK_IntInduction) {
03750             // Use the phi node with the widest type as induction. Use the last
03751             // one if there are multiple (no good reason for doing this other
03752             // than it is expedient).
03753             if (!Induction || PhiTy == WidestIndTy)
03754               Induction = Phi;
03755           }
03756 
03757           DEBUG(dbgs() << "LV: Found an induction variable.\n");
03758           Inductions[Phi] = InductionInfo(StartValue, IK);
03759 
03760           // Until we explicitly handle the case of an induction variable with
03761           // an outside loop user we have to give up vectorizing this loop.
03762           if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
03763             emitAnalysis(Report(it) << "use of induction value outside of the "
03764                                        "loop is not handled by vectorizer");
03765             return false;
03766           }
03767 
03768           continue;
03769         }
03770 
03771         if (AddReductionVar(Phi, RK_IntegerAdd)) {
03772           DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
03773           continue;
03774         }
03775         if (AddReductionVar(Phi, RK_IntegerMult)) {
03776           DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
03777           continue;
03778         }
03779         if (AddReductionVar(Phi, RK_IntegerOr)) {
03780           DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
03781           continue;
03782         }
03783         if (AddReductionVar(Phi, RK_IntegerAnd)) {
03784           DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
03785           continue;
03786         }
03787         if (AddReductionVar(Phi, RK_IntegerXor)) {
03788           DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
03789           continue;
03790         }
03791         if (AddReductionVar(Phi, RK_IntegerMinMax)) {
03792           DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n");
03793           continue;
03794         }
03795         if (AddReductionVar(Phi, RK_FloatMult)) {
03796           DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n");
03797           continue;
03798         }
03799         if (AddReductionVar(Phi, RK_FloatAdd)) {
03800           DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n");
03801           continue;
03802         }
03803         if (AddReductionVar(Phi, RK_FloatMinMax)) {
03804           DEBUG(dbgs() << "LV: Found a float MINMAX reduction PHI."<< *Phi <<
03805                 "\n");
03806           continue;
03807         }
03808 
03809         emitAnalysis(Report(it) << "value that could not be identified as "
03810                                    "reduction is used outside the loop");
03811         DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
03812         return false;
03813       }// end of PHI handling
03814 
03815       // We don't handle general function calls yet. However, we can ignore dbg
03816       // intrinsic calls and we do handle certain intrinsic and libm functions.
03817       CallInst *CI = dyn_cast<CallInst>(it);
03818       if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
03819         emitAnalysis(Report(it) << "call instruction cannot be vectorized");
03820         DEBUG(dbgs() << "LV: Found a call site.\n");
03821         return false;
03822       }
03823 
03824       // Intrinsics such as powi, cttz and ctlz are legal to vectorize if
03825       // the second argument is loop invariant.
03826       if (CI &&
03827           hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
03828         if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) {
03829           emitAnalysis(Report(it)
03830                        << "intrinsic instruction cannot be vectorized");
03831           DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
03832           return false;
03833         }
03834       }
03835 
03836       // Check that the instruction return type is vectorizable.
03837       // Also, we can't vectorize extractelement instructions.
03838       if ((!VectorType::isValidElementType(it->getType()) &&
03839            !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
03840         emitAnalysis(Report(it)
03841                      << "instruction return type cannot be vectorized");
03842         DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
03843         return false;
03844       }
03845 
03846       // Check that the stored type is vectorizable.
03847       if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
03848         Type *T = ST->getValueOperand()->getType();
03849         if (!VectorType::isValidElementType(T)) {
03850           emitAnalysis(Report(ST) << "store instruction cannot be vectorized");
03851           return false;
03852         }
03853         if (EnableMemAccessVersioning)
03854           collectStridedAcccess(ST);
03855       }
03856 
03857       if (EnableMemAccessVersioning)
03858         if (LoadInst *LI = dyn_cast<LoadInst>(it))
03859           collectStridedAcccess(LI);
03860 
03861       // Reduction instructions are allowed to have exit users.
03862       // All other instructions must not have external users.
03863       if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
03864         emitAnalysis(Report(it) << "value cannot be used outside the loop");
03865         return false;
03866       }
03867 
03868     } // next instr.
03869 
03870   }
03871 
03872   if (!Induction) {
03873     DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
03874     if (Inductions.empty()) {
03875       emitAnalysis(Report()
03876                    << "loop induction variable could not be identified");
03877       return false;
03878     }
03879   }
03880 
03881   return true;
03882 }
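
As a concrete illustration of what the preceding legality scan accepts, consider a hypothetical source loop (the names below are illustrative only, not taken from this file):

    // 'i' becomes the integer induction PHI (IK_IntInduction) and 'sum'
    // forms an add-reduction cycle ending in the header PHI (RK_IntegerAdd).
    int sum_array(const int *a, int n) {
      int sum = 0;
      for (int i = 0; i < n; ++i)
        sum += a[i];
      return sum; // the single permitted out-of-loop user of the reduction
    }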
03883 
03884 ///\brief Strip a GEP all of whose indices except the last are loop invariant
03885 /// and return the induction operand of the gep pointer.
03886 static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE,
03887                                  const DataLayout *DL, Loop *Lp) {
03888   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
03889   if (!GEP)
03890     return Ptr;
03891 
03892   unsigned InductionOperand = getGEPInductionOperand(DL, GEP);
03893 
03894   // Check that all of the gep indices are uniform except for our induction
03895   // operand.
03896   for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
03897     if (i != InductionOperand &&
03898         !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
03899       return Ptr;
03900   return GEP->getOperand(InductionOperand);
03901 }
03902 
03903 ///\brief Look for the unique cast use of \p Ptr with type \p Ty, if any.
03904 static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
03905   Value *UniqueCast = nullptr;
03906   for (User *U : Ptr->users()) {
03907     CastInst *CI = dyn_cast<CastInst>(U);
03908     if (CI && CI->getType() == Ty) {
03909       if (!UniqueCast)
03910         UniqueCast = CI;
03911       else
03912         return nullptr;
03913     }
03914   }
03915   return UniqueCast;
03916 }
03917 
03918 ///\brief Get the stride of a pointer access in a loop.
03919 /// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride
03920 /// Value, or null if none is found.
03921 static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
03922                                    const DataLayout *DL, Loop *Lp) {
03923   const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
03924   if (!PtrTy || PtrTy->isAggregateType())
03925     return nullptr;
03926 
03927   // Try to remove a gep instruction to make the pointer (actually the index
03928   // at this point) easier to analyze. If OrigPtr is equal to Ptr we are
03929   // analyzing the pointer; otherwise we are analyzing the index.
03930   Value *OrigPtr = Ptr;
03931 
03932   // The size of the pointer access.
03933   int64_t PtrAccessSize = 1;
03934 
03935   Ptr = stripGetElementPtr(Ptr, SE, DL, Lp);
03936   const SCEV *V = SE->getSCEV(Ptr);
03937 
03938   if (Ptr != OrigPtr)
03939     // Strip off casts.
03940     while (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V))
03941       V = C->getOperand();
03942 
03943   const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);
03944   if (!S)
03945     return nullptr;
03946 
03947   V = S->getStepRecurrence(*SE);
03948   if (!V)
03949     return nullptr;
03950 
03951   // Strip off the size of access multiplication if we are still analyzing the
03952   // pointer.
03953   if (OrigPtr == Ptr) {
03954     PtrAccessSize = DL->getTypeAllocSize(PtrTy->getElementType());
03955     if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
03956       if (M->getOperand(0)->getSCEVType() != scConstant)
03957         return nullptr;
03958 
03959       const APInt &APStepVal =
03960           cast<SCEVConstant>(M->getOperand(0))->getValue()->getValue();
03961 
03962       // Huge step value - give up.
03963       if (APStepVal.getBitWidth() > 64)
03964         return nullptr;
03965 
03966       int64_t StepVal = APStepVal.getSExtValue();
03967       if (PtrAccessSize != StepVal)
03968         return nullptr;
03969       V = M->getOperand(1);
03970     }
03971   }
03972 
03973   // Strip off casts.
03974   Type *StrippedOffRecurrenceCast = nullptr;
03975   if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) {
03976     StrippedOffRecurrenceCast = C->getType();
03977     V = C->getOperand();
03978   }
03979 
03980   // Look for the loop invariant symbolic value.
03981   const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);
03982   if (!U)
03983     return nullptr;
03984 
03985   Value *Stride = U->getValue();
03986   if (!Lp->isLoopInvariant(Stride))
03987     return nullptr;
03988 
03989   // If we have stripped off the recurrence cast we have to make sure that we
03990   // return the value that is used in this loop so that we can replace it later.
03991   if (StrippedOffRecurrenceCast)
03992     Stride = getUniqueCastUse(Stride, Lp, StrippedOffRecurrenceCast);
03993 
03994   return Stride;
03995 }
03996 
03997 void LoopVectorizationLegality::collectStridedAcccess(Value *MemAccess) {
03998   Value *Ptr = nullptr;
03999   if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
04000     Ptr = LI->getPointerOperand();
04001   else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess))
04002     Ptr = SI->getPointerOperand();
04003   else
04004     return;
04005 
04006   Value *Stride = getStrideFromPointer(Ptr, SE, DL, TheLoop);
04007   if (!Stride)
04008     return;
04009 
04010   DEBUG(dbgs() << "LV: Found a strided access that we can version");
04011   DEBUG(dbgs() << "  Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
04012   Strides[Ptr] = Stride;
04013   StrideSet.insert(Stride);
04014 }
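
As a sketch of the access pattern this versioning targets (hypothetical source, illustrative names):

    // 'stride' is loop invariant but unknown at compile time, so
    // getStrideFromPointer returns it as the symbolic stride.
    void scale(float *a, int n, int stride) {
      for (int i = 0; i < n; ++i)
        a[i * stride] *= 2.0f;
    }

Recording the pointer in Strides and the stride value in StrideSet lets the vectorizer later test stride == 1 at run time and branch to a vectorized version of the loop.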
04015 
04016 void LoopVectorizationLegality::collectLoopUniforms() {
04017   // We now know that the loop is vectorizable!
04018   // Collect variables that will remain uniform after vectorization.
04019   std::vector<Value*> Worklist;
04020   BasicBlock *Latch = TheLoop->getLoopLatch();
04021 
04022   // Start with the conditional branch and walk up the block.
04023   Worklist.push_back(Latch->getTerminator()->getOperand(0));
04024 
04025   // Also add all consecutive pointer values; these values will be uniform
04026   // after vectorization (and subsequent cleanup) and, until revectorization is
04027   // supported, all dependencies must also be uniform.
04028   for (Loop::block_iterator B = TheLoop->block_begin(),
04029        BE = TheLoop->block_end(); B != BE; ++B)
04030     for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end();
04031          I != IE; ++I)
04032       if (I->getType()->isPointerTy() && isConsecutivePtr(I))
04033         Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
04034 
04035   while (!Worklist.empty()) {
04036     Instruction *I = dyn_cast<Instruction>(Worklist.back());
04037     Worklist.pop_back();
04038 
04039     // Look at instructions inside this loop.
04040     // Stop when reaching PHI nodes.
04041     // TODO: we need to follow values all over the loop, not only in this block.
04042     if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
04043       continue;
04044 
04045     // This is a known uniform.
04046     Uniforms.insert(I);
04047 
04048     // Insert all operands.
04049     Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
04050   }
04051 }
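
For intuition, in a loop such as the following (hypothetical source), the latch branch condition and the operands of the consecutive pointers stay scalar after vectorization:

    // 'n', the compare 'i < n', and the pointer operands of the consecutive
    // accesses a[i] and b[i] remain uniform; only the loaded and stored
    // values are widened into vector lanes.
    for (int i = 0; i < n; ++i)
      a[i] = b[i] + 42;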
04052 
04053 namespace {
04054 /// \brief Analyses memory accesses in a loop.
04055 ///
04056 /// Checks whether run time pointer checks are needed and builds sets for data
04057 /// dependence checking.
04058 class AccessAnalysis {
04059 public:
04060   /// \brief Read or write access location.
04061   typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
04062   typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
04063 
04064   /// \brief Set of potential dependent memory accesses.
04065   typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
04066 
04067   AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) :
04068     DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {}
04069 
04070   /// \brief Register a load and whether the pointer is only read from.
04071   void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) {
04072     Value *Ptr = const_cast<Value*>(Loc.Ptr);
04073     AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
04074     Accesses.insert(MemAccessInfo(Ptr, false));
04075     if (IsReadOnly)
04076       ReadOnlyPtr.insert(Ptr);
04077   }
04078 
04079   /// \brief Register a store.
04080   void addStore(AliasAnalysis::Location &Loc) {
04081     Value *Ptr = const_cast<Value*>(Loc.Ptr);
04082     AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
04083     Accesses.insert(MemAccessInfo(Ptr, true));
04084   }
04085 
04086   /// \brief Check whether we can check the pointers at runtime for
04087   /// non-intersection.
04088   bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
04089                        unsigned &NumComparisons, ScalarEvolution *SE,
04090                        Loop *TheLoop, ValueToValueMap &Strides,
04091                        bool ShouldCheckStride = false);
04092 
04093   /// \brief Goes over all memory accesses, checks whether an RT check is needed
04094   /// and builds sets of dependent accesses.
04095   void buildDependenceSets() {
04096     processMemAccesses();
04097   }
04098 
04099   bool isRTCheckNeeded() { return IsRTCheckNeeded; }
04100 
04101   bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
04102   void resetDepChecks() { CheckDeps.clear(); }
04103 
04104   MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }
04105 
04106 private:
04107   typedef SetVector<MemAccessInfo> PtrAccessSet;
04108 
04109   /// \brief Go over all memory accesses and check whether runtime pointer
04110   /// checks are needed and build sets of dependency check candidates.
04111   void processMemAccesses();
04112 
04113   /// Set of all accesses.
04114   PtrAccessSet Accesses;
04115 
04116   /// Set of accesses that need a further dependence check.
04117   MemAccessInfoSet CheckDeps;
04118 
04119   /// Set of pointers that are read only.
04120   SmallPtrSet<Value*, 16> ReadOnlyPtr;
04121 
04122   const DataLayout *DL;
04123 
04124   /// An alias set tracker to partition the access set by underlying object
04125   /// and intrinsic property (such as TBAA metadata).
04126   AliasSetTracker AST;
04127 
04128   /// Sets of potentially dependent accesses - members of one set share an
04129   /// underlying pointer. The set "CheckDeps" identifies which sets really need a
04130   /// dependence check.
04131   DepCandidates &DepCands;
04132 
04133   bool IsRTCheckNeeded;
04134 };
04135 
04136 } // end anonymous namespace
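
The intended call sequence for AccessAnalysis, mirroring how canVectorizeMemory below drives it (a minimal sketch; the surrounding loops over instructions and all error handling are omitted):

    AccessAnalysis::DepCandidates DepCands;
    AccessAnalysis Accesses(DL, AA, DepCands);
    Accesses.addStore(StoreLoc);              // register every write first,
    Accesses.addLoad(LoadLoc, IsReadOnlyPtr); // then every read
    Accesses.buildDependenceSets();           // partition into dependence sets
    if (Accesses.isRTCheckNeeded())
      bool CanDoRT = Accesses.canCheckPtrAtRT(RtCheck, NumComparisons, SE,
                                              TheLoop, Strides);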
04137 
04138 /// \brief Check whether a pointer can participate in a runtime bounds check.
04139 static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides,
04140                                 Value *Ptr) {
04141   const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
04142   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
04143   if (!AR)
04144     return false;
04145 
04146   return AR->isAffine();
04147 }
04148 
04149 /// \brief Check the stride of the pointer and ensure that it does not wrap in
04150 /// the address space.
04151 static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
04152                         const Loop *Lp, ValueToValueMap &StridesMap);
04153 
04154 bool AccessAnalysis::canCheckPtrAtRT(
04155     LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
04156     unsigned &NumComparisons, ScalarEvolution *SE, Loop *TheLoop,
04157     ValueToValueMap &StridesMap, bool ShouldCheckStride) {
04158   // Find pointers with computable bounds. We are going to use this information
04159   // to place a runtime bound check.
04160   bool CanDoRT = true;
04161 
04162   bool IsDepCheckNeeded = isDependencyCheckNeeded();
04163   NumComparisons = 0;
04164 
04165   // We assign a consecutive id to accesses from different alias sets.
04166   // Accesses between different groups don't need to be checked.
04167   unsigned ASId = 1;
04168   for (auto &AS : AST) {
04169     unsigned NumReadPtrChecks = 0;
04170     unsigned NumWritePtrChecks = 0;
04171 
04172     // We assign a consecutive id to accesses from different dependence sets.
04173     // Accesses within the same set don't need a runtime check.
04174     unsigned RunningDepId = 1;
04175     DenseMap<Value *, unsigned> DepSetId;
04176 
04177     for (auto A : AS) {
04178       Value *Ptr = A.getValue();
04179       bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
04180       MemAccessInfo Access(Ptr, IsWrite);
04181 
04182       if (IsWrite)
04183         ++NumWritePtrChecks;
04184       else
04185         ++NumReadPtrChecks;
04186 
04187       if (hasComputableBounds(SE, StridesMap, Ptr) &&
04188           // When we run after a failing dependency check we have to make sure we
04189           // don't have wrapping pointers.
04190           (!ShouldCheckStride ||
04191            isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) {
04192         // The id of the dependence set.
04193         unsigned DepId;
04194 
04195         if (IsDepCheckNeeded) {
04196           Value *Leader = DepCands.getLeaderValue(Access).getPointer();
04197           unsigned &LeaderId = DepSetId[Leader];
04198           if (!LeaderId)
04199             LeaderId = RunningDepId++;
04200           DepId = LeaderId;
04201         } else
04202           // Each access has its own dependence set.
04203           DepId = RunningDepId++;
04204 
04205         RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap);
04206 
04207         DEBUG(dbgs() << "LV: Found a runtime check ptr: " << *Ptr << '\n');
04208       } else {
04209         CanDoRT = false;
04210       }
04211     }
04212 
04213     if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
04214       NumComparisons += 0; // Only one dependence set.
04215     else {
04216       NumComparisons += (NumWritePtrChecks * (NumReadPtrChecks +
04217                                               NumWritePtrChecks - 1));
04218     }
04219 
04220     ++ASId;
04221   }
04222 
04223   // If the pointers that we would use for the bounds comparison have different
04224   // address spaces, assume the values aren't directly comparable, so we can't
04225   // use them for the runtime check. We also have to assume they could
04226   // overlap. In the future there should be metadata for whether address spaces
04227   // are disjoint.
04228   unsigned NumPointers = RtCheck.Pointers.size();
04229   for (unsigned i = 0; i < NumPointers; ++i) {
04230     for (unsigned j = i + 1; j < NumPointers; ++j) {
04231       // Only need to check pointers between two different dependency sets.
04232       if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j])
04233         continue;
04234       // Only need to check pointers in the same alias set.
04235       if (RtCheck.AliasSetId[i] != RtCheck.AliasSetId[j])
04236         continue;
04237 
04238       Value *PtrI = RtCheck.Pointers[i];
04239       Value *PtrJ = RtCheck.Pointers[j];
04240 
04241       unsigned ASi = PtrI->getType()->getPointerAddressSpace();
04242       unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
04243       if (ASi != ASj) {
04244         DEBUG(dbgs() << "LV: Runtime check would require comparison between"
04245                        " different address spaces\n");
04246         return false;
04247       }
04248     }
04249   }
04250 
04251   return CanDoRT;
04252 }
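
As a worked example of the comparison count: an alias set holding two write pointers and three read pointers that fall into distinct dependence sets contributes

    NumWritePtrChecks * (NumReadPtrChecks + NumWritePtrChecks - 1)
      = 2 * (3 + 2 - 1) = 8

comparisons, whereas a set whose accesses all end up in a single dependence set (RunningDepId == 2 after the scan) contributes none.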
04253 
04254 void AccessAnalysis::processMemAccesses() {
04255   // We process the set twice: first we process read-write pointers, last we
04256   // process read-only pointers. This allows us to skip dependence tests for
04257   // read-only pointers.
04258 
04259   DEBUG(dbgs() << "LV: Processing memory accesses...\n");
04260   DEBUG(dbgs() << "  AST: "; AST.dump());
04261   DEBUG(dbgs() << "LV:   Accesses:\n");
04262   DEBUG({
04263     for (auto A : Accesses)
04264       dbgs() << "\t" << *A.getPointer() << " (" <<
04265                 (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ?
04266                                          "read-only" : "read")) << ")\n";
04267   });
04268 
04269   // The AliasSetTracker has nicely partitioned our pointers by metadata
04270   // compatibility and potential for underlying-object overlap. As a result, we
04271   // only need to check for potential pointer dependencies within each alias
04272   // set.
04273   for (auto &AS : AST) {
04274     // Note that both the alias-set tracker and the alias sets themselves use
04275     // linked lists internally and so the iteration order here is deterministic
04276     // (matching the original instruction order within each set).
04277 
04278     bool SetHasWrite = false;
04279 
04280     // Map of pointers to last access encountered.
04281     typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap;
04282     UnderlyingObjToAccessMap ObjToLastAccess;
04283 
04284     // Set of accesses to check after all writes have been processed.
04285     PtrAccessSet DeferredAccesses;
04286 
04287     // Iterate over each alias set twice, once to process read/write pointers,
04288     // and then to process read-only pointers.
04289     for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
04290       bool UseDeferred = SetIteration > 0;
04291       PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
04292 
04293       for (auto A : AS) {
04294         Value *Ptr = A.getValue();
04295         bool IsWrite = S.count(MemAccessInfo(Ptr, true));
04296 
04297         // If we're using the deferred access set, then it contains only reads.
04298         bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
04299         if (UseDeferred && !IsReadOnlyPtr)
04300           continue;
04301         // Otherwise, the pointer must be in the PtrAccessSet, either as a read
04302         // or a write.
04303         assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
04304                  S.count(MemAccessInfo(Ptr, false))) &&
04305                "Alias-set pointer not in the access set?");
04306 
04307         MemAccessInfo Access(Ptr, IsWrite);
04308         DepCands.insert(Access);
04309 
04310         // Memorize read-only pointers for later processing and skip them in the
04311         // first round (they need to be checked after we have seen all write
04312         // pointers). Note: we also mark pointers that are not consecutive as
04313         // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need
04314         // the second check for "!IsWrite".
04315         if (!UseDeferred && IsReadOnlyPtr) {
04316           DeferredAccesses.insert(Access);
04317           continue;
04318         }
04319 
04320         // If this is a write - check other reads and writes for conflicts.  If
04321         // this is a read only check other writes for conflicts (but only if
04322         // there is no other write to the ptr - this is an optimization to
04323         // catch "a[i] = a[i] + " without having to do a dependence check).
04324         if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
04325           CheckDeps.insert(Access);
04326           IsRTCheckNeeded = true;
04327         }
04328 
04329         if (IsWrite)
04330           SetHasWrite = true;
04331 
04332         // Create sets of pointers connected by a shared alias set and
04333         // underlying object.
04334         typedef SmallVector<Value *, 16> ValueVector;
04335         ValueVector TempObjects;
04336         GetUnderlyingObjects(Ptr, TempObjects, DL);
04337         for (Value *UnderlyingObj : TempObjects) {
04338           UnderlyingObjToAccessMap::iterator Prev =
04339             ObjToLastAccess.find(UnderlyingObj);
04340           if (Prev != ObjToLastAccess.end())
04341             DepCands.unionSets(Access, Prev->second);
04342 
04343           ObjToLastAccess[UnderlyingObj] = Access;
04344         }
04345       }
04346     }
04347   }
04348 }
04349 
04350 namespace {
04351 /// \brief Checks memory dependences among accesses to the same underlying
04352 /// object to determine whether vectorization is legal or not (and at
04353 /// which vectorization factor).
04354 ///
04355 /// This class works under the assumption that we already checked that memory
04356 /// locations with different underlying pointers are "must-not alias".
04357 /// We use the ScalarEvolution framework to symbolically evaluate access
04358 /// functions pairs. Since we currently don't restructure the loop we can rely
04359 /// on the program order of memory accesses to determine their safety.
04360 /// At the moment we will only deem accesses as safe for:
04361 ///  * A negative constant distance assuming program order.
04362 ///
04363 ///      Safe: tmp = a[i + 1];     OR     a[i + 1] = x;
04364 ///            a[i] = tmp;                y = a[i];
04365 ///
04366 ///   The latter case is safe because later checks guarantee that there can't
04367 ///   be a cycle through a phi node (that is, we check that "x" and "y" are not
04368 ///   the same variable: a header phi can only be an induction or a reduction, a
04369 ///   reduction can't have a memory sink, an induction can't have a memory
04370 ///   source). This is important and must not be violated (or we have to
04371 ///   resort to checking for cycles through memory).
04372 ///
04373 ///  * A positive constant distance assuming program order that is bigger
04374 ///    than the biggest memory access.
04375 ///
04376 ///     tmp = a[i]        OR              b[i] = x
04377 ///     a[i+2] = tmp                      y = b[i+2];
04378 ///
04379 ///     Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively.
04380 ///
04381 ///  * Zero distances and all accesses have the same size.
04382 ///
04383 class MemoryDepChecker {
04384 public:
04385   typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
04386   typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
04387 
04388   MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L)
04389       : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
04390         ShouldRetryWithRuntimeCheck(false) {}
04391 
04392   /// \brief Register the location (instructions are given increasing numbers)
04393   /// of a write access.
04394   void addAccess(StoreInst *SI) {
04395     Value *Ptr = SI->getPointerOperand();
04396     Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
04397     InstMap.push_back(SI);
04398     ++AccessIdx;
04399   }
04400 
04401   /// \brief Register the location (instructions are given increasing numbers)
04402   /// of a read access.
04403   void addAccess(LoadInst *LI) {
04404     Value *Ptr = LI->getPointerOperand();
04405     Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
04406     InstMap.push_back(LI);
04407     ++AccessIdx;
04408   }
04409 
04410   /// \brief Check whether the dependencies between the accesses are safe.
04411   ///
04412   /// Only checks sets with elements in \p CheckDeps.
04413   bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
04414                    MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides);
04415 
04416   /// \brief The maximum number of bytes of a vector register we can vectorize
04417   /// the accesses safely with.
04418   unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
04419 
04420   /// \brief In some cases when the dependency check fails we can still
04421   /// vectorize the loop with a dynamic array access check.
04422   bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; }
04423 
04424 private:
04425   ScalarEvolution *SE;
04426   const DataLayout *DL;
04427   const Loop *InnermostLoop;
04428 
04429   /// \brief Maps access locations (ptr, read/write) to program order.
04430   DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses;
04431 
04432   /// \brief Memory access instructions in program order.
04433   SmallVector<Instruction *, 16> InstMap;
04434 
04435   /// \brief The program order index to be used for the next instruction.
04436   unsigned AccessIdx;
04437 
04438   // We can access this many bytes in parallel safely.
04439   unsigned MaxSafeDepDistBytes;
04440 
04441   /// \brief If we see a non-constant dependence distance we can still try to
04442   /// vectorize this loop with runtime checks.
04443   bool ShouldRetryWithRuntimeCheck;
04444 
04445   /// \brief Check whether there is a plausible dependence between the two
04446   /// accesses.
04447   ///
04448   /// Access \p A must happen before \p B in program order. The two indices
04449   /// identify the index into the program order map.
04450   ///
04451   /// This function checks whether there is a plausible dependence (or the
04452   /// absence of such can't be proved) between the two accesses. If there is a
04453   /// plausible dependence but the dependence distance is bigger than one
04454   /// element access it records this distance in \p MaxSafeDepDistBytes (if this
04455   /// distance is smaller than any other distance encountered so far).
04456   /// Otherwise, this function returns true signaling a possible dependence.
04457   bool isDependent(const MemAccessInfo &A, unsigned AIdx,
04458                    const MemAccessInfo &B, unsigned BIdx,
04459                    ValueToValueMap &Strides);
04460 
04461   /// \brief Check whether the data dependence could prevent store-load
04462   /// forwarding.
04463   bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
04464 };
04465 
04466 } // end anonymous namespace
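
Two hypothetical loops that land on opposite sides of these rules (illustrative source; int is assumed to be 4 bytes):

    for (int i = 0; i < n - 2; ++i)
      a[i + 2] = a[i] + 1; // positive distance of 2 * 4 = 8 bytes: safe
                           // only up to 8-byte vectors, i.e. VF <= 2

    for (int i = 2; i < n; ++i)
      a[i - 2] = a[i] + 1; // negative distance assuming program order:
                           // treated as no dependence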
04467 
04468 static bool isInBoundsGep(Value *Ptr) {
04469   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
04470     return GEP->isInBounds();
04471   return false;
04472 }
04473 
04474 /// \brief Check whether the access through \p Ptr has a constant stride.
04475 static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
04476                         const Loop *Lp, ValueToValueMap &StridesMap) {
04477   const Type *Ty = Ptr->getType();
04478   assert(Ty->isPointerTy() && "Unexpected non-ptr");
04479 
04480   // Make sure that the pointer does not point to aggregate types.
04481   const PointerType *PtrTy = cast<PointerType>(Ty);
04482   if (PtrTy->getElementType()->isAggregateType()) {
04483     DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type " <<
04484           *Ptr << "\n");
04485     return 0;
04486   }
04487 
04488   const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Ptr);
04489 
04490   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
04491   if (!AR) {
04492     DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer "
04493           << *Ptr << " SCEV: " << *PtrScev << "\n");
04494     return 0;
04495   }
04496 
04497   // The access function must stride over the innermost loop.
04498   if (Lp != AR->getLoop()) {
04499     DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop " <<
04500           *Ptr << " SCEV: " << *PtrScev << "\n");
04501     return 0;
04502   }
04503   // The address calculation must not wrap. Otherwise, a dependence could be
04504   // inverted.
04505   // An inbounds getelementptr that is an AddRec with a unit stride
04506   // cannot wrap by definition. The unit stride requirement is checked later.
04507   // A getelementptr without an inbounds attribute and unit stride would have
04508   // to access the pointer value "0" which is undefined behavior in address
04509   // space 0, therefore we can also vectorize this case.
04510   bool IsInBoundsGEP = isInBoundsGep(Ptr);
04511   bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask);
04512   bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0;
04513   if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) {
04514     DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space "
04515           << *Ptr << " SCEV: " << *PtrScev << "\n");
04516     return 0;
04517   }
04518 
04519   // Check the step is constant.
04520   const SCEV *Step = AR->getStepRecurrence(*SE);
04521 
04522   // Calculate the pointer stride and check if it is consecutive.
04523   const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
04524   if (!C) {
04525     DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr <<
04526           " SCEV: " << *PtrScev << "\n");
04527     return 0;
04528   }
04529 
04530   int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType());
04531   const APInt &APStepVal = C->getValue()->getValue();
04532 
04533   // Huge step value - give up.
04534   if (APStepVal.getBitWidth() > 64)
04535     return 0;
04536 
04537   int64_t StepVal = APStepVal.getSExtValue();
04538 
04539   // Strided access.
04540   int64_t Stride = StepVal / Size;
04541   int64_t Rem = StepVal % Size;
04542   if (Rem)
04543     return 0;
04544 
04545   // If the SCEV could wrap but we have an inbounds gep with a unit stride we
04546   // know we can't "wrap around the address space". In case of address space
04547   // zero we know that this won't happen without triggering undefined behavior.
04548   if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) &&
04549       Stride != 1 && Stride != -1)
04550     return 0;
04551 
04552   return Stride;
04553 }
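
Worked example, assuming 8-byte int64_t elements: for a hypothetical access p[3 * i] the AddRec step of the pointer is 24 bytes, so StepVal = 24, Size = 8, Stride = 24 / 8 = 3 and Rem = 0; isStridedPtr returns 3. The consecutive access p[i] returns 1, and any access whose step is not a compile-time constant returns 0.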
04554 
04555 bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance,
04556                                                     unsigned TypeByteSize) {
04557   // If loads occur at a distance that is not a multiple of a feasible vector
04558   // factor store-load forwarding does not take place.
04559   // Positive dependences might cause troubles because vectorizing them might
04560   // prevent store-load forwarding making vectorized code run a lot slower.
04561   //   a[i] = a[i-3] ^ a[i-8];
04562   //   The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and
04563   //   hence on your typical architecture store-load forwarding does not take
04564   //   place. Vectorizing in such cases does not make sense.
04565   // Store-load forwarding distance.
04566   const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize;
04567   // Maximum vector factor.
04568   unsigned MaxVFWithoutSLForwardIssues = MaxVectorWidth*TypeByteSize;
04569   if (MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues)
04570     MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes;
04571 
04572   for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues;
04573        vf *= 2) {
04574     if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) {
04575       MaxVFWithoutSLForwardIssues = vf >> 1;
04576       break;
04577     }
04578   }
04579 
04580   if (MaxVFWithoutSLForwardIssues < 2*TypeByteSize) {
04581     DEBUG(dbgs() << "LV: Distance " << Distance <<
04582           " could cause a store-load forwarding conflict\n");
04583     return true;
04584   }
04585 
04586   if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
04587       MaxVFWithoutSLForwardIssues != MaxVectorWidth*TypeByteSize)
04588     MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
04589   return false;
04590 }
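
Worked example, assuming TypeByteSize = 4 and a dependence distance of 12 bytes (the a[i] = a[i-3] ... case above): NumCyclesForStoreLoadThroughMemory = 32 and the search starts at vf = 8; since 12 % 8 = 4 is non-zero and 12 / 8 = 1 < 32, MaxVFWithoutSLForwardIssues drops to 4, which is below 2 * TypeByteSize = 8, so the function reports a potential store-load forwarding conflict.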
04591 
04592 bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
04593                                    const MemAccessInfo &B, unsigned BIdx,
04594                                    ValueToValueMap &Strides) {
04595   assert (AIdx < BIdx && "Must pass arguments in program order");
04596 
04597   Value *APtr = A.getPointer();
04598   Value *BPtr = B.getPointer();
04599   bool AIsWrite = A.getInt();
04600   bool BIsWrite = B.getInt();
04601 
04602   // Two reads are independent.
04603   if (!AIsWrite && !BIsWrite)
04604     return false;
04605 
04606   // We cannot check pointers in different address spaces.
04607   if (APtr->getType()->getPointerAddressSpace() !=
04608       BPtr->getType()->getPointerAddressSpace())
04609     return true;
04610 
04611   const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr);
04612   const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr);
04613 
04614   int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop, Strides);
04615   int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop, Strides);
04616 
04617   const SCEV *Src = AScev;
04618   const SCEV *Sink = BScev;
04619 
04620   // If the induction step is negative we have to invert source and sink of the
04621   // dependence.
04622   if (StrideAPtr < 0) {
04625     std::swap(APtr, BPtr);
04626     std::swap(Src, Sink);
04627     std::swap(AIsWrite, BIsWrite);
04628     std::swap(AIdx, BIdx);
04629     std::swap(StrideAPtr, StrideBPtr);
04630   }
04631 
04632   const SCEV *Dist = SE->getMinusSCEV(Sink, Src);
04633 
04634   DEBUG(dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink
04635         << "(Induction step: " << StrideAPtr <<  ")\n");
04636   DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to "
04637         << *InstMap[BIdx] << ": " << *Dist << "\n");
04638 
04639   // Need consecutive accesses. We don't want to vectorize
04640   // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
04641   // the address space.
04642   if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) {
04643     DEBUG(dbgs() << "LV: Non-consecutive pointer access\n");
04644     return true;
04645   }
04646 
04647   const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
04648   if (!C) {
04649     DEBUG(dbgs() << "LV: Dependence because of non-constant distance\n");
04650     ShouldRetryWithRuntimeCheck = true;
04651     return true;
04652   }
04653 
04654   Type *ATy = APtr->getType()->getPointerElementType();
04655   Type *BTy = BPtr->getType()->getPointerElementType();
04656   unsigned TypeByteSize = DL->getTypeAllocSize(ATy);
04657 
04658   // Negative distances are not plausible dependencies.
04659   const APInt &Val = C->getValue()->getValue();
04660   if (Val.isNegative()) {
04661     bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
04662     if (IsTrueDataDependence &&
04663         (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) ||
04664          ATy != BTy))
04665       return true;
04666 
04667     DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n");
04668     return false;
04669   }
04670 
04671   // Write to the same location with the same size.
04672   // Could be improved to assert type sizes are the same (i32 == float, etc).
04673   if (Val == 0) {
04674     if (ATy == BTy)
04675       return false;
04676     DEBUG(dbgs() << "LV: Zero dependence difference but different types\n");
04677     return true;
04678   }
04679 
04680   assert(Val.isStrictlyPositive() && "Expect a positive value");
04681 
04682   // Positive distance with different element types: not a plausible dependence.
04683   if (ATy != BTy) {
04684     DEBUG(dbgs() <<
04685           "LV: ReadWrite-Write positive dependency with different types\n");
04686     return false;
04687   }
04688 
04689   unsigned Distance = (unsigned) Val.getZExtValue();
04690 
04691   // Bail out early if passed-in parameters make vectorization not feasible.
04692   unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1;
04693   unsigned ForcedUnroll = VectorizationInterleave ? VectorizationInterleave : 1;
04694 
04695   // The distance must be bigger than the size needed for a vectorized version
04696   // of the operation and the size of the vectorized operation must not be
04697   // bigger than the current maximum size.
04698   if (Distance < 2*TypeByteSize ||
04699       2*TypeByteSize > MaxSafeDepDistBytes ||
04700       Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
04701     DEBUG(dbgs() << "LV: Failure because of Positive distance "
04702         << Val.getSExtValue() << '\n');
04703     return true;
04704   }
04705 
04706   MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ?
04707     Distance : MaxSafeDepDistBytes;
04708 
04709   bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
04710   if (IsTrueDataDependence &&
04711       couldPreventStoreLoadForward(Distance, TypeByteSize))
04712     return true;
04713 
04714   DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() <<
04715         " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n');
04716 
04717   return false;
04718 }
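
Numeric example, assuming 4-byte elements and no forced vectorization factor or interleave count: a constant distance of two elements gives Distance = 8; none of the bail-out conditions fire (8 >= 2 * TypeByteSize and 8 >= TypeByteSize * 1 * 1), so the pair is accepted and MaxSafeDepDistBytes becomes 8, i.e. a maximum safe VF of two 4-byte lanes. A distance of one element (Distance = 4 < 8) is rejected as a dependence.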
04719 
04720 bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
04721                                    MemAccessInfoSet &CheckDeps,
04722                                    ValueToValueMap &Strides) {
04723 
04724   MaxSafeDepDistBytes = -1U;
04725   while (!CheckDeps.empty()) {
04726     MemAccessInfo CurAccess = *CheckDeps.begin();
04727 
04728     // Get the relevant memory access set.
04729     EquivalenceClasses<MemAccessInfo>::iterator I =
04730       AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
04731 
04732     // Check accesses within this set.
04733     EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE;
04734     AI = AccessSets.member_begin(I), AE = AccessSets.member_end();
04735 
04736     // Check every access pair.
04737     while (AI != AE) {
04738       CheckDeps.erase(*AI);
04739       EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI);
04740       while (OI != AE) {
04741         // Check every accessing instruction pair in program order.
04742         for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
04743              I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
04744           for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
04745                I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
04746             if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2, Strides))
04747               return false;
04748             if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1, Strides))
04749               return false;
04750           }
04751         ++OI;
04752       }
04753       ++AI;
04754     }
04755   }
04756   return true;
04757 }
04758 
04759 bool LoopVectorizationLegality::canVectorizeMemory() {
04760 
04761   typedef SmallVector<Value*, 16> ValueVector;
04762   typedef SmallPtrSet<Value*, 16> ValueSet;
04763 
04764   // Holds the Load and Store *instructions*.
04765   ValueVector Loads;
04766   ValueVector Stores;
04767 
04768   // Counters for the different kinds of accesses in the loop.
04769   unsigned NumReads = 0;
04770   unsigned NumReadWrites = 0;
04771 
04772   PtrRtCheck.Pointers.clear();
04773   PtrRtCheck.Need = false;
04774 
04775   const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
04776   MemoryDepChecker DepChecker(SE, DL, TheLoop);
04777 
04778   // For each block.
04779   for (Loop::block_iterator bb = TheLoop->block_begin(),
04780        be = TheLoop->block_end(); bb != be; ++bb) {
04781 
04782     // Scan the BB and collect legal loads and stores.
04783     for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
04784          ++it) {
04785 
04786       // If this is a load, save it. If this instruction can read from memory
04787       // but is not a load, then we quit. Notice that we don't handle function
04788       // calls that read or write.
04789       if (it->mayReadFromMemory()) {
04790         // Many math library functions read the rounding mode. We will only
04791         // vectorize a loop if it contains known function calls that don't set
04792         // the flag. Therefore, it is safe to ignore this read from memory.
04793         CallInst *Call = dyn_cast<CallInst>(it);
04794         if (Call && getIntrinsicIDForCall(Call, TLI))
04795           continue;
04796 
04797         LoadInst *Ld = dyn_cast<LoadInst>(it);
04798         if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
04799           emitAnalysis(Report(Ld)
04800                        << "read with atomic ordering or volatile read");
04801           DEBUG(dbgs() << "LV: Found a non-simple load.\n");
04802           return false;
04803         }
04804         NumLoads++;
04805         Loads.push_back(Ld);
04806         DepChecker.addAccess(Ld);
04807         continue;
04808       }
04809 
04810       // Save 'store' instructions. Abort if other instructions write to memory.
04811       if (it->mayWriteToMemory()) {
04812         StoreInst *St = dyn_cast<StoreInst>(it);
04813         if (!St) {
04814           emitAnalysis(Report(it) << "instruction cannot be vectorized");
04815           return false;
04816         }
04817         if (!St->isSimple() && !IsAnnotatedParallel) {
04818           emitAnalysis(Report(St)
04819                        << "write with atomic ordering or volatile write");
04820           DEBUG(dbgs() << "LV: Found a non-simple store.\n");
04821           return false;
04822         }
04823         NumStores++;
04824         Stores.push_back(St);
04825         DepChecker.addAccess(St);
04826       }
04827     } // Next instr.
04828   } // Next block.
04829 
04830   // Now we have two lists that hold the loads and the stores.
04831   // Next, we find the pointers that they use.
04832 
04833   // Check if we see any stores. If there are no stores, then we don't
04834   // care if the pointers are *restrict*.
04835   if (Stores.empty()) {
04836     DEBUG(dbgs() << "LV: Found a read-only loop!\n");
04837     return true;
04838   }
04839 
04840   AccessAnalysis::DepCandidates DependentAccesses;
04841   AccessAnalysis Accesses(DL, AA, DependentAccesses);
04842 
04843   // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
04844   // multiple times on the same object. If the ptr is accessed twice, once
04845   // for read and once for write, it will only appear once (on the write
04846   // list). This is okay, since we are going to check for conflicts between
04847   // writes and between reads and writes, but not between reads and reads.
04848   ValueSet Seen;
04849 
04850   ValueVector::iterator I, IE;
04851   for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
04852     StoreInst *ST = cast<StoreInst>(*I);
04853     Value* Ptr = ST->getPointerOperand();
04854 
04855     if (isUniform(Ptr)) {
04856       emitAnalysis(
04857           Report(ST)
04858           << "write to a loop invariant address could not be vectorized");
04859       DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
04860       return false;
04861     }
04862 
04863     // If we did *not* see this pointer before, insert it into the read-write
04864     // list. At this phase it is only a 'write' list.
04865     if (Seen.insert(Ptr).second) {
04866       ++NumReadWrites;
04867 
04868       AliasAnalysis::Location Loc = AA->getLocation(ST);
04869       // The TBAA metadata could have a control dependency on the predication
04870       // condition, so we cannot rely on it when determining whether or not we
04871       // need runtime pointer checks.
04872       if (blockNeedsPredication(ST->getParent()))
04873         Loc.AATags.TBAA = nullptr;
04874 
04875       Accesses.addStore(Loc);
04876     }
04877   }
04878 
04879   if (IsAnnotatedParallel) {
04880     DEBUG(dbgs()
04881           << "LV: Loop is annotated parallel, ignoring memory dependency "
04882           << "checks.\n");
04883     return true;
04884   }
04885 
04886   for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
04887     LoadInst *LD = cast<LoadInst>(*I);
04888     Value* Ptr = LD->getPointerOperand();
04889     // If we did *not* see this pointer before, insert it into the
04890     // read list. If we *did* see it before, then it is already in
04891     // the read-write list. This allows us to vectorize expressions
04892     // such as A[i] += x, because the address of A[i] is a read-write
04893     // pointer. This only works if the index of A[i] is consecutive.
04894     // If the index is unknown (for example A[B[i]]) then we may
04895     // read a few words, modify, and write a few words, and some of the
04896     // words may be written to the same address.
04897     bool IsReadOnlyPtr = false;
04898     if (Seen.insert(Ptr).second ||
04899         !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) {
04900       ++NumReads;
04901       IsReadOnlyPtr = true;
04902     }
04903 
04904     AliasAnalysis::Location Loc = AA->getLocation(LD);
04905     // The TBAA metadata could have a control dependency on the predication
04906     // condition, so we cannot rely on it when determining whether or not we
04907     // need runtime pointer checks.
04908     if (blockNeedsPredication(LD->getParent()))
04909       Loc.AATags.TBAA = nullptr;
04910 
04911     Accesses.addLoad(Loc, IsReadOnlyPtr);
04912   }
04913 
04914   // If we write (or read-write) to a single destination and there are no
04915   // other reads in this loop then it is safe to vectorize.
04916   if (NumReadWrites == 1 && NumReads == 0) {
04917     DEBUG(dbgs() << "LV: Found a write-only loop!\n");
04918     return true;
04919   }
04920 
04921   // Build dependence sets and check whether we need a runtime pointer bounds
04922   // check.
04923   Accesses.buildDependenceSets();
04924   bool NeedRTCheck = Accesses.isRTCheckNeeded();
04925 
04926   // Find pointers with computable bounds. We are going to use this information
04927   // to place a runtime bound check.
04928   unsigned NumComparisons = 0;
04929   bool CanDoRT = false;
04930   if (NeedRTCheck)
04931     CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop,
04932                                        Strides);
04933 
04934   DEBUG(dbgs() << "LV: We need to do " << NumComparisons <<
04935         " pointer comparisons.\n");
04936 
04937   // If there is only one set of dependences to check pointers among, we
04938   // don't need a runtime check.
04939   if (NumComparisons == 0 && NeedRTCheck)
04940     NeedRTCheck = false;
04941 
04942   // Check that we did not collect too many pointers or find an unsizeable
04943   // pointer.
04944   if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
04945     PtrRtCheck.reset();
04946     CanDoRT = false;
04947   }
04948 
04949   if (CanDoRT) {
04950     DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
04951   }
04952 
04953   if (NeedRTCheck && !CanDoRT) {
04954     emitAnalysis(Report() << "cannot identify array bounds");
04955     DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
04956           "the array bounds.\n");
04957     PtrRtCheck.reset();
04958     return false;
04959   }
04960 
04961   PtrRtCheck.Need = NeedRTCheck;
04962 
04963   bool CanVecMem = true;
04964   if (Accesses.isDependencyCheckNeeded()) {
04965     DEBUG(dbgs() << "LV: Checking memory dependencies\n");
04966     CanVecMem = DepChecker.areDepsSafe(
04967         DependentAccesses, Accesses.getDependenciesToCheck(), Strides);
04968     MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();
04969 
04970     if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) {
04971       DEBUG(dbgs() << "LV: Retrying with memory checks\n");
04972       NeedRTCheck = true;
04973 
04974       // Clear the dependency checks. We assume they are not needed.
04975       Accesses.resetDepChecks();
04976 
04977       PtrRtCheck.reset();
04978       PtrRtCheck.Need = true;
04979 
04980       CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE,
04981                                          TheLoop, Strides, true);
04982       // Check that we did not collect too many pointers or find an unsizeable
04983       // pointer.
04984       if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
04985         if (!CanDoRT && NumComparisons > 0)
04986           emitAnalysis(Report()
04987                        << "cannot check memory dependencies at runtime");
04988         else
04989           emitAnalysis(Report()
04990                        << NumComparisons << " exceeds limit of "
04991                        << RuntimeMemoryCheckThreshold
04992                        << " dependent memory operations checked at runtime");
04993         DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n");
04994         PtrRtCheck.reset();
04995         return false;
04996       }
04997 
04998       CanVecMem = true;
04999     }
05000   }
05001 
05002   if (!CanVecMem)
05003     emitAnalysis(Report() << "unsafe dependent memory operations in loop");
05004 
05005   DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") <<
05006         " need a runtime memory check.\n");
05007 
05008   return CanVecMem;
05009 }
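
When PtrRtCheck.Need ends up true, the vectorizer later versions the loop on pairwise bound checks between the collected pointers. Conceptually (a sketch only; the actual check emission happens elsewhere in this pass):

    // Hypothetical guard over two pointer ranges of N elements each: the
    // vector loop may only run if [A, A+N) and [B, B+N) do not overlap.
    if (A + N <= B || B + N <= A) {
      // vectorized loop
    } else {
      // original scalar loop
    }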
05010 
05011 static bool hasMultipleUsesOf(Instruction *I,
05012                               SmallPtrSetImpl<Instruction *> &Insts) {
05013   unsigned NumUses = 0;
05014   for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) {
05015     if (Insts.count(dyn_cast<Instruction>(*Use)))
05016       ++NumUses;
05017     if (NumUses > 1)
05018       return true;
05019   }
05020 
05021   return false;
05022 }
05023 
05024 static bool areAllUsesIn(Instruction *I, SmallPtrSetImpl<Instruction *> &Set) {
05025   for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use)
05026     if (!Set.count(dyn_cast<Instruction>(*Use)))
05027       return false;
05028   return true;
05029 }
05030 
05031 bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
05032                                                 ReductionKind Kind) {
05033   if (Phi->getNumIncomingValues() != 2)
05034     return false;
05035 
05036   // Reduction variables are only found in the loop header block.
05037   if (Phi->getParent() != TheLoop->getHeader())
05038     return false;
05039 
05040   // Obtain the reduction start value from the value that comes from the loop
05041   // preheader.
05042   Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());
05043 
05044   // ExitInstruction is the single value which is used outside the loop.
05045   // We only allow for a single reduction value to be used outside the loop.
05046   // This includes users of the reduction variables (which form a cycle
05047   // that ends in the phi node).
05048   Instruction *ExitInstruction = nullptr;
05049   // Indicates that we found a reduction operation in our scan.
05050   bool FoundReduxOp = false;
05051 
05052   // We start with the PHI node and scan for all of the users of this
05053   // instruction. All users must be instructions that can be used as reduction
05054   // variables (such as ADD). We must have a single out-of-block user. The cycle
05055   // must include the original PHI.
05056   bool FoundStartPHI = false;
05057 
05058   // To recognize min/max patterns formed by an icmp/select sequence, we store
05059   // the number of instructions we saw from the recognized min/max pattern,
05060   // to make sure we only see exactly the two instructions.
05061   unsigned NumCmpSelectPatternInst = 0;
05062   ReductionInstDesc ReduxDesc(false, nullptr);
05063 
05064   SmallPtrSet<Instruction *, 8> VisitedInsts;
05065   SmallVector<Instruction *, 8> Worklist;
05066   Worklist.push_back(Phi);
05067   VisitedInsts.insert(Phi);
05068 
05069   // A value in the reduction can be used:
05070   //  - By the reduction:
05071   //      - Reduction operation:
05072   //        - One use of reduction value (safe).
05073   //        - Multiple use of reduction value (not safe).
05074   //      - PHI:
05075   //        - All uses of the PHI must be the reduction (safe).
05076   //        - Otherwise, not safe.
05077   //  - By one instruction outside of the loop (safe).
05078   //  - By further instructions outside of the loop (not safe).
05079   //  - By an instruction that is not part of the reduction (not safe).
05080   //    This is either:
05081   //      * An instruction type other than PHI or the reduction operation.
05082   //      * A PHI in the header other than the initial PHI.
05083   while (!Worklist.empty()) {
05084     Instruction *Cur = Worklist.back();
05085     Worklist.pop_back();
05086 
05087     // No Users.
05088     // If the instruction has no users then this is a broken chain and can't be
05089     // a reduction variable.
05090     if (Cur->use_empty())
05091       return false;
05092 
05093     bool IsAPhi = isa<PHINode>(Cur);
05094 
05095     // A header PHI use other than the original PHI.
05096     if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent())
05097       return false;
05098 
05099     // Reductions of instructions such as Div and Sub are only possible if the
05100     // LHS is the reduction variable.
05101     if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) &&
05102         !isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) &&
05103         !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0))))
05104       return false;
05105 
05106     // Any reduction instruction must be of one of the allowed kinds.
05107     ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc);
05108     if (!ReduxDesc.IsReduction)
05109       return false;
05110 
05111     // A reduction operation must only have one use of the reduction value.
05112     if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax &&
05113         hasMultipleUsesOf(Cur, VisitedInsts))
05114       return false;
05115 
05116     // All inputs to a PHI node must be a reduction value.
05117     if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
05118       return false;
05119 
05120     if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(Cur) ||
05121                                      isa<SelectInst>(Cur)))
05122       ++NumCmpSelectPatternInst;
05123     if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) ||
05124                                    isa<SelectInst>(Cur)))
05125       ++NumCmpSelectPatternInst;
05126 
05127     // Check whether we found a reduction operator.
05128     FoundReduxOp |= !IsAPhi;
05129 
05130     // Process users of current instruction. Push non-PHI nodes after PHI nodes
05131     // onto the stack. This way we are going to have seen all inputs to PHI
05132     // nodes once we get to them.
05133     SmallVector<Instruction *, 8> NonPHIs;
05134     SmallVector<Instruction *, 8> PHIs;
05135     for (User *U : Cur->users()) {
05136       Instruction *UI = cast<Instruction>(U);
05137 
05138       // Check if we found the exit user.
05139       BasicBlock *Parent = UI->getParent();
05140       if (!TheLoop->contains(Parent)) {
05141         // Exit if we find multiple outside users or if the header phi node is
05142         // being used. In the latter case the user reads the value of the previous
05143         // iteration, and we would lose "VF-1" iterations of the reduction
05144         // operation if we vectorize.
05145         if (ExitInstruction != nullptr || Cur == Phi)
05146           return false;
05147 
05148         // The instruction used by an outside user must be the last instruction
05149         // before we feed back to the reduction phi. Otherwise, we lose VF-1
05150         // operations on the value.
05151         if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end())
05152           return false;
05153 
05154         ExitInstruction = Cur;
05155         continue;
05156       }
05157 
05158       // Process instructions only once (termination). Each reduction cycle
05159       // value must only be used once, except by phi nodes and min/max
05160       // reductions which are represented as a cmp followed by a select.
05161       ReductionInstDesc IgnoredVal(false, nullptr);
05162       if (VisitedInsts.insert(UI).second) {
05163         if (isa<PHINode>(UI))
05164           PHIs.push_back(UI);
05165         else
05166           NonPHIs.push_back(UI);
05167       } else if (!isa<PHINode>(UI) &&
05168                  ((!isa<FCmpInst>(UI) &&
05169                    !isa<ICmpInst>(UI) &&
05170                    !isa<SelectInst>(UI)) ||
05171                   !isMinMaxSelectCmpPattern(UI, IgnoredVal).IsReduction))
05172         return false;
05173 
05174       // Remember that we completed the cycle.
05175       if (UI == Phi)
05176         FoundStartPHI = true;
05177     }
05178     Worklist.append(PHIs.begin(), PHIs.end());
05179     Worklist.append(NonPHIs.begin(), NonPHIs.end());
05180   }
05181 
05182   // This means we have seen one but not the other instruction of the min/max
05183   // pattern, or more than just a single select and cmp.
05184   if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) &&
05185       NumCmpSelectPatternInst != 2)
05186     return false;
05187 
05188   if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
05189     return false;
05190 
05191   // We found a reduction var if we have reached the original phi node and we
05192   // only have a single instruction with out-of-loop users.
05193 
05194   // This instruction is allowed to have out-of-loop users.
05195   AllowedExit.insert(ExitInstruction);
05196 
05197   // Save the description of this reduction variable.
05198   ReductionDescriptor RD(RdxStart, ExitInstruction, Kind,
05199                          ReduxDesc.MinMaxKind);
05200   Reductions[Phi] = RD;
05201   // We've ended the cycle. This is a reduction variable if we have an
05202   // outside user and it has a binary op.
05203 
05204   return true;
05205 }
05206 
05207 /// Returns a ReductionInstDesc recognizing a Select(ICmp(X, Y), X, Y)
05208 /// instruction pattern corresponding to a min(X, Y) or max(X, Y).
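      /// For example (hypothetical IR, shown for illustration only), a signed
      /// max is recognized from:
      ///   %cmp = icmp sgt i32 %x, %y
      ///   %max = select i1 %cmp, i32 %x, i32 %y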
05209 LoopVectorizationLegality::ReductionInstDesc
05210 LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I,
05211                                                     ReductionInstDesc &Prev) {
05212 
05213   assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) &&
05214          "Expect a select instruction");
05215   Instruction *Cmp = nullptr;
05216   SelectInst *Select = nullptr;
05217 
05218   // We must handle the select(cmp()) as a single instruction. Advance to the
05219   // select.
05220   if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) {
05221     if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->user_begin())))
05222       return ReductionInstDesc(false, I);
05223     return ReductionInstDesc(Select, Prev.MinMaxKind);
05224   }
05225 
05226   // Only handle single-use cases for now.
05227   if (!(Select = dyn_cast<SelectInst>(I)))
05228     return ReductionInstDesc(false, I);
05229   if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) &&
05230       !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0))))
05231     return ReductionInstDesc(false, I);
05232   if (!Cmp->hasOneUse())
05233     return ReductionInstDesc(false, I);
05234 
05235   Value *CmpLeft;
05236   Value *CmpRight;
05237 
05238   // Look for a min/max pattern.
05239   if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
05240     return ReductionInstDesc(Select, MRK_UIntMin);
05241   else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
05242     return ReductionInstDesc(Select, MRK_UIntMax);
05243   else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
05244     return ReductionInstDesc(Select, MRK_SIntMax);
05245   else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
05246     return ReductionInstDesc(Select, MRK_SIntMin);
05247   else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
05248     return ReductionInstDesc(Select, MRK_FloatMin);
05249   else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
05250     return ReductionInstDesc(Select, MRK_FloatMax);
05251   else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
05252     return ReductionInstDesc(Select, MRK_FloatMin);
05253   else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
05254     return ReductionInstDesc(Select, MRK_FloatMax);
05255 
05256   return ReductionInstDesc(false, I);
05257 }
05258 
05259 LoopVectorizationLegality::ReductionInstDesc
05260 LoopVectorizationLegality::isReductionInstr(Instruction *I,
05261                                             ReductionKind Kind,
05262                                             ReductionInstDesc &Prev) {
05263   bool FP = I->getType()->isFloatingPointTy();
05264   bool FastMath = FP && I->hasUnsafeAlgebra();
05265   switch (I->getOpcode()) {
05266   default:
05267     return ReductionInstDesc(false, I);
05268   case Instruction::PHI:
05269       if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd &&
05270                  Kind != RK_FloatMinMax))
05271         return ReductionInstDesc(false, I);
05272     return ReductionInstDesc(I, Prev.MinMaxKind);
05273   case Instruction::Sub:
05274   case Instruction::Add:
05275     return ReductionInstDesc(Kind == RK_IntegerAdd, I);
05276   case Instruction::Mul:
05277     return ReductionInstDesc(Kind == RK_IntegerMult, I);
05278   case Instruction::And:
05279     return ReductionInstDesc(Kind == RK_IntegerAnd, I);
05280   case Instruction::Or:
05281     return ReductionInstDesc(Kind == RK_IntegerOr, I);
05282   case Instruction::Xor:
05283     return ReductionInstDesc(Kind == RK_IntegerXor, I);
05284   case Instruction::FMul:
05285     return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I);
05286   case Instruction::FSub:
05287   case Instruction::FAdd:
05288     return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I);
05289   case Instruction::FCmp:
05290   case Instruction::ICmp:
05291   case Instruction::Select:
05292     if (Kind != RK_IntegerMinMax &&
05293         (!HasFunNoNaNAttr || Kind != RK_FloatMinMax))
05294       return ReductionInstDesc(false, I);
05295     return isMinMaxSelectCmpPattern(I, Prev);
05296   }
05297 }
05298 
05299 LoopVectorizationLegality::InductionKind
05300 LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
05301   Type *PhiTy = Phi->getType();
05302   // We only handle integer and pointer induction variables.
05303   if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
05304     return IK_NoInduction;
05305 
05306   // Check that the PHI is consecutive.
05307   const SCEV *PhiScev = SE->getSCEV(Phi);
05308   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
05309   if (!AR) {
05310     DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
05311     return IK_NoInduction;
05312   }
05313   const SCEV *Step = AR->getStepRecurrence(*SE);
05314 
05315   // Integer inductions need to have a stride of one.
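        // For example, the canonical IV of "for (i = 0; i < n; ++i)" is the add
        // recurrence {0,+,1}, whose step is one.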
05316   if (PhiTy->isIntegerTy()) {
05317     if (Step->isOne())
05318       return IK_IntInduction;
05319     if (Step->isAllOnesValue())
05320       return IK_ReverseIntInduction;
05321     return IK_NoInduction;
05322   }
05323 
05324   // Calculate the pointer stride and check if it is consecutive.
05325   const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
05326   if (!C)
05327     return IK_NoInduction;
05328 
05329   assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
05330   Type *PointerElementType = PhiTy->getPointerElementType();
05331   // The pointer stride cannot be determined if the pointer element type is not
05332   // sized.
05333   if (!PointerElementType->isSized())
05334     return IK_NoInduction;
05335 
05336   uint64_t Size = DL->getTypeAllocSize(PointerElementType);
05337   if (C->getValue()->equalsInt(Size))
05338     return IK_PtrInduction;
05339   else if (C->getValue()->equalsInt(0 - Size))
05340     return IK_ReversePtrInduction;
05341 
05342   return IK_NoInduction;
05343 }
05344 
05345 bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
05346   Value *In0 = const_cast<Value*>(V);
05347   PHINode *PN = dyn_cast_or_null<PHINode>(In0);
05348   if (!PN)
05349     return false;
05350 
05351   return Inductions.count(PN);
05352 }
05353 
05354 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB)  {
05355   assert(TheLoop->contains(BB) && "Unknown block used");
05356 
05357   // Blocks that do not dominate the latch need predication.
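        // (For example, the body of an "if" inside the loop executes only on
        // some iterations and therefore does not dominate the latch.)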
05358   BasicBlock* Latch = TheLoop->getLoopLatch();
05359   return !DT->dominates(BB, Latch);
05360 }
05361 
05362 bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
05363                                            SmallPtrSetImpl<Value *> &SafePtrs) {
05364   
05365   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
05366     // Check that we don't have a constant expression that can trap as operand.
05367     for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
05368          OI != OE; ++OI) {
05369       if (Constant *C = dyn_cast<Constant>(*OI))
05370         if (C->canTrap())
05371           return false;
05372     }
05373     // We might be able to hoist the load.
05374     if (it->mayReadFromMemory()) {
05375       LoadInst *LI = dyn_cast<LoadInst>(it);
05376       if (!LI)
05377         return false;
05378       if (!SafePtrs.count(LI->getPointerOperand())) {
05379         if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) {
05380           MaskedOp.insert(LI);
05381           continue;
05382         }
05383         return false;
05384       }
05385     }
05386 
05387     // We only allow predicated stores in limited cases (see below).
05388     if (it->mayWriteToMemory()) {
05389       StoreInst *SI = dyn_cast<StoreInst>(it);
05390       // We only support predication of stores in basic blocks with one
05391       // predecessor.
05392       if (!SI)
05393         return false;
05394 
05395       bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
05396       bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();
05397       
05398       if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
05399           !isSinglePredecessor) {
05400         // Build a masked store if it is legal for the target, otherwise scalarize
05401         // the block.
05402         bool isLegalMaskedOp =
05403           isLegalMaskedStore(SI->getValueOperand()->getType(),
05404                              SI->getPointerOperand());
05405         if (isLegalMaskedOp) {
05406           --NumPredStores;
05407           MaskedOp.insert(SI);
05408           continue;
05409         }
05410         return false;
05411       }
05412     }
05413     if (it->mayThrow())
05414       return false;
05415 
05416     // The instructions below can trap.
05417     switch (it->getOpcode()) {
05418     default: continue;
05419     case Instruction::UDiv:
05420     case Instruction::SDiv:
05421     case Instruction::URem:
05422     case Instruction::SRem:
05423       return false;
05424     }
05425   }
05426 
05427   return true;
05428 }
05429 
05430 LoopVectorizationCostModel::VectorizationFactor
05431 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
05432   // Width 1 means no vectorization.
05433   VectorizationFactor Factor = { 1U, 0U };
05434   if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
05435     emitAnalysis(Report() << "runtime pointer checks needed. Enable vectorization of this loop with '#pragma clang loop vectorize(enable)' when compiling with -Os");
05436     DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
05437     return Factor;
05438   }
05439 
05440   if (!EnableCondStoresVectorization && Legal->NumPredStores) {
05441     emitAnalysis(Report() << "store that is conditionally executed prevents vectorization");
05442     DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
05443     return Factor;
05444   }
05445 
05446   // Find the trip count.
05447   unsigned TC = SE->getSmallConstantTripCount(TheLoop);
05448   DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
05449 
05450   unsigned WidestType = getWidestType();
05451   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
05452   unsigned MaxSafeDepDist = -1U;
05453   if (Legal->getMaxSafeDepDistBytes() != -1U)
05454     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
05455   WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
05456                     WidestRegister : MaxSafeDepDist);
05457   unsigned MaxVectorSize = WidestRegister / WidestType;
05458   DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
05459   DEBUG(dbgs() << "LV: The Widest register is: "
05460           << WidestRegister << " bits.\n");
05461 
05462   if (MaxVectorSize == 0) {
05463     DEBUG(dbgs() << "LV: The target has no vector registers.\n");
05464     MaxVectorSize = 1;
05465   }
05466 
05467   assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
05468          " into one vector!");
05469 
05470   unsigned VF = MaxVectorSize;
05471 
05472   // If we optimize the program for size, avoid creating the tail loop.
05473   if (OptForSize) {
05474     // If the trip count is unknown, or the loop runs only once, don't vectorize.
05475     if (TC < 2) {
05476       emitAnalysis(Report() << "unable to calculate the loop count due to complex control flow");
05477       DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
05478       return Factor;
05479     }
05480 
05481     // Find the maximum SIMD width that can fit within the trip count.
05482     VF = TC % MaxVectorSize;
05483 
05484     if (VF == 0)
05485       VF = MaxVectorSize;
05486 
05487     // If the trip count that we found modulo the vectorization factor is not
05488     // zero then we require a tail.
05489     if (VF < 2) {
05490       emitAnalysis(Report() << "cannot optimize for size and vectorize at the "
05491                                "same time. Enable vectorization of this loop "
05492                                "with '#pragma clang loop vectorize(enable)' "
05493                                "when compiling with -Os");
05494       DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
05495       return Factor;
05496     }
05497   }
05498 
05499   int UserVF = Hints->getWidth();
05500   if (UserVF != 0) {
05501     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
05502     DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
05503 
05504     Factor.Width = UserVF;
05505     return Factor;
05506   }
05507 
05508   float Cost = expectedCost(1);
05509 #ifndef NDEBUG
05510   const float ScalarCost = Cost;
05511 #endif /* NDEBUG */
05512   unsigned Width = 1;
05513   DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
05514 
05515   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
05516   // Ignore scalar width, because the user explicitly wants vectorization.
05517   if (ForceVectorization && VF > 1) {
05518     Width = 2;
05519     Cost = expectedCost(Width) / (float)Width;
05520   }
05521 
05522   for (unsigned i=2; i <= VF; i*=2) {
05523     // Notice that the vector loop needs to be executed fewer times, so
05524     // we need to divide the cost of the vector loops by the width of
05525     // the vector elements.
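          // For example (illustrative numbers only): if expectedCost(4) is 20,
          // the effective per-iteration cost of the VF=4 loop is 20/4 = 5,
          // which is then compared against the best cost seen so far.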
05526     float VectorCost = expectedCost(i) / (float)i;
05527     DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
05528           (int)VectorCost << ".\n");
05529     if (VectorCost < Cost) {
05530       Cost = VectorCost;
05531       Width = i;
05532     }
05533   }
05534 
05535   DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
05536         << "LV: Vectorization seems to be not beneficial, "
05537         << "but was forced by a user.\n");
05538   DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n");
05539   Factor.Width = Width;
05540   Factor.Cost = Width * Cost;
05541   return Factor;
05542 }
05543 
05544 unsigned LoopVectorizationCostModel::getWidestType() {
05545   unsigned MaxWidth = 8;
05546 
05547   // For each block.
05548   for (Loop::block_iterator bb = TheLoop->block_begin(),
05549        be = TheLoop->block_end(); bb != be; ++bb) {
05550     BasicBlock *BB = *bb;
05551 
05552     // For each instruction in the loop.
05553     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
05554       Type *T = it->getType();
05555 
05556       // Ignore ephemeral values.
05557       if (EphValues.count(it))
05558         continue;
05559 
05560       // Only examine Loads, Stores and PHINodes.
05561       if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
05562         continue;
05563 
05564       // Examine PHI nodes that are reduction variables.
05565       if (PHINode *PN = dyn_cast<PHINode>(it))
05566         if (!Legal->getReductionVars()->count(PN))
05567           continue;
05568 
05569       // Examine the stored values.
05570       if (StoreInst *ST = dyn_cast<StoreInst>(it))
05571         T = ST->getValueOperand()->getType();
05572 
05573       // Ignore loaded pointer types and stored pointer types that are not
05574       // consecutive. However, we do want to take consecutive stores/loads of
05575       // pointer vectors into account.
05576       if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
05577         continue;
05578 
05579       MaxWidth = std::max(MaxWidth,
05580                           (unsigned)DL->getTypeSizeInBits(T->getScalarType()));
05581     }
05582   }
05583 
05584   return MaxWidth;
05585 }
05586 
05587 unsigned
05588 LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
05589                                                unsigned VF,
05590                                                unsigned LoopCost) {
05591 
05592   // -- The unroll heuristics --
05593   // We unroll the loop in order to expose ILP and reduce the loop overhead.
05594   // There are many micro-architectural considerations that we can't predict
05595   // at this level. For example, frontend pressure (on decode or fetch) due to
05596   // code size, or the number and capabilities of the execution ports.
05597   //
05598   // We use the following heuristics to select the unroll factor:
05599   // 1. If the code has reductions, then we unroll in order to break the cross
05600   // iteration dependency.
05601   // 2. If the loop is really small, then we unroll in order to reduce the loop
05602   // overhead.
05603   // 3. We don't unroll if we think that we will spill registers to memory due
05604   // to the increased register pressure.
05605 
05606   // Use the user preference, unless 'auto' is selected.
05607   int UserUF = Hints->getInterleave();
05608   if (UserUF != 0)
05609     return UserUF;
05610 
05611   // When we optimize for size, we don't unroll.
05612   if (OptForSize)
05613     return 1;
05614 
05615   // We already used the dependence distance to limit the VF; do not unroll.
05616   if (Legal->getMaxSafeDepDistBytes() != -1U)
05617     return 1;
05618 
05619   // Do not unroll loops with a relatively small trip count.
05620   unsigned TC = SE->getSmallConstantTripCount(TheLoop);
05621   if (TC > 1 && TC < TinyTripCountUnrollThreshold)
05622     return 1;
05623 
05624   unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
05625   DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters <<
05626         " registers\n");
05627 
05628   if (VF == 1) {
05629     if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
05630       TargetNumRegisters = ForceTargetNumScalarRegs;
05631   } else {
05632     if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
05633       TargetNumRegisters = ForceTargetNumVectorRegs;
05634   }
05635 
05636   LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
05637   // We divide by these values, so make sure that we have at least one
05638   // instruction that uses at least one register.
05639   R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
05640   R.NumInstructions = std::max(R.NumInstructions, 1U);
05641 
05642   // We calculate the unroll factor using the following formula.
05643   // Subtract the number of loop invariants from the number of available
05644   // registers. These registers are used by all of the unrolled instances.
05645   // Next, divide the remaining registers by the number of registers that is
05646   // required by the loop, in order to estimate how many parallel instances
05647   // fit without causing spills. All of this is rounded down if necessary to be
05648   // a power of two. We want power of two unroll factors to simplify any
05649   // addressing operations or alignment considerations.
05650   unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
05651                               R.MaxLocalUsers);
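        // For example (illustrative numbers only): with 16 target registers,
        // 2 loop-invariant values and 5 local users, this yields
        // UF = PowerOf2Floor((16 - 2) / 5) = PowerOf2Floor(2) = 2.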
05652 
05653   // Don't count the induction variable as unrolled.
05654   if (EnableIndVarRegisterHeur)
05655     UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
05656                        std::max(1U, (R.MaxLocalUsers - 1)));
05657 
05658   // Clamp the unroll factor ranges to reasonable factors.
05659   unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor();
05660 
05661   // Check if the user has overridden the unroll max.
05662   if (VF == 1) {
05663     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
05664       MaxInterleaveSize = ForceTargetMaxScalarInterleaveFactor;
05665   } else {
05666     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
05667       MaxInterleaveSize = ForceTargetMaxVectorInterleaveFactor;
05668   }
05669 
05670   // If we did not calculate the cost for VF (because the user selected the VF)
05671   // then we calculate the cost of VF here.
05672   if (LoopCost == 0)
05673     LoopCost = expectedCost(VF);
05674 
05675   // Clamp the calculated UF to be between 1 and the max unroll factor
05676   // that the target allows.
05677   if (UF > MaxInterleaveSize)
05678     UF = MaxInterleaveSize;
05679   else if (UF < 1)
05680     UF = 1;
05681 
05682   // Unroll if we vectorized this loop and there is a reduction that could
05683   // benefit from unrolling.
05684   if (VF > 1 && Legal->getReductionVars()->size()) {
05685     DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
05686     return UF;
05687   }
05688 
05689   // Note that if we've already vectorized the loop we will have done the
05690   // runtime check and so unrolling won't require further checks.
05691   bool UnrollingRequiresRuntimePointerCheck =
05692       (VF == 1 && Legal->getRuntimePointerCheck()->Need);
05693 
05694   // We want to unroll small loops in order to reduce the loop overhead and
05695   // potentially expose ILP opportunities.
05696   DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
05697   if (!UnrollingRequiresRuntimePointerCheck &&
05698       LoopCost < SmallLoopCost) {
05699     // We assume that the loop overhead costs 1, use the cost model to
05700     // estimate the cost of the loop body, and unroll until the overhead
05701     // is about 5% of the total cost of the loop.
05702     unsigned SmallUF = std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
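          // For example (illustrative numbers only): if SmallLoopCost were 20
          // and LoopCost were 3, SmallUF = min(UF, PowerOf2Floor(20 / 3))
          //                              = min(UF, 4).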
05703 
05704     // Unroll until store/load ports (estimated by max unroll factor) are
05705     // saturated.
05706     unsigned StoresUF = UF / (Legal->NumStores ? Legal->NumStores : 1);
05707     unsigned LoadsUF = UF /  (Legal->NumLoads ? Legal->NumLoads : 1);
05708 
05709     // If we have a scalar reduction (vector reductions are already dealt with
05710     // by this point), we can increase the critical path length if the loop
05711     // we're unrolling is inside another loop. Limit the factor (to 2 by
05712     // default) so the critical path only grows by one reduction operation.
05713     if (Legal->getReductionVars()->size() &&
05714         TheLoop->getLoopDepth() > 1) {
05715       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionUF);
05716       SmallUF = std::min(SmallUF, F);
05717       StoresUF = std::min(StoresUF, F);
05718       LoadsUF = std::min(LoadsUF, F);
05719     }
05720 
05721     if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) {
05722       DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n");
05723       return std::max(StoresUF, LoadsUF);
05724     }
05725 
05726     DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
05727     return SmallUF;
05728   }
05729 
05730   DEBUG(dbgs() << "LV: Not Unrolling.\n");
05731   return 1;
05732 }
05733 
05734 LoopVectorizationCostModel::RegisterUsage
05735 LoopVectorizationCostModel::calculateRegisterUsage() {
05736   // This function calculates the register usage by measuring the highest number
05737   // of values that are alive at a single location. Obviously, this is a very
05738   // rough estimation. We scan the loop in topological order and
05739   // assign a number to each instruction. We use RPO to ensure that defs are
05740   // met before their users. We assume that each instruction that has in-loop
05741   // users starts an interval. We record every time that an in-loop value is
05742   // used, so we have a list of the first and last occurrences of each
05743   // instruction. Next, we transpose this data structure into a multi map that
05744   // holds the list of intervals that *end* at a specific location. This multi
05745   // map allows us to perform a linear search. We scan the instructions linearly
05746   // and record each time that a new interval starts, by placing it in a set.
05747   // If we find this value in the multi-map then we remove it from the set.
05748   // The max register usage is the maximum size of the set.
05749   // We also search for instructions that are defined outside the loop, but are
05750   // used inside the loop. We need this number separately from the max-interval
05751   // usage number because when we unroll, loop-invariant values do not consume
05752   // more registers.
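        //
        // For example (an illustrative sketch): if a value is defined at
        // position 0 and last used at position 3, its interval is [0, 3], and
        // every other interval overlapping [0, 3] adds to the number of values
        // that are simultaneously live over those positions.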
05753   LoopBlocksDFS DFS(TheLoop);
05754   DFS.perform(LI);
05755 
05756   RegisterUsage R;
05757   R.NumInstructions = 0;
05758 
05759   // Each 'key' in the map opens a new interval. The values
05760   // of the map are the index of the 'last seen' usage of the
05761   // instruction that is the key.
05762   typedef DenseMap<Instruction*, unsigned> IntervalMap;
05763   // Maps an instruction's index to the instruction.
05764   DenseMap<unsigned, Instruction*> IdxToInstr;
05765   // Marks the end of each interval.
05766   IntervalMap EndPoint;
05767   // Saves the list of instructions that are used in the loop.
05768   SmallSet<Instruction*, 8> Ends;
05769   // Saves the list of values that are used in the loop but are
05770   // defined outside the loop, such as arguments and constants.
05771   SmallPtrSet<Value*, 8> LoopInvariants;
05772 
05773   unsigned Index = 0;
05774   for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
05775        be = DFS.endRPO(); bb != be; ++bb) {
05776     R.NumInstructions += (*bb)->size();
05777     for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
05778          ++it) {
05779       Instruction *I = it;
05780       IdxToInstr[Index++] = I;
05781 
05782       // Save the end location of each USE.
05783       for (unsigned i = 0; i < I->getNumOperands(); ++i) {
05784         Value *U = I->getOperand(i);
05785         Instruction *Instr = dyn_cast<Instruction>(U);
05786 
05787         // Ignore non-instruction values such as arguments, constants, etc.
05788         if (!Instr) continue;
05789 
05790         // If this instruction is outside the loop then record it and continue.
05791         if (!TheLoop->contains(Instr)) {
05792           LoopInvariants.insert(Instr);
05793           continue;
05794         }
05795 
05796         // Overwrite previous end points.
05797         EndPoint[Instr] = Index;
05798         Ends.insert(Instr);
05799       }
05800     }
05801   }
05802 
05803   // Saves the list of intervals that end with the index in 'key'.
05804   typedef SmallVector<Instruction*, 2> InstrList;
05805   DenseMap<unsigned, InstrList> TransposeEnds;
05806 
05807   // Transpose the EndPoints to a list of values that end at each index.
05808   for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
05809        it != e; ++it)
05810     TransposeEnds[it->second].push_back(it->first);
05811 
05812   SmallSet<Instruction*, 8> OpenIntervals;
05813   unsigned MaxUsage = 0;
05814 
05815 
05816   DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
05817   for (unsigned int i = 0; i < Index; ++i) {
05818     Instruction *I = IdxToInstr[i];
05819     // Ignore instructions that are never used within the loop.
05820     if (!Ends.count(I)) continue;
05821 
05822     // Ignore ephemeral values.
05823     if (EphValues.count(I))
05824       continue;
05825 
05826     // Remove all of the instructions that end at this location.
05827     InstrList &List = TransposeEnds[i];
05828     for (unsigned int j=0, e = List.size(); j < e; ++j)
05829       OpenIntervals.erase(List[j]);
05830 
05831     // Count the number of live intervals.
05832     MaxUsage = std::max(MaxUsage, OpenIntervals.size());
05833 
05834     DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
05835           OpenIntervals.size() << '\n');
05836 
05837     // Add the current instruction to the list of open intervals.
05838     OpenIntervals.insert(I);
05839   }
05840 
05841   unsigned Invariant = LoopInvariants.size();
05842   DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n');
05843   DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
05844   DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');
05845 
05846   R.LoopInvariantRegs = Invariant;
05847   R.MaxLocalUsers = MaxUsage;
05848   return R;
05849 }
05850 
05851 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
05852   unsigned Cost = 0;
05853 
05854   // For each block.
05855   for (Loop::block_iterator bb = TheLoop->block_begin(),
05856        be = TheLoop->block_end(); bb != be; ++bb) {
05857     unsigned BlockCost = 0;
05858     BasicBlock *BB = *bb;
05859 
05860     // For each instruction in the old loop.
05861     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
05862       // Skip dbg intrinsics.
05863       if (isa<DbgInfoIntrinsic>(it))
05864         continue;
05865 
05866       // Ignore ephemeral values.
05867       if (EphValues.count(it))
05868         continue;
05869 
05870       unsigned C = getInstructionCost(it, VF);
05871 
05872       // Check if we should override the cost.
05873       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
05874         C = ForceTargetInstructionCost;
05875 
05876       BlockCost += C;
05877       DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
05878             VF << " For instruction: " << *it << '\n');
05879     }
05880 
05881     // We assume that if-converted blocks have a 50% chance of being executed.
05882     // When the code is scalar, some of the blocks are avoided due to control
05883     // flow. When the code is vectorized, we execute all code paths.
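          // For example (illustrative numbers only): a predicated block whose
          // instructions cost 10 contributes 5 to the scalar (VF == 1) cost
          // estimate.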
05884     if (VF == 1 && Legal->blockNeedsPredication(*bb))
05885       BlockCost /= 2;
05886 
05887     Cost += BlockCost;
05888   }
05889 
05890   return Cost;
05891 }
05892 
05893 /// \brief Check whether the address computation for a non-consecutive memory
05894 /// access looks like an unlikely candidate for being merged into the indexing
05895 /// mode.
05896 ///
05897 /// We look for a GEP which has one index that is an induction variable and all
05898 /// other indices are loop invariant. If the stride of this access is also
05899 /// within a small bound we decide that this address computation can likely be
05900 /// merged into the addressing mode.
05901 /// In all other cases, we identify the address computation as complex.
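      /// For example (hypothetical IR, shown for illustration only), the address
      ///   %p = getelementptr inbounds i32* %base, i64 %iv
      /// with %iv an induction variable and a small constant stride is likely to
      /// be folded into the addressing mode, whereas a GEP with several
      /// loop-varying indices is considered complex.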
05902 static bool isLikelyComplexAddressComputation(Value *Ptr,
05903                                               LoopVectorizationLegality *Legal,
05904                                               ScalarEvolution *SE,
05905                                               const Loop *TheLoop) {
05906   GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
05907   if (!Gep)
05908     return true;
05909 
05910   // We are looking for a gep with all loop-invariant indices except for one,
05911   // which should be an induction variable.
05912   unsigned NumOperands = Gep->getNumOperands();
05913   for (unsigned i = 1; i < NumOperands; ++i) {
05914     Value *Opd = Gep->getOperand(i);
05915     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
05916         !Legal->isInductionVariable(Opd))
05917       return true;
05918   }
05919 
05920   // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step
05921   // can likely be merged into the address computation.
05922   unsigned MaxMergeDistance = 64;
05923 
05924   const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
05925   if (!AddRec)
05926     return true;
05927 
05928   // Check that the step is constant.
05929   const SCEV *Step = AddRec->getStepRecurrence(*SE);
05930   // Calculate the pointer stride and check if it is consecutive.
05931   const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
05932   if (!C)
05933     return true;
05934 
05935   const APInt &APStepVal = C->getValue()->getValue();
05936 
05937   // Huge step value - give up.
05938   if (APStepVal.getBitWidth() > 64)
05939     return true;
05940 
05941   int64_t StepVal = APStepVal.getSExtValue();
05942 
05943   return StepVal > MaxMergeDistance;
05944 }
05945 
05946 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
05947   if (Legal->hasStride(I->getOperand(0)) || Legal->hasStride(I->getOperand(1)))
05948     return true;
05949   return false;
05950 }
05951 
05952 unsigned
05953 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
05954   // If we know that this instruction will remain uniform, check the cost of
05955   // the scalar version.
05956   if (Legal->isUniformAfterVectorization(I))
05957     VF = 1;
05958 
05959   Type *RetTy = I->getType();
05960   Type *VectorTy = ToVectorTy(RetTy, VF);
05961 
05962   // TODO: We need to estimate the cost of intrinsic calls.
05963   switch (I->getOpcode()) {
05964   case Instruction::GetElementPtr:
05965     // We mark this instruction as zero-cost because the cost of GEPs in
05966     // vectorized code depends on whether the corresponding memory instruction
05967     // is scalarized or not. Therefore, we handle GEPs with the memory
05968     // instruction cost.
05969     return 0;
05970   case Instruction::Br: {
05971     return TTI.getCFInstrCost(I->getOpcode());
05972   }
05973   case Instruction::PHI:
05974     // TODO: If-converted IFs become selects.
05975     return 0;
05976   case Instruction::Add:
05977   case Instruction::FAdd:
05978   case Instruction::Sub:
05979   case Instruction::FSub:
05980   case Instruction::Mul:
05981   case Instruction::FMul:
05982   case Instruction::UDiv:
05983   case Instruction::SDiv:
05984   case Instruction::FDiv:
05985   case Instruction::URem:
05986   case Instruction::SRem:
05987   case Instruction::FRem:
05988   case Instruction::Shl:
05989   case Instruction::LShr:
05990   case Instruction::AShr:
05991   case Instruction::And:
05992   case Instruction::Or:
05993   case Instruction::Xor: {
05994     // Since we will replace the stride by 1, the multiplication should go away.
05995     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
05996       return 0;
05997     // Certain instructions can be cheaper to vectorize if they have a constant
05998     // second vector operand. One example of this is shifts on x86.
05999     TargetTransformInfo::OperandValueKind Op1VK =
06000       TargetTransformInfo::OK_AnyValue;
06001     TargetTransformInfo::OperandValueKind Op2VK =
06002       TargetTransformInfo::OK_AnyValue;
06003     TargetTransformInfo::OperandValueProperties Op1VP =
06004         TargetTransformInfo::OP_None;
06005     TargetTransformInfo::OperandValueProperties Op2VP =
06006         TargetTransformInfo::OP_None;
06007     Value *Op2 = I->getOperand(1);
06008 
06009     // Check for a splat of a constant or for a non-uniform vector of constants.
06010     if (isa<ConstantInt>(Op2)) {
06011       ConstantInt *CInt = cast<ConstantInt>(Op2);
06012       if (CInt->getValue().isPowerOf2())
06013         Op2VP = TargetTransformInfo::OP_PowerOf2;
06014       Op2VK = TargetTransformInfo::OK_UniformConstantValue;
06015     } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
06016       Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
06017       Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
06018       if (SplatValue) {
06019         ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
06020         if (CInt && CInt->getValue().isPowerOf2())
06021           Op2VP = TargetTransformInfo::OP_PowerOf2;
06022         Op2VK = TargetTransformInfo::OK_UniformConstantValue;
06023       }
06024     }
06025 
06026     return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
06027                                       Op1VP, Op2VP);
06028   }
06029   case Instruction::Select: {
06030     SelectInst *SI = cast<SelectInst>(I);
06031     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
06032     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
06033     Type *CondTy = SI->getCondition()->getType();
06034     if (!ScalarCond)
06035       CondTy = VectorType::get(CondTy, VF);
06036 
06037     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
06038   }
06039   case Instruction::ICmp:
06040   case Instruction::FCmp: {
06041     Type *ValTy = I->getOperand(0)->getType();
06042     VectorTy = ToVectorTy(ValTy, VF);
06043     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
06044   }
06045   case Instruction::Store:
06046   case Instruction::Load: {
06047     StoreInst *SI = dyn_cast<StoreInst>(I);
06048     LoadInst *LI = dyn_cast<LoadInst>(I);
06049     Type *ValTy = (SI ? SI->getValueOperand()->getType() :
06050                    LI->getType());
06051     VectorTy = ToVectorTy(ValTy, VF);
06052 
06053     unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
06054     unsigned AS = SI ? SI->getPointerAddressSpace() :
06055       LI->getPointerAddressSpace();
06056     Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
06057     // We add the cost of address computation here instead of with the gep
06058     // instruction because only here do we know whether the operation is
06059     // scalarized.
06060     if (VF == 1)
06061       return TTI.getAddressComputationCost(VectorTy) +
06062         TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
06063 
06064     // Scalarized loads/stores.
06065     int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
06066     bool Reverse = ConsecutiveStride < 0;
06067     unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
06068     unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF;
06069     if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
06070       bool IsComplexComputation =
06071         isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
06072       unsigned Cost = 0;
06073       // The cost of extracting from the value vector and pointer vector.
06074       Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
06075       for (unsigned i = 0; i < VF; ++i) {
06076         // The cost of extracting the pointer operand.
06077         Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
06078         // In case of STORE, the cost of ExtractElement from the vector.
06079         // In case of LOAD, the cost of InsertElement into the returned
06080         // vector.
06081         Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :
06082                                             Instruction::InsertElement,
06083                                             VectorTy, i);
06084       }
06085 
06086       // The cost of the scalar loads/stores.
06087       Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
06088       Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
06089                                        Alignment, AS);
06090       return Cost;
06091     }
06092 
06093     // Wide loads/stores.
06094     unsigned Cost = TTI.getAddressComputationCost(VectorTy);
06095     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
06096 
06097     if (Reverse)
06098       Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
06099                                   VectorTy, 0);
06100     return Cost;
06101   }
06102   case Instruction::ZExt:
06103   case Instruction::SExt:
06104   case Instruction::FPToUI:
06105   case Instruction::FPToSI:
06106   case Instruction::FPExt:
06107   case Instruction::PtrToInt:
06108   case Instruction::IntToPtr:
06109   case Instruction::SIToFP:
06110   case Instruction::UIToFP:
06111   case Instruction::Trunc:
06112   case Instruction::FPTrunc:
06113   case Instruction::BitCast: {
06114     // We optimize the truncation of induction variables.
06115     // The cost of these is the same as the scalar operation.
06116     if (I->getOpcode() == Instruction::Trunc &&
06117         Legal->isInductionVariable(I->getOperand(0)))
06118       return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
06119                                   I->getOperand(0)->getType());
06120 
06121     Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
06122     return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
06123   }
06124   case Instruction::Call: {
06125     CallInst *CI = cast<CallInst>(I);
06126     Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
06127     assert(ID && "Not an intrinsic call!");
06128     Type *RetTy = ToVectorTy(CI->getType(), VF);
06129     SmallVector<Type*, 4> Tys;
06130     for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
06131       Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
06132     return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
06133   }
06134   default: {
06135     // We are scalarizing the instruction. Return the cost of the scalar
06136     // instruction, plus the cost of insert and extract into vector
06137     // elements, times the vector width.
06138     unsigned Cost = 0;
06139 
06140     if (!RetTy->isVoidTy() && VF != 1) {
06141       unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
06142                                                 VectorTy);
06143       unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
06144                                                 VectorTy);
06145 
06146       // The cost of inserting the results plus extracting each one of the
06147       // operands.
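            // For example (illustrative numbers only): with VF = 4 and two
            // operands, this adds 4 * (InsCost + 2 * ExtCost) to the cost.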
06148       Cost += VF * (InsCost + ExtCost * I->getNumOperands());
06149     }
06150 
06151     // The cost of executing VF copies of the scalar instruction. This opcode
06152     // is unknown. Assume that it is the same as 'mul'.
06153     Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
06154     return Cost;
06155   }
06156   }// end of switch.
06157 }
06158 
06159 Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
06160   if (Scalar->isVoidTy() || VF == 1)
06161     return Scalar;
06162   return VectorType::get(Scalar, VF);
06163 }
06164 
06165 char LoopVectorize::ID = 0;
06166 static const char lv_name[] = "Loop Vectorization";
06167 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
06168 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
06169 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
06170 INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
06171 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)
06172 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
06173 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
06174 INITIALIZE_PASS_DEPENDENCY(LCSSA)
06175 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
06176 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
06177 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
06178 
06179 namespace llvm {
06180   Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
06181     return new LoopVectorize(NoUnrolling, AlwaysVectorize);
06182   }
06183 }
06184 
06185 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
06186   // Check for a store.
06187   if (StoreInst *ST = dyn_cast<StoreInst>(Inst))
06188     return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;
06189 
06190   // Check for a load.
06191   if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
06192     return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;
06193 
06194   return false;
06195 }
06196 
06197 
06198 void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
06199                                              bool IfPredicateStore) {
06200   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
06201   // Holds vector parameters or scalars, in case of uniform vals.
06202   SmallVector<VectorParts, 4> Params;
06203 
06204   setDebugLocFromInst(Builder, Instr);
06205 
06206   // Find all of the vectorized parameters.
06207   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
06208     Value *SrcOp = Instr->getOperand(op);
06209 
06210     // If we are accessing the old induction variable, use the new one.
06211     if (SrcOp == OldInduction) {
06212       Params.push_back(getVectorValue(SrcOp));
06213       continue;
06214     }
06215 
06216     // Try using previously calculated values.
06217     Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
06218 
06219     // If the src is an instruction that appeared earlier in the basic block
06220     // then it should already be vectorized.
06221     if (SrcInst && OrigLoop->contains(SrcInst)) {
06222       assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
06223       // The parameter is a vector value from earlier.
06224       Params.push_back(WidenMap.get(SrcInst));
06225     } else {
06226       // The parameter is a scalar from outside the loop. Maybe even a constant.
06227       VectorParts Scalars;
06228       Scalars.append(UF, SrcOp);
06229       Params.push_back(Scalars);
06230     }
06231   }
06232 
06233   assert(Params.size() == Instr->getNumOperands() &&
06234          "Invalid number of operands");
06235 
06236   // Does this instruction return a value?
06237   bool IsVoidRetTy = Instr->getType()->isVoidTy();
06238 
06239   Value *UndefVec = IsVoidRetTy ? nullptr :
06240                     UndefValue::get(Instr->getType());
06241   // Create a new entry in the WidenMap and initialize it to Undef or Null.
06242   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
06243 
06244   Instruction *InsertPt = Builder.GetInsertPoint();
06245   BasicBlock *IfBlock = Builder.GetInsertBlock();
06246   BasicBlock *CondBlock = nullptr;
06247 
06248   VectorParts Cond;
06249   Loop *VectorLp = nullptr;
06250   if (IfPredicateStore) {
06251     assert(Instr->getParent()->getSinglePredecessor() &&
06252            "Only support single predecessor blocks");
06253     Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
06254                           Instr->getParent());
06255     VectorLp = LI->getLoopFor(IfBlock);
06256     assert(VectorLp && "Must have a loop for this block");
06257   }
06258 
06259   // For each vector unroll 'part':
06260   for (unsigned Part = 0; Part < UF; ++Part) {
06261     // For each scalar that we create:
06262 
06263     // Start an "if (pred) a[i] = ..." block.
06264     Value *Cmp = nullptr;
06265     if (IfPredicateStore) {
06266       if (Cond[Part]->getType()->isVectorTy())
06267         Cond[Part] =
06268             Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
06269       Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
06270                                ConstantInt::get(Cond[Part]->getType(), 1));
06271       CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
06272       LoopVectorBody.push_back(CondBlock);
06273       VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
06274       // Update Builder with newly created basic block.
06275       Builder.SetInsertPoint(InsertPt);
06276     }
06277 
06278     Instruction *Cloned = Instr->clone();
06279     if (!IsVoidRetTy)
06280       Cloned->setName(Instr->getName() + ".cloned");
06281     // Replace the operands of the cloned instruction with extracted scalars.
06282     for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
06283       Value *Op = Params[op][Part];
06284       Cloned->setOperand(op, Op);
06285     }
06286 
06287     // Place the cloned scalar in the new loop.
06288     Builder.Insert(Cloned);
06289 
06290     // If the original scalar returns a value we need to place it in a vector
06291     // so that future users will be able to use it.
06292     if (!IsVoidRetTy)
06293       VecResults[Part] = Cloned;
06294 
06295     // End if-block.
06296     if (IfPredicateStore) {
06297       BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
06298       LoopVectorBody.push_back(NewIfBlock);
06299       VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
06300       Builder.SetInsertPoint(InsertPt);
06301       Instruction *OldBr = IfBlock->getTerminator();
06302       BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
06303       OldBr->eraseFromParent();
06304       IfBlock = NewIfBlock;
06305     }
06306   }
06307 }
06308 
06309 void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
06310   StoreInst *SI = dyn_cast<StoreInst>(Instr);
06311   bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));
06312 
06313   return scalarizeInstruction(Instr, IfPredicateStore);
06314 }
06315 
06316 Value *InnerLoopUnroller::reverseVector(Value *Vec) {
06317   return Vec;
06318 }
06319 
06320 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) {
06321   return V;
06322 }
06323 
06324 Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx,
06325                                                bool Negate) {
06326   // When unrolling and the VF is 1, we only need to add a simple scalar.
06327   Type *ITy = Val->getType();
06328   assert(!ITy->isVectorTy() && "Val must be a scalar");
06329   Constant *C = ConstantInt::get(ITy, StartIdx, Negate);
06330   return Builder.CreateAdd(Val, C, "induction");
06331 }