//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
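//
// For example (an illustrative sketch, not the exact IR this pass emits),
// widening with a vectorization factor of 4 conceptually turns
//
//   for (i = 0; i < n; ++i)
//     A[i] = B[i] + 42;
//
// into
//
//   for (i = 0; i < n; i += 4)
//     A[i:i+3] = B[i:i+3] + <42, 42, 42, 42>;
//
// plus a scalar epilogue loop for the remaining iterations when n is not a
// multiple of 4.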
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/VectorUtils.h"
#include <algorithm>
#include <map>
#include <tuple>

using namespace llvm;
using namespace llvm::PatternMatch;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

static cl::opt<bool>
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                   cl::desc("Enable if-conversion during vectorization."));

/// We don't vectorize loops with a known constant trip count below this number.
static cl::opt<unsigned>
TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
                             cl::Hidden,
                             cl::desc("Don't vectorize loops with a constant "
                                      "trip count that is smaller than this "
                                      "value."));

/// This enables versioning on the strides of symbolically striding memory
/// accesses in code like the following.
///   for (i = 0; i < N; ++i)
///     A[i * Stride1] += B[i * Stride2] ...
///
/// Will be roughly translated to
///    if (Stride1 == 1 && Stride2 == 1) {
///      for (i = 0; i < N; i+=4)
///       A[i:i+3] += ...
///    } else
///      ...
static cl::opt<bool> EnableMemAccessVersioning(
    "enable-mem-access-versioning", cl::init(true), cl::Hidden,
    cl::desc("Enable symbolic stride memory access versioning"));

/// We don't unroll loops with a known constant trip count below this number.
static const unsigned TinyTripCountUnrollThreshold = 128;

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc("The cost of a loop that is considered 'small' by the unroller."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime unroll loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeUnroll(
    "enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden,
    cl::desc("Enable runtime unrolling until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when unrolling"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(false), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionUF(
    "max-nested-scalar-reduction-unroll", cl::init(2), cl::Hidden,
    cl::desc("The maximum unroll factor to use when unrolling a scalar "
             "reduction in a nested loop."));

namespace {

// Forward declarations.
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class LoopVectorizeHints;

/// \brief This modifies LoopAccessReport to initialize the message with the
/// loop-vectorizer-specific part.
class VectorizationReport : public LoopAccessReport {
public:
  VectorizationReport(Instruction *I = nullptr)
      : LoopAccessReport("loop not vectorized: ", I) {}

  /// \brief This allows promotion of the loop-access analysis report into the
  /// loop-vectorizer report.  It modifies the message to add the
  /// loop-vectorizer-specific part of the message.
  explicit VectorizationReport(const LoopAccessReport &R)
      : LoopAccessReport(Twine("loop not vectorized: ") + R.str(),
                         R.getInstr()) {}
};

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                      DominatorTree *DT, const DataLayout *DL,
                      const TargetLibraryInfo *TLI, unsigned VecWidth,
                      unsigned UnrollFactor)
      : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
        VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
        Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
        Legal(nullptr) {}

  // Perform the actual loop widening (vectorization).
  void vectorize(LoopVectorizationLegality *L) {
    Legal = L;
    // Create a new empty loop. Unlink the old loop and connect the new one.
    createEmptyLoop();
    // Widen each instruction in the old loop to a new one in the new loop.
    // Use the Legality module to find the induction and reduction variables.
    vectorizeLoop();
    // Register the new loop and update the analysis passes.
    updateAnalysis();
  }

  virtual ~InnerLoopVectorizer() {}

protected:
  /// A small list of PHINodes.
  typedef SmallVector<PHINode*, 4> PhiVector;
  /// When we unroll loops we have multiple vector values for each scalar.
  /// This data structure holds the unrolled and vectorized values that
  /// originated from one scalar instruction.
  typedef SmallVector<Value*, 2> VectorParts;

  // When we if-convert we need to create edge masks. We have to cache values
  // so that we don't end up with exponential recursion/IR.
  typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>,
                   VectorParts> EdgeMaskCache;

  /// \brief Add checks for strides that were assumed to be 1.
  ///
  /// Returns the first and the last check instruction as the (first, last)
  /// pair.
  std::pair<Instruction *, Instruction *> addStrideCheck(Instruction *Loc);

  /// Create an empty loop, based on the loop ranges of the old loop.
  void createEmptyLoop();
  /// Copy and widen the instructions from the old loop.
  virtual void vectorizeLoop();

  /// \brief The Loop exit block may have single value PHI nodes where the
  /// incoming value is 'Undef'. While vectorizing we only handled real values
  /// that were defined inside the loop. Here we fix the 'undef case'.
  /// See PR14725.
  void fixLCSSAPHIs();

  /// A helper function that computes the predicate of the block BB, assuming
  /// that the header block of the loop is set to True. It returns the *entry*
  /// mask for the block BB.
  VectorParts createBlockInMask(BasicBlock *BB);
  /// A helper function that computes the predicate of the edge between SRC
  /// and DST.
  VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
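  // For example (a sketch of the recurrence, not additional API): if block B
  // has predecessors P1 and P2 inside the loop, the masks relate as
  //   EdgeMask(P1, B) = BlockInMask(P1) & Cond(P1 -> B)
  //   BlockInMask(B)  = EdgeMask(P1, B) | EdgeMask(P2, B)
  // The EdgeMaskCache above memoizes edge masks so this recursion does not
  // recompute (and re-emit) the same masks exponentially often.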

  /// A helper function to vectorize a single BB within the innermost loop.
  void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VectorParts &Entry,
                           unsigned UF, unsigned VF, PhiVector *PV);

  /// Insert the new loop to the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// This instruction is un-vectorizable. Implement it as a sequence
  /// of scalars. If \p IfPredicateStore is true we need to 'hide' each
  /// scalarized instruction behind an if block predicated on the control
  /// dependence of the instruction.
  virtual void scalarizeInstruction(Instruction *Instr,
                                    bool IfPredicateStore=false);

  /// Vectorize Load and Store instructions.
  virtual void vectorizeMemoryInstruction(Instruction *Instr);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step);
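  // For example (illustrative): with VF = 4, StartIdx = 0 and Step = 1,
  // applying this to a broadcast of an induction value %i yields the vector
  // <%i+0, %i+1, %i+2, %i+3>, one lane per scalar iteration.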

  /// When we go over instructions in the basic block we rely on previous
  /// values within the current basic block or on loop invariant values.
  /// When we widen (vectorize) values we place them in the map. If the values
  /// are not within the map, they have to be loop invariant, so we simply
  /// broadcast them into a vector.
  VectorParts &getVectorValue(Value *V);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// This is a helper class that holds the vectorizer state. It maps scalar
  /// instructions to vector instructions. When the code is 'unrolled' then
  /// a single scalar value is mapped to multiple vector parts. The parts
  /// are stored in the VectorPart type.
  struct ValueMap {
    /// C'tor.  UnrollFactor controls the number of vectors ('parts') that
    /// are mapped.
    ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}

    /// \return True if 'Key' is saved in the Value Map.
    bool has(Value *Key) const { return MapStorage.count(Key); }

    /// Initializes a new entry in the map. Sets all of the vector parts to the
    /// same value in 'Val'.
    /// \return A reference to a vector with splat values.
    VectorParts &splat(Value *Key, Value *Val) {
      VectorParts &Entry = MapStorage[Key];
      Entry.assign(UF, Val);
      return Entry;
    }

    ///\return A reference to the value that is stored at 'Key'.
    VectorParts &get(Value *Key) {
      VectorParts &Entry = MapStorage[Key];
      if (Entry.empty())
        Entry.resize(UF);
      assert(Entry.size() == UF);
      return Entry;
    }

  private:
    /// The unroll factor. Each entry in the map stores this number of vector
    /// elements.
    unsigned UF;

    /// Map storage. We use std::map and not DenseMap because insertions to a
    /// dense map invalidate its iterators.
    std::map<Value *, VectorParts> MapStorage;
  };
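  // Illustrative use of ValueMap (hypothetical values, a sketch only):
  //   ValueMap WM(/*UnrollFactor=*/2);
  //   VectorParts &P = WM.splat(ScalarV, WideV); // P[0] == P[1] == WideV
  //   assert(WM.has(ScalarV) && WM.get(ScalarV).size() == 2);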

  /// The original loop.
  Loop *OrigLoop;
  /// Scev analysis to use.
  ScalarEvolution *SE;
  /// Loop Info.
  LoopInfo *LI;
  /// Dominator Tree.
  DominatorTree *DT;
  /// Alias Analysis.
  AliasAnalysis *AA;
  /// Data Layout.
  const DataLayout *DL;
  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

protected:
  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;
  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;
  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;
  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;
  /// The vector loop body.
  SmallVector<BasicBlock *, 4> LoopVectorBody;
  /// The scalar loop body.
  BasicBlock *LoopScalarBody;
  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction;
  /// The induction variable of the old basic block.
  PHINode *OldInduction;
  /// Holds the extended (to the widest induction type) start index.
  Value *ExtendedIdx;
  /// Maps scalars to widened vectors.
  ValueMap WidenMap;
  EdgeMaskCache MaskCache;

  LoopVectorizationLegality *Legal;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                    DominatorTree *DT, const DataLayout *DL,
                    const TargetLibraryInfo *TLI, unsigned UnrollFactor) :
    InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { }

private:
  void scalarizeInstruction(Instruction *Instr,
                            bool IfPredicateStore = false) override;
  void vectorizeMemoryInstruction(Instruction *Instr) override;
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step) override;
  Value *reverseVector(Value *Vec) override;
};

/// \brief Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

/// \brief Set the debug location in the builder using the debug location in
/// the instruction.
static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr))
    B.SetCurrentDebugLocation(Inst->getDebugLoc());
  else
    B.SetCurrentDebugLocation(DebugLoc());
}

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    const DebugLoc LoopDbgLoc = L->getStartLoc();
    if (!LoopDbgLoc.isUnknown())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

/// \brief Propagate known metadata from one instruction to another.
static void propagateMetadata(Instruction *To, const Instruction *From) {
  SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
  From->getAllMetadataOtherThanDebugLoc(Metadata);

  for (auto M : Metadata) {
    unsigned Kind = M.first;

    // These are safe to transfer (this is safe for TBAA, even when we
    // if-convert, because should that metadata have had a control dependency
    // on the condition, and thus actually aliased with some other
    // non-speculated memory access when the condition was false, this would be
    // caught by the runtime overlap checks).
    if (Kind != LLVMContext::MD_tbaa &&
        Kind != LLVMContext::MD_alias_scope &&
        Kind != LLVMContext::MD_noalias &&
        Kind != LLVMContext::MD_fpmath)
      continue;

    To->setMetadata(Kind, M.second);
  }
}

/// \brief Propagate known metadata from one instruction to a vector of others.
static void propagateMetadata(SmallVectorImpl<Value *> &To,
                              const Instruction *From) {
  for (Value *V : To)
    if (Instruction *I = dyn_cast<Instruction>(V))
      propagateMetadata(I, From);
}

/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
/// legality. This class has two main kinds of checks:
/// * Memory checks - The code in canVectorizeMemory checks if vectorization
///   will change the order of memory accesses in a way that will change the
///   correctness of the program.
/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
///   checks for a number of different conditions, such as the availability of
///   a single induction variable, and that all types are supported and
///   vectorizable, etc. This code reflects the capabilities of
///   InnerLoopVectorizer.
/// This class is also used by InnerLoopVectorizer for identifying the
/// induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
                            DominatorTree *DT, TargetLibraryInfo *TLI,
                            AliasAnalysis *AA, Function *F,
                            const TargetTransformInfo *TTI,
                            LoopAccessAnalysis *LAA)
      : NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
        TLI(TLI), TheFunction(F), TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr),
        Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {}

  /// This enum represents the kinds of reductions that we support.
  enum ReductionKind {
    RK_NoReduction, ///< Not a reduction.
    RK_IntegerAdd,  ///< Sum of integers.
    RK_IntegerMult, ///< Product of integers.
    RK_IntegerOr,   ///< Bitwise or logical OR of numbers.
    RK_IntegerAnd,  ///< Bitwise or logical AND of numbers.
    RK_IntegerXor,  ///< Bitwise or logical XOR of numbers.
    RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()).
    RK_FloatAdd,    ///< Sum of floats.
    RK_FloatMult,   ///< Product of floats.
    RK_FloatMinMax  ///< Min/max implemented in terms of select(cmp()).
  };
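  // For example (an illustrative C-level loop, not part of this pass), the
  // following is recognized as an RK_IntegerAdd reduction:
  //   int Sum = 5;              // The start value does not have to be zero.
  //   for (i = 0; i < n; ++i)
  //     Sum += A[i];            // The final add is the loop-exit instruction.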

  /// This enum represents the kinds of inductions that we support.
  enum InductionKind {
    IK_NoInduction,  ///< Not an induction variable.
    IK_IntInduction, ///< Integer induction variable. Step = C.
    IK_PtrInduction  ///< Pointer induction var. Step = C / sizeof(elem).
  };

  // This enum represents the kind of minmax reduction.
  enum MinMaxReductionKind {
    MRK_Invalid,
    MRK_UIntMin,
    MRK_UIntMax,
    MRK_SIntMin,
    MRK_SIntMax,
    MRK_FloatMin,
    MRK_FloatMax
  };

  /// This struct holds information about reduction variables.
  struct ReductionDescriptor {
    ReductionDescriptor() : StartValue(nullptr), LoopExitInstr(nullptr),
      Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}

    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K,
                        MinMaxReductionKind MK)
        : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {}

    // The starting value of the reduction.
    // It does not have to be zero!
    TrackingVH<Value> StartValue;
    // The instruction whose value is used outside the loop.
    Instruction *LoopExitInstr;
    // The kind of the reduction.
    ReductionKind Kind;
    // If this is a min/max reduction, the kind of reduction.
    MinMaxReductionKind MinMaxKind;
  };

  /// This POD struct holds information about a potential reduction operation.
  struct ReductionInstDesc {
    ReductionInstDesc(bool IsRedux, Instruction *I) :
      IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {}

    ReductionInstDesc(Instruction *I, MinMaxReductionKind K) :
      IsReduction(true), PatternLastInst(I), MinMaxKind(K) {}

    // Is this instruction a reduction candidate?
    bool IsReduction;
    // The last instruction in a min/max pattern (the select of the
    // select(icmp()) pattern), or the current reduction instruction otherwise.
    Instruction *PatternLastInst;
    // If this is a min/max pattern, the comparison predicate.
    MinMaxReductionKind MinMaxKind;
  };

  /// A struct for saving information about induction variables.
  struct InductionInfo {
    InductionInfo(Value *Start, InductionKind K, ConstantInt *Step)
        : StartValue(Start), IK(K), StepValue(Step) {
      assert(IK != IK_NoInduction && "Not an induction");
      assert(StartValue && "StartValue is null");
      assert(StepValue && !StepValue->isZero() && "StepValue is zero");
      assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) &&
             "StartValue is not a pointer for pointer induction");
      assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) &&
             "StartValue is not an integer for integer induction");
      assert(StepValue->getType()->isIntegerTy() &&
             "StepValue is not an integer");
    }
    InductionInfo()
        : StartValue(nullptr), IK(IK_NoInduction), StepValue(nullptr) {}

    /// Get the consecutive direction. Returns:
    ///   0 - unknown or non-consecutive.
    ///   1 - consecutive and increasing.
    ///  -1 - consecutive and decreasing.
    int getConsecutiveDirection() const {
      if (StepValue && (StepValue->isOne() || StepValue->isMinusOne()))
        return StepValue->getSExtValue();
      return 0;
    }

    /// Compute the transformed value of Index at offset StartValue using step
    /// StepValue.
    /// For integer induction, returns StartValue + Index * StepValue.
    /// For pointer induction, returns StartValue[Index * StepValue].
    /// FIXME: The newly created binary instructions should contain nsw/nuw
    /// flags, which can be found from the original scalar operations.
    Value *transform(IRBuilder<> &B, Value *Index) const {
      switch (IK) {
      case IK_IntInduction:
        assert(Index->getType() == StartValue->getType() &&
               "Index type does not match StartValue type");
        if (StepValue->isMinusOne())
          return B.CreateSub(StartValue, Index);
        if (!StepValue->isOne())
          Index = B.CreateMul(Index, StepValue);
        return B.CreateAdd(StartValue, Index);

      case IK_PtrInduction:
        if (StepValue->isMinusOne())
          Index = B.CreateNeg(Index);
        else if (!StepValue->isOne())
          Index = B.CreateMul(Index, StepValue);
        return B.CreateGEP(StartValue, Index);

      case IK_NoInduction:
        return nullptr;
      }
      llvm_unreachable("invalid enum");
    }
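    // For example (a worked sketch): for an integer induction with
    // StartValue = 10 and StepValue = 3, transform(B, 4) emits IR computing
    // 10 + 4 * 3 = 22; for a pointer induction it emits the address
    // &StartValue[4 * 3] via a GEP.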

    /// Start value.
    TrackingVH<Value> StartValue;
    /// Induction kind.
    InductionKind IK;
    /// Step value.
    ConstantInt *StepValue;
  };

  /// ReductionList contains the reduction descriptors for all
  /// of the reductions that were found in the loop.
  typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;

  /// InductionList saves induction variables and maps them to the
  /// induction descriptor.
  typedef MapVector<PHINode*, InductionInfo> InductionList;

  /// Returns true if it is legal to vectorize this loop.
  /// This does not mean that it is profitable to vectorize this
  /// loop, only that it is legal to do so.
  bool canVectorize();

  /// Returns the Induction variable.
  PHINode *getInduction() { return Induction; }

  /// Returns the reduction variables found in the loop.
  ReductionList *getReductionVars() { return &Reductions; }

  /// Returns the induction variables found in the loop.
  InductionList *getInductionVars() { return &Inductions; }

  /// Returns the widest induction type.
  Type *getWidestInductionType() { return WidestIndTy; }

  /// Returns True if V is an induction variable in this loop.
  bool isInductionVariable(const Value *V);

  /// Return true if the block BB needs to be predicated in order for the loop
  /// to be vectorized.
  bool blockNeedsPredication(BasicBlock *BB);

  /// Check if this pointer is consecutive when vectorizing. This happens
  /// when the last index of the GEP is the induction variable, or when the
  /// pointer itself is an induction variable.
  /// This check allows us to vectorize A[idx] into a wide load/store.
  /// Returns:
  /// 0 - Stride is unknown or non-consecutive.
  /// 1 - Address is consecutive.
  /// -1 - Address is consecutive, and decreasing.
  int isConsecutivePtr(Value *Ptr);
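  // For example (illustrative access patterns): A[i] is consecutive (1),
  // A[n - i] is consecutive and decreasing (-1), and A[2 * i] is
  // non-consecutive (0).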

  /// Returns true if the value V is uniform within the loop.
  bool isUniform(Value *V);

  /// Returns true if this instruction will remain scalar after vectorization.
  bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }

  /// Returns the information that we collected about the runtime memory check.
  const LoopAccessInfo::RuntimePointerCheck *getRuntimePointerCheck() const {
    return LAI->getRuntimePointerCheck();
  }

  const LoopAccessInfo *getLAI() const {
    return LAI;
  }

  /// This function returns the identity element (or neutral element) for
  /// the operation K.
  static Constant *getReductionIdentity(ReductionKind K, Type *Tp);

  unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }

  bool hasStride(Value *V) { return StrideSet.count(V); }
  bool mustCheckStrides() { return !StrideSet.empty(); }
  SmallPtrSet<Value *, 8>::iterator strides_begin() {
    return StrideSet.begin();
  }
  SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); }

  /// Returns true if the target machine supports the masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
    return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr));
  }
  /// Returns true if the target machine supports the masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
    return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr));
  }
  /// Returns true if the vector representation of the instruction \p I
  /// requires a mask.
  bool isMaskRequired(const Instruction* I) {
    return (MaskedOp.count(I) != 0);
  }
  unsigned getNumStores() const {
    return LAI->getNumStores();
  }
  unsigned getNumLoads() const {
    return LAI->getNumLoads();
  }
  unsigned getNumPredStores() const {
    return NumPredStores;
  }
private:
  /// Check if a single basic block loop is vectorizable.
  /// At this point we know that this is a loop with a constant trip count
  /// and we only need to check individual instructions.
  bool canVectorizeInstrs();

  /// When we vectorize loops we may change the order in which
  /// we read and write from memory. This method checks if it is
  /// legal to vectorize the code, considering only memory constraints.
  /// Returns true if the loop is vectorizable.
  bool canVectorizeMemory();

  /// Return true if we can vectorize this loop using the IF-conversion
  /// transformation.
  bool canVectorizeWithIfConvert();

  /// Collect the variables that need to stay uniform after vectorization.
  void collectLoopUniforms();

  /// Return true if all of the instructions in the block can be speculatively
  /// executed. \p SafePtrs is a list of addresses that are known to be legal
  /// and we know that we can read from them without a segfault.
  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);

  /// Returns true if 'Phi' is the kind of reduction variable for type
  /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
  /// Returns a struct describing if the instruction 'I' can be a reduction
  /// variable of type 'Kind'. If the reduction is a min/max pattern of
  /// select(icmp()) this function advances the instruction pointer 'I' from the
  /// compare instruction to the select instruction and stores this pointer in
  /// the 'PatternLastInst' member of the returned struct.
  ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind,
                                     ReductionInstDesc &Desc);
  /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
  /// pattern corresponding to a min(X, Y) or max(X, Y).
  static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I,
                                                    ReductionInstDesc &Prev);
  /// Returns the induction kind of Phi and records the step. This function may
  /// return NoInduction if the PHI is not an induction variable.
  InductionKind isInductionVariable(PHINode *Phi, ConstantInt *&StepValue);

  /// \brief Collect memory accesses with loop invariant strides.
  ///
  /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop
  /// invariant.
  void collectStridedAccess(Value *LoadOrStoreInst);

  /// Report an analysis message to assist the user in diagnosing loops that are
  /// not vectorized.  These are handled as LoopAccessReport rather than
  /// VectorizationReport because the << operator of VectorizationReport returns
  /// LoopAccessReport.
  void emitAnalysis(const LoopAccessReport &Message) {
    LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME);
  }

  unsigned NumPredStores;

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// DataLayout analysis.
  const DataLayout *DL;
  /// Target Library Info.
  TargetLibraryInfo *TLI;
  /// Parent function.
  Function *TheFunction;
  /// Target Transform Info.
  const TargetTransformInfo *TTI;
  /// Dominator Tree.
  DominatorTree *DT;
  // LoopAccess analysis.
  LoopAccessAnalysis *LAA;
  // And the loop-accesses info corresponding to this loop.  This pointer is
  // null until canVectorizeMemory sets it up.
  const LoopAccessInfo *LAI;

  //  ---  vectorization state --- //

  /// Holds the integer induction variable. This is the counter of the
  /// loop.
  PHINode *Induction;
  /// Holds the reduction variables.
  ReductionList Reductions;
  /// Holds all of the induction variables that we found in the loop.
  /// Notice that inductions don't need to start at zero and that induction
  /// variables can be pointers.
  InductionList Inductions;
  /// Holds the widest induction type encountered.
  Type *WidestIndTy;

  /// Allowed outside users. This holds the reduction
  /// vars which can be accessed from outside the loop.
  SmallPtrSet<Value*, 4> AllowedExit;
  /// This set holds the variables which are known to be uniform after
  /// vectorization.
  SmallPtrSet<Instruction*, 4> Uniforms;

  /// Can we assume the absence of NaNs.
  bool HasFunNoNaNAttr;

  ValueToValueMap Strides;
  SmallPtrSet<Value *, 8> StrideSet;

  /// While vectorizing these instructions we have to generate a
  /// call to the appropriate masked intrinsic.
  SmallPtrSet<const Instruction*, 8> MaskedOp;
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const DataLayout *DL, const TargetLibraryInfo *TLI,
                             AssumptionCache *AC, const Function *F,
                             const LoopVectorizeHints *Hints)
      : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI),
        TheFunction(F), Hints(Hints) {
    CodeMetrics::collectEphemeralValues(L, AC, EphValues);
  }

  /// Information about vectorization costs.
  struct VectorizationFactor {
    unsigned Width; // Vector width with best cost.
    unsigned Cost;  // Cost of the loop with that width.
  };
  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to VF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(bool OptForSize);

  /// \return The size (in bits) of the widest type in the code that
  /// needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  unsigned getWidestType();

  /// \return The most profitable unroll factor.
  /// If UserUF is non-zero then this method finds the best unroll-factor
  /// based on register pressure and other parameters.
  /// VF and LoopCost are the selected vectorization factor and the cost of the
  /// selected VF.
  unsigned selectUnrollFactor(bool OptForSize, unsigned VF, unsigned LoopCost);

  /// \brief A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    unsigned LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    unsigned MaxLocalUsers;
    /// Holds the number of instructions in the loop.
    unsigned NumInstructions;
  };

  /// \return  information about the register usage of the loop.
  RegisterUsage calculateRegisterUsage();

private:
  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  unsigned expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  unsigned getInstructionCost(Instruction *I, unsigned VF);

  /// A helper function for converting Scalar types to vector types.
  /// If the incoming type is void, we return void. If the VF is 1, we return
  /// the scalar type.
  static Type* ToVectorTy(Type *Scalar, unsigned VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Report an analysis message to assist the user in diagnosing loops that are
  /// not vectorized.  These are handled as LoopAccessReport rather than
  /// VectorizationReport because the << operator of VectorizationReport returns
  /// LoopAccessReport.
  void emitAnalysis(const LoopAccessReport &Message) {
    LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME);
  }

  /// Values used only by @llvm.assume calls.
  SmallPtrSet<const Value *, 32> EphValues;

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// Loop Info analysis.
  LoopInfo *LI;
  /// Vectorization legality.
  LoopVectorizationLegality *Legal;
  /// Vector target information.
  const TargetTransformInfo &TTI;
  /// Target data layout information.
  const DataLayout *DL;
  /// Target Library Info.
  const TargetLibraryInfo *TLI;
  const Function *TheFunction;
  // Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;
};

/// Utility class for getting and setting loop vectorizer hints in the form
/// of loop metadata.
/// This class keeps a number of loop annotations locally (as member variables)
/// and can, upon request, write them back as metadata on the loop. It will
/// initially scan the loop for existing metadata, and will update the local
/// values based on information in the loop.
/// We cannot write all values to metadata, as the mere presence of some info,
/// for example 'force', means a decision has been made. So, we need to be
/// careful NOT to add them if the user hasn't specifically asked for it.
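/// An example of the loop metadata this class reads and writes (illustrative
/// IR, with hypothetical node numbers):
///   br i1 %cond, label %header, label %exit, !llvm.loop !0
///   !0 = distinct !{!0, !1, !2}
///   !1 = !{!"llvm.loop.vectorize.width", i32 4}
///   !2 = !{!"llvm.loop.interleave.count", i32 2}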
class LoopVectorizeHints {
  enum HintKind {
    HK_WIDTH,
    HK_UNROLL,
    HK_FORCE
  };

  /// Hint - associates name and validation with the hint value.
  struct Hint {
    const char * Name;
    unsigned Value; // This may have to change for non-numeric values.
    HintKind Kind;

    Hint(const char * Name, unsigned Value, HintKind Kind)
      : Name(Name), Value(Value), Kind(Kind) { }

    bool validate(unsigned Val) {
      switch (Kind) {
      case HK_WIDTH:
        return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
      case HK_UNROLL:
        return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
      case HK_FORCE:
        return (Val <= 1);
      }
      return false;
    }
  };

  /// Vectorization width.
  Hint Width;
  /// Vectorization interleave factor.
  Hint Interleave;
  /// Vectorization forced.
  Hint Force;

  /// Return the loop metadata prefix.
  static StringRef Prefix() { return "llvm.loop."; }

public:
  enum ForceKind {
    FK_Undefined = -1, ///< Not selected.
    FK_Disabled = 0,   ///< Forcing disabled.
    FK_Enabled = 1,    ///< Forcing enabled.
  };

  LoopVectorizeHints(const Loop *L, bool DisableInterleaving)
      : Width("vectorize.width", VectorizerParams::VectorizationFactor,
              HK_WIDTH),
        Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
        Force("vectorize.enable", FK_Undefined, HK_FORCE),
        TheLoop(L) {
    // Populate values with existing loop metadata.
    getHintsFromMetadata();

    // force-vector-interleave overrides DisableInterleaving.
    if (VectorizerParams::isInterleaveForced())
      Interleave.Value = VectorizerParams::VectorizationInterleave;

    DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
          << "LV: Interleaving disabled by the pass manager\n");
  }

  /// Mark the loop L as already vectorized by setting the width to 1.
  void setAlreadyVectorized() {
    Width.Value = Interleave.Value = 1;
    Hint Hints[] = {Width, Interleave};
    writeHintsToMetadata(Hints);
  }

  /// Dumps all the hint information.
  std::string emitRemark() const {
    VectorizationReport R;
    if (Force.Value == LoopVectorizeHints::FK_Disabled)
      R << "vectorization is explicitly disabled";
    else {
      R << "use -Rpass-analysis=loop-vectorize for more info";
      if (Force.Value == LoopVectorizeHints::FK_Enabled) {
        R << " (Force=true";
        if (Width.Value != 0)
          R << ", Vector Width=" << Width.Value;
        if (Interleave.Value != 0)
          R << ", Interleave Count=" << Interleave.Value;
        R << ")";
      }
    }

    return R.str();
  }

  unsigned getWidth() const { return Width.Value; }
  unsigned getInterleave() const { return Interleave.Value; }
  enum ForceKind getForce() const { return (ForceKind)Force.Value; }

private:
  /// Find hints specified in the loop metadata and update local values.
  void getHintsFromMetadata() {
    MDNode *LoopID = TheLoop->getLoopID();
    if (!LoopID)
      return;

    // The first operand should refer to the loop id itself.
    assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
    assert(LoopID->getOperand(0) == LoopID && "invalid loop id");

    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
      const MDString *S = nullptr;
      SmallVector<Metadata *, 4> Args;

      // The expected hint is either an MDString or an MDNode with the first
      // operand an MDString.
      if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
        if (!MD || MD->getNumOperands() == 0)
          continue;
        S = dyn_cast<MDString>(MD->getOperand(0));
        for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
          Args.push_back(MD->getOperand(i));
      } else {
        S = dyn_cast<MDString>(LoopID->getOperand(i));
        assert(Args.size() == 0 && "too many arguments for MDString");
      }

      if (!S)
        continue;

      // Check if the hint starts with the loop metadata prefix.
      StringRef Name = S->getString();
      if (Args.size() == 1)
        setHint(Name, Args[0]);
    }
  }

  /// Checks a string hint with one operand and sets the value if valid.
  void setHint(StringRef Name, Metadata *Arg) {
    if (!Name.startswith(Prefix()))
      return;
    Name = Name.substr(Prefix().size(), StringRef::npos);

    const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
    if (!C) return;
    unsigned Val = C->getZExtValue();

    Hint *Hints[] = {&Width, &Interleave, &Force};
    for (auto H : Hints) {
      if (Name == H->Name) {
        if (H->validate(Val))
          H->Value = Val;
        else
          DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
        break;
      }
    }
  }

  /// Create a new hint from a name / value pair.
  MDNode *createHintMetadata(StringRef Name, unsigned V) const {
    LLVMContext &Context = TheLoop->getHeader()->getContext();
    Metadata *MDs[] = {MDString::get(Context, Name),
                       ConstantAsMetadata::get(
                           ConstantInt::get(Type::getInt32Ty(Context), V))};
    return MDNode::get(Context, MDs);
  }

  /// Matches metadata with the hint name.
  bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
    MDString* Name = dyn_cast<MDString>(Node->getOperand(0));
    if (!Name)
      return false;

    for (auto H : HintTypes)
      if (Name->getString().endswith(H.Name))
        return true;
    return false;
  }

  /// Sets the current hints into the loop metadata, keeping other values
  /// intact.
  void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
    if (HintTypes.size() == 0)
      return;

    // Reserve the first element for LoopID (see below).
    SmallVector<Metadata *, 4> MDs(1);
    // If the loop already has metadata, then ignore the existing operands.
    MDNode *LoopID = TheLoop->getLoopID();
    if (LoopID) {
      for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
        MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
        // If the node is in the update list, ignore the old value.
        if (!matchesHintMetadataName(Node, HintTypes))
          MDs.push_back(Node);
      }
    }

    // Now, add the missing hints.
    for (auto H : HintTypes)
      MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));

    // Replace the current metadata node with the new one.
    LLVMContext &Context = TheLoop->getHeader()->getContext();
    MDNode *NewLoopID = MDNode::get(Context, MDs);
    // Set operand 0 to refer to the loop id itself.
    NewLoopID->replaceOperandWith(0, NewLoopID);

    TheLoop->setLoopID(NewLoopID);
  }

  /// The loop these hints belong to.
  const Loop *TheLoop;
};

static void emitMissedWarning(Function *F, Loop *L,
                              const LoopVectorizeHints &LH) {
  emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F,
                               L->getStartLoc(), LH.emitRemark());

  if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
    if (LH.getWidth() != 1)
      emitLoopVectorizeWarning(
          F->getContext(), *F, L->getStartLoc(),
          "failed explicitly specified loop vectorization");
    else if (LH.getInterleave() != 1)
      emitLoopInterleaveWarning(
          F->getContext(), *F, L->getStartLoc(),
          "failed explicitly specified loop interleaving");
  }
}
01235 
01236 static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
01237   if (L.empty())
01238     return V.push_back(&L);
01239 
01240   for (Loop *InnerL : L)
01241     addInnerLoop(*InnerL, V);
01242 }
01243 
01244 /// The LoopVectorize Pass.
01245 struct LoopVectorize : public FunctionPass {
01246   /// Pass identification, replacement for typeid
01247   static char ID;
01248 
01249   explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
01250     : FunctionPass(ID),
01251       DisableUnrolling(NoUnrolling),
01252       AlwaysVectorize(AlwaysVectorize) {
01253     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
01254   }
01255 
01256   ScalarEvolution *SE;
01257   const DataLayout *DL;
01258   LoopInfo *LI;
01259   TargetTransformInfo *TTI;
01260   DominatorTree *DT;
01261   BlockFrequencyInfo *BFI;
01262   TargetLibraryInfo *TLI;
01263   AliasAnalysis *AA;
01264   AssumptionCache *AC;
01265   LoopAccessAnalysis *LAA;
01266   bool DisableUnrolling;
01267   bool AlwaysVectorize;
01268 
01269   BlockFrequency ColdEntryFreq;
01270 
01271   bool runOnFunction(Function &F) override {
01272     SE = &getAnalysis<ScalarEvolution>();
01273     DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
01274     DL = DLP ? &DLP->getDataLayout() : nullptr;
01275     LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
01276     TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
01277     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
01278     BFI = &getAnalysis<BlockFrequencyInfo>();
01279     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
01280     TLI = TLIP ? &TLIP->getTLI() : nullptr;
01281     AA = &getAnalysis<AliasAnalysis>();
01282     AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
01283     LAA = &getAnalysis<LoopAccessAnalysis>();
01284 
01285     // Compute some weights outside of the loop over loops, using a
01286     // BranchProbability to reuse its scaling math.
01287     const BranchProbability ColdProb(1, 5); // 20%
01288     ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
01289 
01290     // If the target claims to have no vector registers don't attempt
01291     // vectorization.
01292     if (!TTI->getNumberOfRegisters(true))
01293       return false;
01294 
01295     if (!DL) {
01296       DEBUG(dbgs() << "\nLV: Not vectorizing " << F.getName()
01297                    << ": Missing data layout\n");
01298       return false;
01299     }
01300 
01301     // Build up a worklist of inner-loops to vectorize. This is necessary as
01302     // the act of vectorizing or partially unrolling a loop creates new loops
01303     // and can invalidate iterators across the loops.
01304     SmallVector<Loop *, 8> Worklist;
01305 
01306     for (Loop *L : *LI)
01307       addInnerLoop(*L, Worklist);
01308 
01309     LoopsAnalyzed += Worklist.size();
01310 
01311     // Now walk the identified inner loops.
01312     bool Changed = false;
01313     while (!Worklist.empty())
01314       Changed |= processLoop(Worklist.pop_back_val());
01315 
01316     // Report whether any loop in the function was changed.
01317     return Changed;
01318   }
01319 
01320   bool processLoop(Loop *L) {
01321     assert(L->empty() && "Only process inner loops.");
01322 
01323 #ifndef NDEBUG
01324     const std::string DebugLocStr = getDebugLocString(L);
01325 #endif /* NDEBUG */
01326 
01327     DEBUG(dbgs() << "\nLV: Checking a loop in \""
01328                  << L->getHeader()->getParent()->getName() << "\" from "
01329                  << DebugLocStr << "\n");
01330 
01331     LoopVectorizeHints Hints(L, DisableUnrolling);
01332 
01333     DEBUG(dbgs() << "LV: Loop hints:"
01334                  << " force="
01335                  << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
01336                          ? "disabled"
01337                          : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
01338                                 ? "enabled"
01339                                 : "?")) << " width=" << Hints.getWidth()
01340                  << " unroll=" << Hints.getInterleave() << "\n");
01341 
01342     // Function containing loop
01343     Function *F = L->getHeader()->getParent();
01344 
01345     // Looking at the diagnostic output is the only way to determine if a loop
01346     // was vectorized (other than looking at the IR or machine code), so it
01347     // is important to generate an optimization remark for each loop. Most of
01348     // these messages are generated by emitOptimizationRemarkAnalysis. Remarks
01349     // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are
01350     // less verbose, reporting vectorized loops and unvectorized loops that may
01351     // benefit from vectorization, respectively.
01352 
01353     if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) {
01354       DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
01355       emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
01356                                      L->getStartLoc(), Hints.emitRemark());
01357       return false;
01358     }
01359 
01360     if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) {
01361       DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
01362       emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
01363                                      L->getStartLoc(), Hints.emitRemark());
01364       return false;
01365     }
01366 
01367     if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) {
01368       DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
01369       emitOptimizationRemarkAnalysis(
01370           F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
01371           "loop not vectorized: vector width and interleave count are "
01372           "explicitly set to 1");
01373       return false;
01374     }
01375 
01376     // Check the loop for a trip count threshold:
01377     // do not vectorize loops with a tiny trip count.
01378     const unsigned TC = SE->getSmallConstantTripCount(L);
01379     if (TC > 0u && TC < TinyTripCountVectorThreshold) {
01380       DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
01381                    << "This loop is not worth vectorizing.");
01382       if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
01383         DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
01384       else {
01385         DEBUG(dbgs() << "\n");
01386         emitOptimizationRemarkAnalysis(
01387             F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
01388             "vectorization is not beneficial and is not explicitly forced");
01389         return false;
01390       }
01391     }
01392 
01393     // Check if it is legal to vectorize the loop.
01394     LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F, TTI, LAA);
01395     if (!LVL.canVectorize()) {
01396       DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
01397       emitMissedWarning(F, L, Hints);
01398       return false;
01399     }
01400 
01401     // Use the cost model.
01402     LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, AC, F,
01403                                   &Hints);
01404 
01405     // Check the function attributes to find out if this function should be
01406     // optimized for size.
01407     bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
01408                       F->hasFnAttribute(Attribute::OptimizeForSize);
01409 
01410     // Compute the weighted frequency of this loop being executed and see if it
01411     // is less than 20% of the function entry baseline frequency. Note that we
01412     // always have a canonical loop here because we think we *can* vectorize.
01413     // FIXME: This is hidden behind a flag due to pervasive problems with
01414     // exactly what block frequency models.
01415     if (LoopVectorizeWithBlockFrequency) {
01416       BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
01417       if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
01418           LoopEntryFreq < ColdEntryFreq)
01419         OptForSize = true;
01420     }
01421 
01422     // Check the function attributes to see if implicit floats are allowed.
01423     // FIXME: This check doesn't seem possibly correct -- what if the loop is
01424     // an integer loop and the vector instructions selected are purely integer
01425     // vector instructions?
01426     if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
01427       DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
01428             "attribute is used.\n");
01429       emitOptimizationRemarkAnalysis(
01430           F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
01431           "loop not vectorized due to NoImplicitFloat attribute");
01432       emitMissedWarning(F, L, Hints);
01433       return false;
01434     }
01435 
01436     // Select the optimal vectorization factor.
01437     const LoopVectorizationCostModel::VectorizationFactor VF =
01438         CM.selectVectorizationFactor(OptForSize);
01439 
01440     // Select the unroll factor.
01441     const unsigned UF =
01442         CM.selectUnrollFactor(OptForSize, VF.Width, VF.Cost);
01443 
01444     DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
01445                  << DebugLocStr << '\n');
01446     DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');
01447 
01448     if (VF.Width == 1) {
01449       DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n");
01450 
01451       if (UF == 1) {
01452         emitOptimizationRemarkAnalysis(
01453             F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
01454             "not beneficial to vectorize and user disabled interleaving");
01455         return false;
01456       }
01457       DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n");
01458 
01459       // Report the unrolling decision.
01460       emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
01461                              Twine("unrolled with interleaving factor " +
01462                                    Twine(UF) +
01463                                    " (vectorization not beneficial)"));
01464 
01465       // We decided not to vectorize, but we may want to unroll.
01466 
01467       InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
01468       Unroller.vectorize(&LVL);
01469     } else {
01470       // If we decided that it is *legal* to vectorize the loop then do it.
01471       InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
01472       LB.vectorize(&LVL);
01473       ++LoopsVectorized;
01474 
01475       // Report the vectorization decision.
01476       emitOptimizationRemark(
01477           F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
01478           Twine("vectorized loop (vectorization factor: ") + Twine(VF.Width) +
01479               ", unrolling interleave factor: " + Twine(UF) + ")");
01480     }
01481 
01482     // Mark the loop as already vectorized to avoid vectorizing again.
01483     Hints.setAlreadyVectorized();
01484 
01485     DEBUG(verifyFunction(*L->getHeader()->getParent()));
01486     return true;
01487   }
01488 
01489   void getAnalysisUsage(AnalysisUsage &AU) const override {
01490     AU.addRequired<AssumptionCacheTracker>();
01491     AU.addRequiredID(LoopSimplifyID);
01492     AU.addRequiredID(LCSSAID);
01493     AU.addRequired<BlockFrequencyInfo>();
01494     AU.addRequired<DominatorTreeWrapperPass>();
01495     AU.addRequired<LoopInfoWrapperPass>();
01496     AU.addRequired<ScalarEvolution>();
01497     AU.addRequired<TargetTransformInfoWrapperPass>();
01498     AU.addRequired<AliasAnalysis>();
01499     AU.addRequired<LoopAccessAnalysis>();
01500     AU.addPreserved<LoopInfoWrapperPass>();
01501     AU.addPreserved<DominatorTreeWrapperPass>();
01502     AU.addPreserved<AliasAnalysis>();
01503   }
01504 
01505 };
01506 
01507 } // end anonymous namespace
01508 
01509 //===----------------------------------------------------------------------===//
01510 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
01511 // LoopVectorizationCostModel.
01512 //===----------------------------------------------------------------------===//
01513 
01514 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
01515   // We need to place the broadcast of invariant variables outside the loop.
01516   Instruction *Instr = dyn_cast<Instruction>(V);
01517   bool NewInstr =
01518       (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(),
01519                           Instr->getParent()) != LoopVectorBody.end());
01520   bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
01521 
01522   // Place the code for broadcasting invariant variables in the new preheader.
01523   IRBuilder<>::InsertPointGuard Guard(Builder);
01524   if (Invariant)
01525     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
01526 
01527   // Broadcast the scalar into all locations in the vector.
01528   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
01529 
01530   return Shuf;
01531 }
01532 
01533 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
01534                                           Value *Step) {
01535   assert(Val->getType()->isVectorTy() && "Must be a vector");
01536   assert(Val->getType()->getScalarType()->isIntegerTy() &&
01537          "Elem must be an integer");
01538   assert(Step->getType() == Val->getType()->getScalarType() &&
01539          "Step has wrong type");
01540   // Create the types.
01541   Type *ITy = Val->getType()->getScalarType();
01542   VectorType *Ty = cast<VectorType>(Val->getType());
01543   int VLen = Ty->getNumElements();
01544   SmallVector<Constant*, 8> Indices;
01545 
01546   // Create a vector of consecutive numbers: StartIdx, StartIdx + 1, ...
01547   for (int i = 0; i < VLen; ++i)
01548     Indices.push_back(ConstantInt::get(ITy, StartIdx + i));
01549 
01550   // Add the consecutive indices to the vector value.
01551   Constant *Cv = ConstantVector::get(Indices);
01552   assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
01553   Step = Builder.CreateVectorSplat(VLen, Step);
01554   assert(Step->getType() == Val->getType() && "Invalid step vec");
01555   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
01556   // which can be found from the original scalar operations.
01557   Step = Builder.CreateMul(Cv, Step);
01558   return Builder.CreateAdd(Val, Step, "induction");
01559 }
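// Illustrative example: with VLen = 4, StartIdx = 0 and Step = 1 this
// computes Val + <0, 1, 2, 3> * 1, so each lane of the induction vector is
// offset by its lane index.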
01560 
01561 /// \brief Find the operand of the GEP that should be checked for consecutive
01562 /// access. This ignores trailing indices that have no effect on the final
01563 /// pointer.
01564 static unsigned getGEPInductionOperand(const DataLayout *DL,
01565                                        const GetElementPtrInst *Gep) {
01566   unsigned LastOperand = Gep->getNumOperands() - 1;
01567   unsigned GEPAllocSize = DL->getTypeAllocSize(
01568       cast<PointerType>(Gep->getType()->getScalarType())->getElementType());
01569 
01570   // Walk backwards and try to peel off zeros.
01571   while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
01572     // Find the type we're currently indexing into.
01573     gep_type_iterator GEPTI = gep_type_begin(Gep);
01574     std::advance(GEPTI, LastOperand - 1);
01575 
01576     // If it's a type with the same allocation size as the result of the GEP we
01577     // can peel off the zero index.
01578     if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize)
01579       break;
01580     --LastOperand;
01581   }
01582 
01583   return LastOperand;
01584 }
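// Illustrative example: in "getelementptr [1 x i32]* %A, i64 %iv, i64 0"
// the trailing zero indexes a type whose allocation size matches that of
// the GEP result, so it is peeled off and operand 1 (%iv) is returned.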
01585 
01586 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
01587   assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
01588   // Make sure that the pointer does not point to structs.
01589   if (Ptr->getType()->getPointerElementType()->isAggregateType())
01590     return 0;
01591 
01592   // If this value is a pointer induction variable we know it is consecutive.
01593   PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
01594   if (Phi && Inductions.count(Phi)) {
01595     InductionInfo II = Inductions[Phi];
01596     return II.getConsecutiveDirection();
01597   }
01598 
01599   GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
01600   if (!Gep)
01601     return 0;
01602 
01603   unsigned NumOperands = Gep->getNumOperands();
01604   Value *GpPtr = Gep->getPointerOperand();
01605   // If this GEP value is a consecutive pointer induction variable and all of
01606   // the indices are loop invariant then we know it is consecutive.
01607   Phi = dyn_cast<PHINode>(GpPtr);
01608   if (Phi && Inductions.count(Phi)) {
01609 
01610     // Make sure that the pointer does not point to structs.
01611     PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
01612     if (GepPtrType->getElementType()->isAggregateType())
01613       return 0;
01614 
01615     // Make sure that all of the index operands are loop invariant.
01616     for (unsigned i = 1; i < NumOperands; ++i)
01617       if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
01618         return 0;
01619 
01620     InductionInfo II = Inductions[Phi];
01621     return II.getConsecutiveDirection();
01622   }
01623 
01624   unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
01625 
01626   // Check that all of the gep indices are uniform except for our induction
01627   // operand.
01628   for (unsigned i = 0; i != NumOperands; ++i)
01629     if (i != InductionOperand &&
01630         !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
01631       return 0;
01632 
01633   // We can emit wide load/stores only if the last non-zero index is the
01634   // induction variable.
01635   const SCEV *Last = nullptr;
01636   if (!Strides.count(Gep))
01637     Last = SE->getSCEV(Gep->getOperand(InductionOperand));
01638   else {
01639     // Because of the multiplication by a stride we can have a s/zext cast.
01640     // We are going to replace this stride by 1 so the cast is safe to ignore.
01641     //
01642     //  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
01643     //  %0 = trunc i64 %indvars.iv to i32
01644     //  %mul = mul i32 %0, %Stride1
01645     //  %idxprom = zext i32 %mul to i64  << Safe cast.
01646     //  %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom
01647     //
01648     Last = replaceSymbolicStrideSCEV(SE, Strides,
01649                                      Gep->getOperand(InductionOperand), Gep);
01650     if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last))
01651       Last =
01652           (C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend)
01653               ? C->getOperand()
01654               : Last;
01655   }
01656   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
01657     const SCEV *Step = AR->getStepRecurrence(*SE);
01658 
01659     // The memory is consecutive because the last index is consecutive
01660     // and all other indices are loop invariant.
01661     if (Step->isOne())
01662       return 1;
01663     if (Step->isAllOnesValue())
01664       return -1;
01665   }
01666 
01667   return 0;
01668 }
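// Illustrative examples of the return value: &A[i] yields 1, &A[N - i]
// (a reverse traversal) yields -1, and a non-unit stride such as &A[2 * i]
// yields 0 (not consecutive).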
01669 
01670 bool LoopVectorizationLegality::isUniform(Value *V) {
01671   return LAI->isUniform(V);
01672 }
01673 
01674 InnerLoopVectorizer::VectorParts&
01675 InnerLoopVectorizer::getVectorValue(Value *V) {
01676   assert(V != Induction && "The new induction variable should not be used.");
01677   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
01678 
01679   // If we have a stride that is replaced by one, do it here.
01680   if (Legal->hasStride(V))
01681     V = ConstantInt::get(V->getType(), 1);
01682 
01683   // If we have this scalar in the map, return it.
01684   if (WidenMap.has(V))
01685     return WidenMap.get(V);
01686 
01687   // If this scalar is unknown, assume that it is a constant or that it is
01688   // loop invariant. Broadcast V and save the value for future uses.
01689   Value *B = getBroadcastInstrs(V);
01690   return WidenMap.splat(V, B);
01691 }
01692 
01693 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
01694   assert(Vec->getType()->isVectorTy() && "Invalid type");
01695   SmallVector<Constant*, 8> ShuffleMask;
01696   for (unsigned i = 0; i < VF; ++i)
01697     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
01698 
01699   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
01700                                      ConstantVector::get(ShuffleMask),
01701                                      "reverse");
01702 }
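// Illustrative example: for VF = 4 the shuffle mask is <3, 2, 1, 0>, so
// <a, b, c, d> becomes <d, c, b, a>.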
01703 
01704 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
01705   // Attempt to issue a wide load.
01706   LoadInst *LI = dyn_cast<LoadInst>(Instr);
01707   StoreInst *SI = dyn_cast<StoreInst>(Instr);
01708 
01709   assert((LI || SI) && "Invalid Load/Store instruction");
01710 
01711   Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
01712   Type *DataTy = VectorType::get(ScalarDataTy, VF);
01713   Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
01714   unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
01715   // An alignment of 0 means target ABI alignment. We need to use the scalar's
01716   // target ABI alignment in such a case.
01717   if (!Alignment)
01718     Alignment = DL->getABITypeAlignment(ScalarDataTy);
01719   unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
01720   unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
01721   unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
01722 
01723   if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
01724       !Legal->isMaskRequired(SI))
01725     return scalarizeInstruction(Instr, true);
01726 
01727   if (ScalarAllocatedSize != VectorElementSize)
01728     return scalarizeInstruction(Instr);
01729 
01730   // If the pointer is loop invariant or if it is non-consecutive,
01731   // scalarize the load/store.
01732   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
01733   bool Reverse = ConsecutiveStride < 0;
01734   bool UniformLoad = LI && Legal->isUniform(Ptr);
01735   if (!ConsecutiveStride || UniformLoad)
01736     return scalarizeInstruction(Instr);
01737 
01738   Constant *Zero = Builder.getInt32(0);
01739   VectorParts &Entry = WidenMap.get(Instr);
01740 
01741   // Handle consecutive loads/stores.
01742   GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
01743   if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
01744     setDebugLocFromInst(Builder, Gep);
01745     Value *PtrOperand = Gep->getPointerOperand();
01746     Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
01747     FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
01748 
01749     // Create the new GEP with the new induction variable.
01750     GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
01751     Gep2->setOperand(0, FirstBasePtr);
01752     Gep2->setName("gep.indvar.base");
01753     Ptr = Builder.Insert(Gep2);
01754   } else if (Gep) {
01755     setDebugLocFromInst(Builder, Gep);
01756     assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
01757                                OrigLoop) && "Base ptr must be invariant");
01758 
01759     // The last index does not have to be the induction variable. It can be
01760     // consecutive and a function of the induction, e.g. A[i + 1].
01761     unsigned NumOperands = Gep->getNumOperands();
01762     unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
01763     // Create the new GEP with the new induction variable.
01764     GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
01765 
01766     for (unsigned i = 0; i < NumOperands; ++i) {
01767       Value *GepOperand = Gep->getOperand(i);
01768       Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);
01769 
01770       // Update last index or loop invariant instruction anchored in loop.
01771       if (i == InductionOperand ||
01772           (GepOperandInst && OrigLoop->contains(GepOperandInst))) {
01773         assert((i == InductionOperand ||
01774                SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&
01775                "Must be last index or loop invariant");
01776 
01777         VectorParts &GEPParts = getVectorValue(GepOperand);
01778         Value *Index = GEPParts[0];
01779         Index = Builder.CreateExtractElement(Index, Zero);
01780         Gep2->setOperand(i, Index);
01781         Gep2->setName("gep.indvar.idx");
01782       }
01783     }
01784     Ptr = Builder.Insert(Gep2);
01785   } else {
01786     // Use the induction element ptr.
01787     assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
01788     setDebugLocFromInst(Builder, Ptr);
01789     VectorParts &PtrVal = getVectorValue(Ptr);
01790     Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
01791   }
01792 
01793   VectorParts Mask = createBlockInMask(Instr->getParent());
01794   // Handle Stores:
01795   if (SI) {
01796     assert(!Legal->isUniform(SI->getPointerOperand()) &&
01797            "We do not allow storing to uniform addresses");
01798     setDebugLocFromInst(Builder, SI);
01799     // We don't want to update the value in the map as it might be used in
01800     // another expression. So don't use a reference type for "StoredVal".
01801     VectorParts StoredVal = getVectorValue(SI->getValueOperand());
01802     
01803     for (unsigned Part = 0; Part < UF; ++Part) {
01804       // Calculate the pointer for the specific unroll-part.
01805       Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
01806 
01807       if (Reverse) {
01808         // If we store to reverse consecutive memory locations then we need
01809         // to reverse the order of elements in the stored value.
01810         StoredVal[Part] = reverseVector(StoredVal[Part]);
01811         // If the address is consecutive but reversed, then the
01812         // wide store needs to start at the last vector element.
01813         PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
01814         PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
01815         Mask[Part] = reverseVector(Mask[Part]);
01816       }
01817 
01818       Value *VecPtr = Builder.CreateBitCast(PartPtr,
01819                                             DataTy->getPointerTo(AddressSpace));
01820 
01821       Instruction *NewSI;
01822       if (Legal->isMaskRequired(SI))
01823         NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
01824                                           Mask[Part]);
01825       else 
01826         NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
01827       propagateMetadata(NewSI, SI);
01828     }
01829     return;
01830   }
01831 
01832   // Handle loads.
01833   assert(LI && "Must have a load instruction");
01834   setDebugLocFromInst(Builder, LI);
01835   for (unsigned Part = 0; Part < UF; ++Part) {
01836     // Calculate the pointer for the specific unroll-part.
01837     Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
01838 
01839     if (Reverse) {
01840       // If the address is consecutive but reversed, then the
01841       // wide load needs to start at the last vector element.
01842       PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
01843       PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
01844       Mask[Part] = reverseVector(Mask[Part]);
01845     }
01846 
01847     Instruction* NewLI;
01848     Value *VecPtr = Builder.CreateBitCast(PartPtr,
01849                                           DataTy->getPointerTo(AddressSpace));
01850     if (Legal->isMaskRequired(LI))
01851       NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
01852                                        UndefValue::get(DataTy),
01853                                        "wide.masked.load");
01854     else
01855       NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
01856     propagateMetadata(NewLI, LI);
01857     Entry[Part] = Reverse ? reverseVector(NewLI) :  NewLI;
01858   }
01859 }
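// Illustrative sketch (names hypothetical): a consecutive i32 load with
// VF = 4 is widened into
//   %vec.ptr = bitcast i32* %ptr to <4 x i32>*
//   %wide.load = load <4 x i32>* %vec.ptr, align 4
// whereas a reverse-consecutive access additionally offsets the pointer to
// the last element and reverses the loaded vector.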
01860 
01861 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) {
01862   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
01863   // Holds vector parameters or scalars, in case of uniform values.
01864   SmallVector<VectorParts, 4> Params;
01865 
01866   setDebugLocFromInst(Builder, Instr);
01867 
01868   // Find all of the vectorized parameters.
01869   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
01870     Value *SrcOp = Instr->getOperand(op);
01871 
01872     // If we are accessing the old induction variable, use the new one.
01873     if (SrcOp == OldInduction) {
01874       Params.push_back(getVectorValue(SrcOp));
01875       continue;
01876     }
01877 
01878     // Try using previously calculated values.
01879     Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
01880 
01881     // If the src is an instruction that appeared earlier in the basic block
01882     // then it should already be vectorized.
01883     if (SrcInst && OrigLoop->contains(SrcInst)) {
01884       assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
01885       // The parameter is a vector value from earlier.
01886       Params.push_back(WidenMap.get(SrcInst));
01887     } else {
01888       // The parameter is a scalar from outside the loop. Maybe even a constant.
01889       VectorParts Scalars;
01890       Scalars.append(UF, SrcOp);
01891       Params.push_back(Scalars);
01892     }
01893   }
01894 
01895   assert(Params.size() == Instr->getNumOperands() &&
01896          "Invalid number of operands");
01897 
01898   // Does this instruction return a value?
01899   bool IsVoidRetTy = Instr->getType()->isVoidTy();
01900 
01901   Value *UndefVec = IsVoidRetTy ? nullptr :
01902     UndefValue::get(VectorType::get(Instr->getType(), VF));
01903   // Create a new entry in the WidenMap and initialize it to Undef or Null.
01904   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
01905 
01906   Instruction *InsertPt = Builder.GetInsertPoint();
01907   BasicBlock *IfBlock = Builder.GetInsertBlock();
01908   BasicBlock *CondBlock = nullptr;
01909 
01910   VectorParts Cond;
01911   Loop *VectorLp = nullptr;
01912   if (IfPredicateStore) {
01913     assert(Instr->getParent()->getSinglePredecessor() &&
01914            "Only support single predecessor blocks");
01915     Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
01916                           Instr->getParent());
01917     VectorLp = LI->getLoopFor(IfBlock);
01918     assert(VectorLp && "Must have a loop for this block");
01919   }
01920 
01921   // For each vector unroll 'part':
01922   for (unsigned Part = 0; Part < UF; ++Part) {
01923     // For each scalar that we create:
01924     for (unsigned Width = 0; Width < VF; ++Width) {
01925 
01926       // Start if-block.
01927       Value *Cmp = nullptr;
01928       if (IfPredicateStore) {
01929         Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
01930         Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1));
01931         CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
01932         LoopVectorBody.push_back(CondBlock);
01933         VectorLp->addBasicBlockToLoop(CondBlock, *LI);
01934         // Update Builder with newly created basic block.
01935         Builder.SetInsertPoint(InsertPt);
01936       }
01937 
01938       Instruction *Cloned = Instr->clone();
01939       if (!IsVoidRetTy)
01940         Cloned->setName(Instr->getName() + ".cloned");
01941       // Replace the operands of the cloned instruction with extracted scalars.
01942       for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
01943         Value *Op = Params[op][Part];
01944         // Param is a vector. Need to extract the right lane.
01945         if (Op->getType()->isVectorTy())
01946           Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
01947         Cloned->setOperand(op, Op);
01948       }
01949 
01950       // Place the cloned scalar in the new loop.
01951       Builder.Insert(Cloned);
01952 
01953       // If the original scalar returns a value we need to place it in a vector
01954       // so that future users will be able to use it.
01955       if (!IsVoidRetTy)
01956         VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
01957                                                        Builder.getInt32(Width));
01958       // End if-block.
01959       if (IfPredicateStore) {
01960          BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
01961          LoopVectorBody.push_back(NewIfBlock);
01962          VectorLp->addBasicBlockToLoop(NewIfBlock, *LI);
01963          Builder.SetInsertPoint(InsertPt);
01964          Instruction *OldBr = IfBlock->getTerminator();
01965          BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
01966          OldBr->eraseFromParent();
01967          IfBlock = NewIfBlock;
01968       }
01969     }
01970   }
01971 }
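// Illustrative sketch (names hypothetical): scalarizing "%r = udiv i32 %a,
// %b" with VF = 2 extracts lane 0 and lane 1 from the widened operands,
// clones the udiv once per lane, and reassembles the results with
// insertelement; with IfPredicateStore each clone is additionally wrapped
// in its own conditional block.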
01972 
01973 static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
01974                                  Instruction *Loc) {
01975   if (FirstInst)
01976     return FirstInst;
01977   if (Instruction *I = dyn_cast<Instruction>(V))
01978     return I->getParent() == Loc->getParent() ? I : nullptr;
01979   return nullptr;
01980 }
01981 
01982 std::pair<Instruction *, Instruction *>
01983 InnerLoopVectorizer::addStrideCheck(Instruction *Loc) {
01984   Instruction *tnullptr = nullptr;
01985   if (!Legal->mustCheckStrides())
01986     return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);
01987 
01988   IRBuilder<> ChkBuilder(Loc);
01989 
01990   // Emit checks.
01991   Value *Check = nullptr;
01992   Instruction *FirstInst = nullptr;
01993   for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(),
01994                                          SE = Legal->strides_end();
01995        SI != SE; ++SI) {
01996     Value *Ptr = stripIntegerCast(*SI);
01997     Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1),
01998                                        "stride.chk");
01999     // Store the first instruction we create.
02000     FirstInst = getFirstInst(FirstInst, C, Loc);
02001     if (Check)
02002       Check = ChkBuilder.CreateOr(Check, C);
02003     else
02004       Check = C;
02005   }
02006 
02007   // We have to do this trickery because the IRBuilder might fold the check to a
02008   // constant expression in which case there is no Instruction anchored in
02009   // the block.
02010   LLVMContext &Ctx = Loc->getContext();
02011   Instruction *TheCheck =
02012       BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx));
02013   ChkBuilder.Insert(TheCheck, "stride.not.one");
02014   FirstInst = getFirstInst(FirstInst, TheCheck, Loc);
02015 
02016   return std::make_pair(FirstInst, TheCheck);
02017 }
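// Illustrative example: for symbolic strides %s1 and %s2 that were assumed
// to be one, this emits "icmp ne %s1, 1" and "icmp ne %s2, 1" and ORs the
// results, so a true outcome bypasses the vector body.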
02018 
02019 void InnerLoopVectorizer::createEmptyLoop() {
02020   /*
02021    In this function we generate a new loop. The new loop will contain
02022    the vectorized instructions while the old loop will continue to run the
02023    scalar remainder.
02024 
02025        [ ] <-- Back-edge taken count overflow check.
02026     /   |
02027    /    v
02028   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
02029   |  /  |
02030   | /   v
02031   ||   [ ]     <-- vector pre header.
02032   ||    |
02033   ||    v
02034   ||   [  ] \
02035   ||   [  ]_|   <-- vector loop.
02036   ||    |
02037   | \   v
02038   |   >[ ]   <--- middle-block.
02039   |  /  |
02040   | /   v
02041   -|- >[ ]     <--- new preheader.
02042    |    |
02043    |    v
02044    |   [ ] \
02045    |   [ ]_|   <-- old scalar loop to handle remainder.
02046     \   |
02047      \  v
02048       >[ ]     <-- exit block.
02049    ...
02050    */
02051 
02052   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
02053   BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
02054   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
02055   assert(BypassBlock && "Invalid loop structure");
02056   assert(ExitBlock && "Must have an exit block");
02057 
02058   // Some loops have a single integer induction variable, while other loops
02059   // don't. One example is C++ iterators, which often have multiple pointer
02060   // induction variables. The code below also supports the case where there
02061   // is no single induction variable.
02062   OldInduction = Legal->getInduction();
02063   Type *IdxTy = Legal->getWidestInductionType();
02064 
02065   // Find the loop boundaries.
02066   const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);
02067   assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
02068 
02069   // The exit count might have type i64 while the phi is i32. This can
02070   // happen if we have an induction variable that is sign extended before the
02071   // compare. The only way that we get a backedge-taken count is if the
02072   // induction variable was signed, and as such it will not overflow. In such
02073   // a case truncation is legal.
02074   if (ExitCount->getType()->getPrimitiveSizeInBits() >
02075       IdxTy->getPrimitiveSizeInBits())
02076     ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);
02077 
02078   const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
02079   // Get the total trip count from the count by adding 1.
02080   ExitCount = SE->getAddExpr(BackedgeTakeCount,
02081                              SE->getConstant(BackedgeTakeCount->getType(), 1));
02082 
02083   // Expand the trip count and place the new instructions in the preheader.
02084   // Notice that the pre-header does not change, only the loop body.
02085   SCEVExpander Exp(*SE, "induction");
02086 
02087   // We need to test whether the backedge-taken count is uint##_max. Adding one
02088   // to it will cause overflow and an incorrect loop trip count in the vector
02089   // body. In case of overflow we want to directly jump to the scalar remainder
02090   // loop.
02091   Value *BackedgeCount =
02092       Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(),
02093                         BypassBlock->getTerminator());
02094   if (BackedgeCount->getType()->isPointerTy())
02095     BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy,
02096                                                 "backedge.ptrcnt.to.int",
02097                                                 BypassBlock->getTerminator());
02098   Instruction *CheckBCOverflow =
02099       CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount,
02100                       Constant::getAllOnesValue(BackedgeCount->getType()),
02101                       "backedge.overflow", BypassBlock->getTerminator());
02102 
02103   // The loop index does not have to start at zero. Find the original start
02104   // value from the induction PHI node. If we don't have an induction variable
02105   // then we know that it starts at zero.
02106   Builder.SetInsertPoint(BypassBlock->getTerminator());
02107   Value *StartIdx = ExtendedIdx = OldInduction ?
02108     Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock),
02109                        IdxTy):
02110     ConstantInt::get(IdxTy, 0);
02111 
02112   // We need an instruction to anchor the overflow check on. StartIdx needs to
02113   // be defined before the overflow check branch: the scalar preheader is
02114   // going to merge the start index, so the overflow branch block needs to
02115   // contain a definition of the start index.
02116   Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd(
02117       StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor",
02118       BypassBlock->getTerminator());
02119 
02120   // Count holds the overall loop count (N).
02121   Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
02122                                    BypassBlock->getTerminator());
02123 
02124   LoopBypassBlocks.push_back(BypassBlock);
02125 
02126   // Split the single-block loop into the two-loop structure described above.
02127   BasicBlock *VectorPH =
02128   BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
02129   BasicBlock *VecBody =
02130   VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
02131   BasicBlock *MiddleBlock =
02132   VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
02133   BasicBlock *ScalarPH =
02134   MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
02135 
02136   // Create and register the new vector loop.
02137   Loop* Lp = new Loop();
02138   Loop *ParentLoop = OrigLoop->getParentLoop();
02139 
02140   // Insert the new loop into the loop nest and register the new basic blocks
02141   // before calling any utilities such as SCEV that require valid LoopInfo.
02142   if (ParentLoop) {
02143     ParentLoop->addChildLoop(Lp);
02144     ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
02145     ParentLoop->addBasicBlockToLoop(VectorPH, *LI);
02146     ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
02147   } else {
02148     LI->addTopLevelLoop(Lp);
02149   }
02150   Lp->addBasicBlockToLoop(VecBody, *LI);
02151 
02152   // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
02153   // inside the loop.
02154   Builder.SetInsertPoint(VecBody->getFirstNonPHI());
02155 
02156   // Generate the induction variable.
02157   setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
02158   Induction = Builder.CreatePHI(IdxTy, 2, "index");
02159   // The loop step is equal to the vectorization factor (num of SIMD elements)
02160   // times the unroll factor (num of SIMD instructions).
02161   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
02162 
02163   // This is the IR builder that we use to add all of the logic for bypassing
02164   // the new vector loop.
02165   IRBuilder<> BypassBuilder(BypassBlock->getTerminator());
02166   setDebugLocFromInst(BypassBuilder,
02167                       getDebugLocFromInstOrOperands(OldInduction));
02168 
02169   // We may need to extend the index in case there is a type mismatch.
02170   // We know that the count starts at zero and does not overflow.
02171   if (Count->getType() != IdxTy) {
02172     // The exit count can be of pointer type. Convert it to the correct
02173     // integer type.
02174     if (ExitCount->getType()->isPointerTy())
02175       Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
02176     else
02177       Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
02178   }
02179 
02180   // Add the start index to the loop count to get the new end index.
02181   Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");
02182 
02183   // Now we need to generate the expression for N - (N % VF), which is
02184   // the part that the vectorized body will execute.
02185   Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
02186   Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
02187   Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
02188                                                      "end.idx.rnd.down");
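  // Illustrative example: with Count = 10 and Step = 8 (VF = 4, UF = 2),
  // n.mod.vf = 2 and n.vec = 8, so the vector body covers 8 iterations and
  // the scalar remainder loop handles the other 2.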
02189 
02190   // Now, compare the new count to zero. If it is zero skip the vector loop and
02191   // jump to the scalar loop.
02192   Value *Cmp =
02193       BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero");
02194 
02195   BasicBlock *LastBypassBlock = BypassBlock;
02196 
02197   // Generate code to check that the loop's trip count, which we computed by
02198   // adding one to the backedge-taken count, will not overflow.
02199   {
02200     auto PastOverflowCheck =
02201         std::next(BasicBlock::iterator(OverflowCheckAnchor));
02202     BasicBlock *CheckBlock =
02203       LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked");
02204     if (ParentLoop)
02205       ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
02206     LoopBypassBlocks.push_back(CheckBlock);
02207     Instruction *OldTerm = LastBypassBlock->getTerminator();
02208     BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm);
02209     OldTerm->eraseFromParent();
02210     LastBypassBlock = CheckBlock;
02211   }
02212 
02213   // Generate the code to check that the strides we assumed to be one are really
02214   // one. We want the new basic block to start at the first instruction in a
02215   // sequence of instructions that form a check.
02216   Instruction *StrideCheck;
02217   Instruction *FirstCheckInst;
02218   std::tie(FirstCheckInst, StrideCheck) =
02219       addStrideCheck(LastBypassBlock->getTerminator());
02220   if (StrideCheck) {
02221     // Create a new block containing the stride check.
02222     BasicBlock *CheckBlock =
02223         LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
02224     if (ParentLoop)
02225       ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
02226     LoopBypassBlocks.push_back(CheckBlock);
02227 
02228     // Replace the branch into the stride check block with a conditional branch
02229     // for the "few elements" case.
02230     Instruction *OldTerm = LastBypassBlock->getTerminator();
02231     BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
02232     OldTerm->eraseFromParent();
02233 
02234     Cmp = StrideCheck;
02235     LastBypassBlock = CheckBlock;
02236   }
02237 
02238   // Generate the code that checks in runtime if arrays overlap. We put the
02239   // checks into a separate block to make the more common case of few elements
02240   // faster.
02241   Instruction *MemRuntimeCheck;
02242   std::tie(FirstCheckInst, MemRuntimeCheck) =
02243     Legal->getLAI()->addRuntimeCheck(LastBypassBlock->getTerminator());
02244   if (MemRuntimeCheck) {
02245     // Create a new block containing the memory check.
02246     BasicBlock *CheckBlock =
02247         LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.memcheck");
02248     if (ParentLoop)
02249       ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
02250     LoopBypassBlocks.push_back(CheckBlock);
02251 
02252     // Replace the branch into the memory check block with a conditional branch
02253     // for the "few elements" case.
02254     Instruction *OldTerm = LastBypassBlock->getTerminator();
02255     BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
02256     OldTerm->eraseFromParent();
02257 
02258     Cmp = MemRuntimeCheck;
02259     LastBypassBlock = CheckBlock;
02260   }
02261 
02262   LastBypassBlock->getTerminator()->eraseFromParent();
02263   BranchInst::Create(MiddleBlock, VectorPH, Cmp,
02264                      LastBypassBlock);
02265 
02266   // We are going to resume the execution of the scalar loop.
02267   // Go over all of the induction variables that we found and fix the
02268   // PHIs that are left in the scalar version of the loop.
02269   // The starting values of PHI nodes depend on the counter of the last
02270   // iteration in the vectorized loop.
02271   // If we come from a bypass edge then we need to start from the original
02272   // start value.
02273 
02274   // This variable saves the new starting index for the scalar loop.
02275   PHINode *ResumeIndex = nullptr;
02276   LoopVectorizationLegality::InductionList::iterator I, E;
02277   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
02278   // Set builder to point to last bypass block.
02279   BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
02280   for (I = List->begin(), E = List->end(); I != E; ++I) {
02281     PHINode *OrigPhi = I->first;
02282     LoopVectorizationLegality::InductionInfo II = I->second;
02283 
02284     Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType();
02285     PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val",
02286                                          MiddleBlock->getTerminator());
02287     // We might have extended the type of the induction variable but we need a
02288     // truncated version for the scalar loop.
02289     PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?
02290       PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
02291                       MiddleBlock->getTerminator()) : nullptr;
02292 
02293     // Create phi nodes to merge from the backedge-taken check block.
02294     PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val",
02295                                            ScalarPH->getTerminator());
02296     BCResumeVal->addIncoming(ResumeVal, MiddleBlock);
02297 
02298     PHINode *BCTruncResumeVal = nullptr;
02299     if (OrigPhi == OldInduction) {
02300       BCTruncResumeVal =
02301           PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val",
02302                           ScalarPH->getTerminator());
02303       BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock);
02304     }
02305 
02306     Value *EndValue = nullptr;
02307     switch (II.IK) {
02308     case LoopVectorizationLegality::IK_NoInduction:
02309       llvm_unreachable("Unknown induction");
02310     case LoopVectorizationLegality::IK_IntInduction: {
02311       // Handle the integer induction counter.
02312       assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
02313 
02314       // We have the canonical induction variable.
02315       if (OrigPhi == OldInduction) {
02316         // Create a truncated version of the resume value for the scalar loop,
02317         // we might have promoted the type to a larger width.
02318         EndValue =
02319           BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
02320         // The new PHI merges the original incoming value, in case of a bypass,
02321         // or the value at the end of the vectorized loop.
02322         for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
02323           TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
02324         TruncResumeVal->addIncoming(EndValue, VecBody);
02325 
02326         BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
02327 
02328         // We know what the end value is.
02329         EndValue = IdxEndRoundDown;
02330         // We also know which PHI node holds it.
02331         ResumeIndex = ResumeVal;
02332         break;
02333       }
02334 
02335       // Not the canonical induction variable - add the vector loop count to the
02336       // start value.
02337       Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
02338                                                    II.StartValue->getType(),
02339                                                    "cast.crd");
02340       EndValue = II.transform(BypassBuilder, CRD);
02341       EndValue->setName("ind.end");
02342       break;
02343     }
02344     case LoopVectorizationLegality::IK_PtrInduction: {
02345       EndValue = II.transform(BypassBuilder, CountRoundDown);
02346       EndValue->setName("ptr.ind.end");
02347       break;
02348     }
02349     } // end of switch
02350 
02351     // The new PHI merges the original incoming value, in case of a bypass,
02352     // or the value at the end of the vectorized loop.
02353     for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) {
02354       if (OrigPhi == OldInduction)
02355         ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
02356       else
02357         ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
02358     }
02359     ResumeVal->addIncoming(EndValue, VecBody);
02360 
02361     // Fix the scalar body counter (PHI node).
02362     unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
02363 
02364     // The old induction's phi node in the scalar body needs the truncated
02365     // value.
02366     if (OrigPhi == OldInduction) {
02367       BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]);
02368       OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal);
02369     } else {
02370       BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
02371       OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
02372     }
02373   }
02374 
02375   // If we are generating a new induction variable then we also need to
02376   // generate the code that calculates the exit value. This value is not
02377   // simply the end of the counter because we may skip the vectorized body
02378   // in case of a runtime check.
02379   if (!OldInduction){
02380     assert(!ResumeIndex && "Unexpected resume value found");
02381     ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
02382                                   MiddleBlock->getTerminator());
02383     for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
02384       ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
02385     ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
02386   }
02387 
02388   // Make sure that we found the index where the scalar loop needs to continue.
02389   assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
02390          "Invalid resume Index");
02391 
02392   // Add a check in the middle block to see if we have completed
02393   // all of the iterations in the first vector loop.
02394   // If (N - N%VF) == N, then we *don't* need to run the remainder.
02395   Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
02396                                 ResumeIndex, "cmp.n",
02397                                 MiddleBlock->getTerminator());
02398 
02399   BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
02400   // Remove the old terminator.
02401   MiddleBlock->getTerminator()->eraseFromParent();
02402 
02403   // Create i+1 and fill the PHINode.
02404   Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
02405   Induction->addIncoming(StartIdx, VectorPH);
02406   Induction->addIncoming(NextIdx, VecBody);
02407   // Create the compare.
02408   Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
02409   Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
02410 
02411   // Now we have two terminators. Remove the old one from the block.
02412   VecBody->getTerminator()->eraseFromParent();
02413 
02414   // Get ready to start creating new instructions into the vectorized body.
02415   Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
02416 
02417   // Save the state.
02418   LoopVectorPreHeader = VectorPH;
02419   LoopScalarPreHeader = ScalarPH;
02420   LoopMiddleBlock = MiddleBlock;
02421   LoopExitBlock = ExitBlock;
02422   LoopVectorBody.push_back(VecBody);
02423   LoopScalarBody = OldBasicBlock;
02424 
02425   LoopVectorizeHints Hints(Lp, true);
02426   Hints.setAlreadyVectorized();
02427 }
02428 
02429 /// This function returns the identity element (or neutral element) for
02430 /// the operation K.
02431 Constant*
02432 LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) {
02433   switch (K) {
02434   case RK_IntegerXor:
02435   case RK_IntegerAdd:
02436   case RK_IntegerOr:
02437     // Adding, XORing, or ORing zero to a number does not change it.
02438     return ConstantInt::get(Tp, 0);
02439   case RK_IntegerMult:
02440     // Multiplying a number by 1 does not change it.
02441     return ConstantInt::get(Tp, 1);
02442   case RK_IntegerAnd:
02443     // AND-ing a number with an all-1 value does not change it.
02444     return ConstantInt::get(Tp, -1, true);
02445   case  RK_FloatMult:
02446     // Multiplying a number by 1 does not change it.
02447     return ConstantFP::get(Tp, 1.0L);
02448   case  RK_FloatAdd:
02449     // Adding zero to a number does not change it.
02450     return ConstantFP::get(Tp, 0.0L);
02451   default:
02452     llvm_unreachable("Unknown reduction kind");
02453   }
02454 }
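// Illustrative example: an integer add reduction seeds its vector
// accumulator with <0, ..., 0>, whereas an integer AND must start from the
// all-ones value so that the first real element is preserved.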
02455 
02456 /// This function translates the reduction kind to an LLVM binary operator.
02457 static unsigned
02458 getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
02459   switch (Kind) {
02460     case LoopVectorizationLegality::RK_IntegerAdd:
02461       return Instruction::Add;
02462     case LoopVectorizationLegality::RK_IntegerMult:
02463       return Instruction::Mul;
02464     case LoopVectorizationLegality::RK_IntegerOr:
02465       return Instruction::Or;
02466     case LoopVectorizationLegality::RK_IntegerAnd:
02467       return Instruction::And;
02468     case LoopVectorizationLegality::RK_IntegerXor:
02469       return Instruction::Xor;
02470     case LoopVectorizationLegality::RK_FloatMult:
02471       return Instruction::FMul;
02472     case LoopVectorizationLegality::RK_FloatAdd:
02473       return Instruction::FAdd;
02474     case LoopVectorizationLegality::RK_IntegerMinMax:
02475       return Instruction::ICmp;
02476     case LoopVectorizationLegality::RK_FloatMinMax:
02477       return Instruction::FCmp;
02478     default:
02479       llvm_unreachable("Unknown reduction operation");
02480   }
02481 }
02482 
02483 Value *createMinMaxOp(IRBuilder<> &Builder,
02484                       LoopVectorizationLegality::MinMaxReductionKind RK,
02485                       Value *Left,
02486                       Value *Right) {
02487   CmpInst::Predicate P = CmpInst::ICMP_NE;
02488   switch (RK) {
02489   default:
02490     llvm_unreachable("Unknown min/max reduction kind");
02491   case LoopVectorizationLegality::MRK_UIntMin:
02492     P = CmpInst::ICMP_ULT;
02493     break;
02494   case LoopVectorizationLegality::MRK_UIntMax:
02495     P = CmpInst::ICMP_UGT;
02496     break;
02497   case LoopVectorizationLegality::MRK_SIntMin:
02498     P = CmpInst::ICMP_SLT;
02499     break;
02500   case LoopVectorizationLegality::MRK_SIntMax:
02501     P = CmpInst::ICMP_SGT;
02502     break;
02503   case LoopVectorizationLegality::MRK_FloatMin:
02504     P = CmpInst::FCMP_OLT;
02505     break;
02506   case LoopVectorizationLegality::MRK_FloatMax:
02507     P = CmpInst::FCMP_OGT;
02508     break;
02509   }
02510 
02511   Value *Cmp;
02512   if (RK == LoopVectorizationLegality::MRK_FloatMin ||
02513       RK == LoopVectorizationLegality::MRK_FloatMax)
02514     Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp");
02515   else
02516     Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");
02517 
02518   Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
02519   return Select;
02520 }
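      // For example, for MRK_SIntMax the helper above emits:
      //   %rdx.minmax.cmp    = icmp sgt <4 x i32> %left, %right
      //   %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp,
      //                               <4 x i32> %left, <4 x i32> %right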
02521 
02522 namespace {
02523 struct CSEDenseMapInfo {
02524   static bool canHandle(Instruction *I) {
02525     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
02526            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
02527   }
02528   static inline Instruction *getEmptyKey() {
02529     return DenseMapInfo<Instruction *>::getEmptyKey();
02530   }
02531   static inline Instruction *getTombstoneKey() {
02532     return DenseMapInfo<Instruction *>::getTombstoneKey();
02533   }
02534   static unsigned getHashValue(Instruction *I) {
02535     assert(canHandle(I) && "Unknown instruction!");
02536     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
02537                                                            I->value_op_end()));
02538   }
02539   static bool isEqual(Instruction *LHS, Instruction *RHS) {
02540     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
02541         LHS == getTombstoneKey() || RHS == getTombstoneKey())
02542       return LHS == RHS;
02543     return LHS->isIdenticalTo(RHS);
02544   }
02545 };
02546 } // end anonymous namespace
02547 
02548 /// \brief Check whether this block is a predicated block.
02549 /// Due to if predication of stores we might create a sequence of
02550 /// "if(pred) a[i] = ...;" blocks. We start with one vectorized basic block.
02551 /// For every conditional block we split this vectorized block. Therefore,
02552 /// every second block will be a predicated one.
02553 static bool isPredicatedBlock(unsigned BlockNum) {
02554   return BlockNum % 2;
02555 }
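      // E.g. after predicating two stores the vector body is laid out as
      //   [vec.body][if(pred) store][cont][if(pred) store][cont]
      // so the blocks at odd indices (1, 3, ...) are the predicated ones.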
02556 
02557 /// \brief Perform CSE on the widened induction variable instructions.
02558 static void cse(SmallVector<BasicBlock *, 4> &BBs) {
02559   // Perform simple CSE.
02560   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
02561   for (unsigned i = 0, e = BBs.size(); i != e; ++i) {
02562     BasicBlock *BB = BBs[i];
02563     for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
02564       Instruction *In = I++;
02565 
02566       if (!CSEDenseMapInfo::canHandle(In))
02567         continue;
02568 
02569       // Check if we can replace this instruction with any of the
02570       // visited instructions.
02571       if (Instruction *V = CSEMap.lookup(In)) {
02572         In->replaceAllUsesWith(V);
02573         In->eraseFromParent();
02574         continue;
02575       }
02576       // Ignore instructions in conditional blocks. We create "if (pred) a[i] =
02577       // ...;" blocks for predicated stores. Every second block is a predicated
02578       // block.
02579       if (isPredicatedBlock(i))
02580         continue;
02581 
02582       CSEMap[In] = In;
02583     }
02584   }
02585 }
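      // E.g. two identical insertelement/shuffle chains created while widening
      // the same induction variable hash alike (same opcode and operands), so
      // the second chain is folded into the first and erased.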
02586 
02587 /// \brief Adds a 'fast' flag to floating point operations.
02588 static Value *addFastMathFlag(Value *V) {
02589   if (isa<FPMathOperator>(V)){
02590     FastMathFlags Flags;
02591     Flags.setUnsafeAlgebra();
02592     cast<Instruction>(V)->setFastMathFlags(Flags);
02593   }
02594   return V;
02595 }
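      // E.g. an 'fadd' emitted while combining reduction lanes becomes
      // 'fadd fast', which licenses the reassociation that the horizontal
      // reduction below performs.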
02596 
02597 void InnerLoopVectorizer::vectorizeLoop() {
02598   //===------------------------------------------------===//
02599   //
02600   // Notice: any optimization or new instruction that goes
02601   // into the code below should also be implemented in
02602   // the cost model.
02603   //
02604   //===------------------------------------------------===//
02605   Constant *Zero = Builder.getInt32(0);
02606 
02607   // In order to support reduction variables we need to be able to vectorize
02608   // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
02609   // stages. First, we create a new vector PHI node with no incoming edges.
02610   // We use this value when we vectorize all of the instructions that use the
02611   // PHI. Next, after all of the instructions in the block are complete we
02612   // add the new incoming edges to the PHI. At this point all of the
02613   // instructions in the basic block are vectorized, so we can use them to
02614   // construct the PHI.
02615   PhiVector RdxPHIsToFix;
02616 
02617   // Scan the loop in a topological order to ensure that defs are vectorized
02618   // before users.
02619   LoopBlocksDFS DFS(OrigLoop);
02620   DFS.perform(LI);
02621 
02622   // Vectorize all of the blocks in the original loop.
02623   for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
02624        be = DFS.endRPO(); bb != be; ++bb)
02625     vectorizeBlockInLoop(*bb, &RdxPHIsToFix);
02626 
02627   // At this point every instruction in the original loop is widened to
02628   // a vector form. We are almost done. Now, we need to fix the PHI nodes
02629   // that we vectorized. The PHI nodes are currently empty because we did
02630   // not want to introduce cycles. Notice that the remaining PHI nodes
02631   // that we need to fix are reduction variables.
02632 
02633   // Create the 'reduced' values for each of the reduction vars.
02634   // The reduced values are the vector values that we scalarize and combine
02635   // after the loop is finished.
02636   for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
02637        it != e; ++it) {
02638     PHINode *RdxPhi = *it;
02639     assert(RdxPhi && "Unable to recover vectorized PHI");
02640 
02641     // Find the reduction variable descriptor.
02642     assert(Legal->getReductionVars()->count(RdxPhi) &&
02643            "Unable to find the reduction variable");
02644     LoopVectorizationLegality::ReductionDescriptor RdxDesc =
02645         (*Legal->getReductionVars())[RdxPhi];
02646 
02647     setDebugLocFromInst(Builder, RdxDesc.StartValue);
02648 
02649     // We need to generate a reduction vector from the incoming scalar.
02650     // To do so, we need to generate the 'identity' vector and override
02651     // one of the elements with the incoming scalar reduction. We need
02652     // to do it in the vector-loop preheader.
02653     Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
02654 
02655     // This is the vector-clone of the value that leaves the loop.
02656     VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
02657     Type *VecTy = VectorExit[0]->getType();
02658 
02659     // Find the reduction identity variable. Zero for addition, or, xor,
02660     // one for multiplication, -1 for And.
02661     Value *Identity;
02662     Value *VectorStart;
02663     if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax ||
02664         RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) {
02665       // MinMax reductions have the start value as their identity.
02666       if (VF == 1) {
02667         VectorStart = Identity = RdxDesc.StartValue;
02668       } else {
02669         VectorStart = Identity = Builder.CreateVectorSplat(VF,
02670                                                            RdxDesc.StartValue,
02671                                                            "minmax.ident");
02672       }
02673     } else {
02674       // Handle other reduction kinds:
02675       Constant *Iden =
02676           LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
02677                                                           VecTy->getScalarType());
02678       if (VF == 1) {
02679         Identity = Iden;
02680         // This vector is the Identity vector where the first element is the
02681         // incoming scalar reduction.
02682         VectorStart = RdxDesc.StartValue;
02683       } else {
02684         Identity = ConstantVector::getSplat(VF, Iden);
02685 
02686         // This vector is the Identity vector where the first element is the
02687         // incoming scalar reduction.
02688         VectorStart = Builder.CreateInsertElement(Identity,
02689                                                   RdxDesc.StartValue, Zero);
02690       }
02691     }
02692 
02693     // Fix the vector-loop phi.
02694 
02695     // Reductions do not have to start at zero. They can start with
02696     // any loop-invariant value.
02697     VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
02698     BasicBlock *Latch = OrigLoop->getLoopLatch();
02699     Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
02700     VectorParts &Val = getVectorValue(LoopVal);
02701     for (unsigned part = 0; part < UF; ++part) {
02702       // Make sure to add the reduction start value only to the
02703       // first unroll part.
02704       Value *StartVal = (part == 0) ? VectorStart : Identity;
02705       cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal,
02706                                                   LoopVectorPreHeader);
02707       cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
02708                                                   LoopVectorBody.back());
02709     }
02710 
02711     // Before each round, move the insertion point right between
02712     // the PHIs and the values we are going to write.
02713     // This allows us to write both PHINodes and the extractelement
02714     // instructions.
02715     Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
02716 
02717     VectorParts RdxParts;
02718     setDebugLocFromInst(Builder, RdxDesc.LoopExitInstr);
02719     for (unsigned part = 0; part < UF; ++part) {
02720       // This PHINode contains the vectorized reduction variable, or
02721       // the initial value vector, if we bypass the vector loop.
02722       VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
02723       PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
02724       Value *StartVal = (part == 0) ? VectorStart : Identity;
02725       for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
02726         NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
02727       NewPhi->addIncoming(RdxExitVal[part],
02728                           LoopVectorBody.back());
02729       RdxParts.push_back(NewPhi);
02730     }
02731 
02732     // Reduce all of the unrolled parts into a single vector.
02733     Value *ReducedPartRdx = RdxParts[0];
02734     unsigned Op = getReductionBinOp(RdxDesc.Kind);
02735     setDebugLocFromInst(Builder, ReducedPartRdx);
02736     for (unsigned part = 1; part < UF; ++part) {
02737       if (Op != Instruction::ICmp && Op != Instruction::FCmp)
02738         // Floating point operations had to be 'fast' to enable the reduction.
02739         ReducedPartRdx = addFastMathFlag(
02740             Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
02741                                 ReducedPartRdx, "bin.rdx"));
02742       else
02743         ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind,
02744                                         ReducedPartRdx, RdxParts[part]);
02745     }
02746 
02747     if (VF > 1) {
02748       // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
02749       // and vector ops, reducing the set of values being computed by half each
02750       // round.
02751       assert(isPowerOf2_32(VF) &&
02752              "Reduction emission only supported for pow2 vectors!");
02753       Value *TmpVec = ReducedPartRdx;
02754       SmallVector<Constant*, 32> ShuffleMask(VF, nullptr);
02755       for (unsigned i = VF; i != 1; i >>= 1) {
02756         // Move the upper half of the vector to the lower half.
02757         for (unsigned j = 0; j != i/2; ++j)
02758           ShuffleMask[j] = Builder.getInt32(i/2 + j);
02759 
02760         // Fill the rest of the mask with undef.
02761         std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
02762                   UndefValue::get(Builder.getInt32Ty()));
02763 
02764         Value *Shuf =
02765         Builder.CreateShuffleVector(TmpVec,
02766                                     UndefValue::get(TmpVec->getType()),
02767                                     ConstantVector::get(ShuffleMask),
02768                                     "rdx.shuf");
02769 
02770         if (Op != Instruction::ICmp && Op != Instruction::FCmp)
02771           // Floating point operations had to be 'fast' to enable the reduction.
02772           TmpVec = addFastMathFlag(Builder.CreateBinOp(
02773               (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
02774         else
02775           TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
02776       }
02777 
02778       // The result is in the first element of the vector.
02779       ReducedPartRdx = Builder.CreateExtractElement(TmpVec,
02780                                                     Builder.getInt32(0));
02781     }
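          // For example, with VF == 4 and an integer add reduction the loop
          // above emits (illustrative IR):
          //   %rdx.shuf = shufflevector <4 x i32> %r, <4 x i32> undef,
          //                  <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
          //   %bin.rdx  = add <4 x i32> %r, %rdx.shuf
          //   %rdx.shuf4 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef,
          //                  <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
          //   %bin.rdx5 = add <4 x i32> %bin.rdx, %rdx.shuf4
          // followed by the extractelement of lane 0 below.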
02782 
02783     // Create a phi node that merges control-flow from the backedge-taken check
02784     // block and the middle block.
02785     PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
02786                                           LoopScalarPreHeader->getTerminator());
02787     BCBlockPhi->addIncoming(RdxDesc.StartValue, LoopBypassBlocks[0]);
02788     BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
02789 
02790     // Now, we need to fix the users of the reduction variable
02791     // inside and outside of the scalar remainder loop.
02792     // We know that the loop is in LCSSA form. We need to update the
02793     // PHI nodes in the exit blocks.
02794     for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
02795          LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
02796       PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
02797       if (!LCSSAPhi) break;
02798 
02799       // All PHINodes need to have a single entry edge, or two if
02800       // we already fixed them.
02801       assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
02802 
02803       // We found our reduction value exit-PHI. Update it with the
02804       // incoming bypass edge.
02805       if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
02806         // Add an edge coming from the bypass.
02807         LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
02808         break;
02809       }
02810     }// end of the LCSSA phi scan.
02811 
02812     // Fix the scalar loop reduction variable with the incoming reduction sum
02813     // from the vector body and from the backedge value.
02814     int IncomingEdgeBlockIdx =
02815         RdxPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
02816     assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
02817     // Pick the other block.
02818     int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
02819     RdxPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
02820     RdxPhi->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
02821   }// end of for each redux variable.
02822 
02823   fixLCSSAPHIs();
02824 
02825   // Remove redundant induction instructions.
02826   cse(LoopVectorBody);
02827 }
02828 
02829 void InnerLoopVectorizer::fixLCSSAPHIs() {
02830   for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
02831        LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
02832     PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
02833     if (!LCSSAPhi) break;
02834     if (LCSSAPhi->getNumIncomingValues() == 1)
02835       LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
02836                             LoopMiddleBlock);
02837   }
02838 }
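      // The middle block becomes a new predecessor of the loop's exit block,
      // so every LCSSA PHI that was not already fixed up (i.e. every
      // non-reduction PHI) receives a placeholder undef value for that edge.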
02839 
02840 InnerLoopVectorizer::VectorParts
02841 InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
02842   assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
02843          "Invalid edge");
02844 
02845   // Look for cached value.
02846   std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst);
02847   EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge);
02848   if (ECEntryIt != MaskCache.end())
02849     return ECEntryIt->second;
02850 
02851   VectorParts SrcMask = createBlockInMask(Src);
02852 
02853   // The terminator has to be a branch inst!
02854   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
02855   assert(BI && "Unexpected terminator found");
02856 
02857   if (BI->isConditional()) {
02858     VectorParts EdgeMask = getVectorValue(BI->getCondition());
02859 
02860     if (BI->getSuccessor(0) != Dst)
02861       for (unsigned part = 0; part < UF; ++part)
02862         EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
02863 
02864     for (unsigned part = 0; part < UF; ++part)
02865       EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
02866 
02867     MaskCache[Edge] = EdgeMask;
02868     return EdgeMask;
02869   }
02870 
02871   MaskCache[Edge] = SrcMask;
02872   return SrcMask;
02873 }
02874 
02875 InnerLoopVectorizer::VectorParts
02876 InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
02877   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
02878 
02879   // Loop incoming mask is all-one.
02880   if (OrigLoop->getHeader() == BB) {
02881     Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
02882     return getVectorValue(C);
02883   }
02884 
02885   // This is the block mask. We OR all incoming edge masks, starting with zero.
02886   Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
02887   VectorParts BlockMask = getVectorValue(Zero);
02888 
02889   // For each pred:
02890   for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
02891     VectorParts EM = createEdgeMask(*it, BB);
02892     for (unsigned part = 0; part < UF; ++part)
02893       BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
02894   }
02895 
02896   return BlockMask;
02897 }
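      // In mask form: BlockMask(BB) = OR over every predecessor P of
      // EdgeMask(P, BB), where EdgeMask(P, BB) = BlockMask(P) & Cond(P, BB)
      // (see createEdgeMask above), and the loop header's mask is all-ones.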
02898 
02899 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
02900                                               InnerLoopVectorizer::VectorParts &Entry,
02901                                               unsigned UF, unsigned VF, PhiVector *PV) {
02902   PHINode* P = cast<PHINode>(PN);
02903   // Handle reduction variables:
02904   if (Legal->getReductionVars()->count(P)) {
02905     for (unsigned part = 0; part < UF; ++part) {
02906       // This is phase one of vectorizing PHIs.
02907       Type *VecTy = (VF == 1) ? PN->getType() :
02908                                 VectorType::get(PN->getType(), VF);
02909       Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
02910                                     LoopVectorBody.back()->getFirstInsertionPt());
02911     }
02912     PV->push_back(P);
02913     return;
02914   }
02915 
02916   setDebugLocFromInst(Builder, P);
02917   // Check for PHI nodes that are lowered to vector selects.
02918   if (P->getParent() != OrigLoop->getHeader()) {
02919     // We know that all PHIs in non-header blocks are converted into
02920     // selects, so we don't have to worry about the insertion order and we
02921     // can just use the builder.
02922     // At this point we generate the predication tree. There may be
02923     // duplications since this is a simple recursive scan, but future
02924     // optimizations will clean it up.
02925 
02926     unsigned NumIncoming = P->getNumIncomingValues();
02927 
02928     // Generate a sequence of selects of the form:
02929     // SELECT(Mask3, In3,
02930     //      SELECT(Mask2, In2,
02931     //                   ( ...)))
02932     for (unsigned In = 0; In < NumIncoming; In++) {
02933       VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
02934                                         P->getParent());
02935       VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
02936 
02937       for (unsigned part = 0; part < UF; ++part) {
02938         // We might have single edge PHIs (blocks) - use an identity
02939         // 'select' for the first PHI operand.
02940         if (In == 0)
02941           Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
02942                                              In0[part]);
02943         else
02944           // Select between the current value and the previous incoming edge
02945           // based on the incoming mask.
02946           Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
02947                                              Entry[part], "predphi");
02948       }
02949     }
02950     return;
02951   }
02952 
02953   // This PHINode must be an induction variable.
02954   // Make sure that we know about it.
02955   assert(Legal->getInductionVars()->count(P) &&
02956          "Not an induction variable");
02957 
02958   LoopVectorizationLegality::InductionInfo II =
02959   Legal->getInductionVars()->lookup(P);
02960 
02961   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
02962   // which can be found from the original scalar operations.
02963   switch (II.IK) {
02964     case LoopVectorizationLegality::IK_NoInduction:
02965       llvm_unreachable("Unknown induction");
02966     case LoopVectorizationLegality::IK_IntInduction: {
02967       assert(P->getType() == II.StartValue->getType() && "Types must match");
02968       Type *PhiTy = P->getType();
02969       Value *Broadcasted;
02970       if (P == OldInduction) {
02971         // Handle the canonical induction variable. We might have had to
02972         // extend the type.
02973         Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
02974       } else {
02975         // Handle other induction variables that are now based on the
02976         // canonical one.
02977         Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
02978                                                  "normalized.idx");
02979         NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
02980         Broadcasted = II.transform(Builder, NormalizedIdx);
02981         Broadcasted->setName("offset.idx");
02982       }
02983       Broadcasted = getBroadcastInstrs(Broadcasted);
02984       // After broadcasting the induction variable we need to make the vector
02985       // consecutive by adding 0, 1, 2, etc.
02986       for (unsigned part = 0; part < UF; ++part)
02987         Entry[part] = getStepVector(Broadcasted, VF * part, II.StepValue);
02988       return;
02989     }
02990     case LoopVectorizationLegality::IK_PtrInduction:
02991       // Handle the pointer induction variable case.
02992       assert(P->getType()->isPointerTy() && "Unexpected type.");
02993       // This is the normalized GEP that starts counting at zero.
02994       Value *NormalizedIdx =
02995           Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx");
02996       // This is the vector of results. Notice that we don't generate
02997       // vector geps because scalar geps result in better code.
02998       for (unsigned part = 0; part < UF; ++part) {
02999         if (VF == 1) {
03000           int EltIndex = part;
03001           Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
03002           Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx);
03003           Value *SclrGep = II.transform(Builder, GlobalIdx);
03004           SclrGep->setName("next.gep");
03005           Entry[part] = SclrGep;
03006           continue;
03007         }
03008 
03009         Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
03010         for (unsigned int i = 0; i < VF; ++i) {
03011           int EltIndex = i + part * VF;
03012           Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
03013           Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx);
03014           Value *SclrGep = II.transform(Builder, GlobalIdx);
03015           SclrGep->setName("next.gep");
03016           VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
03017                                                Builder.getInt32(i),
03018                                                "insert.gep");
03019         }
03020         Entry[part] = VecVal;
03021       }
03022       return;
03023   }
03024 }
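      // E.g. for a pointer induction with VF == 4 the code above emits four
      // scalar 'next.gep' GEPs per unroll part and packs them with
      // insertelement, since scalar GEPs produce better code than vector GEPs.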
03025 
03026 void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
03027   // For each instruction in the old loop.
03028   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
03029     VectorParts &Entry = WidenMap.get(it);
03030     switch (it->getOpcode()) {
03031     case Instruction::Br:
03032       // Nothing to do for PHIs and BR, since we already took care of the
03033       // loop control flow instructions.
03034       continue;
03035     case Instruction::PHI: {
03036       // Vectorize PHINodes.
03037       widenPHIInstruction(it, Entry, UF, VF, PV);
03038       continue;
03039     }// End of PHI.
03040 
03041     case Instruction::Add:
03042     case Instruction::FAdd:
03043     case Instruction::Sub:
03044     case Instruction::FSub:
03045     case Instruction::Mul:
03046     case Instruction::FMul:
03047     case Instruction::UDiv:
03048     case Instruction::SDiv:
03049     case Instruction::FDiv:
03050     case Instruction::URem:
03051     case Instruction::SRem:
03052     case Instruction::FRem:
03053     case Instruction::Shl:
03054     case Instruction::LShr:
03055     case Instruction::AShr:
03056     case Instruction::And:
03057     case Instruction::Or:
03058     case Instruction::Xor: {
03059       // Just widen binops.
03060       BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
03061       setDebugLocFromInst(Builder, BinOp);
03062       VectorParts &A = getVectorValue(it->getOperand(0));
03063       VectorParts &B = getVectorValue(it->getOperand(1));
03064 
03065       // Use this vector value for all users of the original instruction.
03066       for (unsigned Part = 0; Part < UF; ++Part) {
03067         Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
03068 
03069         if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
03070           VecOp->copyIRFlags(BinOp);
03071 
03072         Entry[Part] = V;
03073       }
03074 
03075       propagateMetadata(Entry, it);
03076       break;
03077     }
03078     case Instruction::Select: {
03079       // Widen selects.
03080       // If the selector is loop invariant we can create a select
03081       // instruction with a scalar condition. Otherwise, use vector-select.
03082       bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
03083                                                OrigLoop);
03084       setDebugLocFromInst(Builder, it);
03085 
03086       // The condition can be loop invariant but still defined inside the
03087       // loop. This means that we can't just use the original 'cond' value.
03088       // We have to take the 'vectorized' value and pick the first lane.
03089       // Instcombine will make this a no-op.
03090       VectorParts &Cond = getVectorValue(it->getOperand(0));
03091       VectorParts &Op0  = getVectorValue(it->getOperand(1));
03092       VectorParts &Op1  = getVectorValue(it->getOperand(2));
03093 
03094       Value *ScalarCond = (VF == 1) ? Cond[0] :
03095         Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
03096 
03097       for (unsigned Part = 0; Part < UF; ++Part) {
03098         Entry[Part] = Builder.CreateSelect(
03099           InvariantCond ? ScalarCond : Cond[Part],
03100           Op0[Part],
03101           Op1[Part]);
03102       }
03103 
03104       propagateMetadata(Entry, it);
03105       break;
03106     }
03107 
03108     case Instruction::ICmp:
03109     case Instruction::FCmp: {
03110       // Widen compares. Generate vector compares.
03111       bool FCmp = (it->getOpcode() == Instruction::FCmp);
03112       CmpInst *Cmp = dyn_cast<CmpInst>(it);
03113       setDebugLocFromInst(Builder, it);
03114       VectorParts &A = getVectorValue(it->getOperand(0));
03115       VectorParts &B = getVectorValue(it->getOperand(1));
03116       for (unsigned Part = 0; Part < UF; ++Part) {
03117         Value *C = nullptr;
03118         if (FCmp)
03119           C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
03120         else
03121           C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
03122         Entry[Part] = C;
03123       }
03124 
03125       propagateMetadata(Entry, it);
03126       break;
03127     }
03128 
03129     case Instruction::Store:
03130     case Instruction::Load:
03131       vectorizeMemoryInstruction(it);
03132       break;
03133     case Instruction::ZExt:
03134     case Instruction::SExt:
03135     case Instruction::FPToUI:
03136     case Instruction::FPToSI:
03137     case Instruction::FPExt:
03138     case Instruction::PtrToInt:
03139     case Instruction::IntToPtr:
03140     case Instruction::SIToFP:
03141     case Instruction::UIToFP:
03142     case Instruction::Trunc:
03143     case Instruction::FPTrunc:
03144     case Instruction::BitCast: {
03145       CastInst *CI = dyn_cast<CastInst>(it);
03146       setDebugLocFromInst(Builder, it);
03147       // Optimize the special case where the source is the induction
03148       // variable. Notice that we can only optimize the 'trunc' case
03149       // because: a. FP conversions lose precision, b. sext/zext may wrap,
03150       // c. other casts depend on pointer size.
03151       if (CI->getOperand(0) == OldInduction &&
03152           it->getOpcode() == Instruction::Trunc) {
03153         Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
03154                                                CI->getType());
03155         Value *Broadcasted = getBroadcastInstrs(ScalarCast);
03156         LoopVectorizationLegality::InductionInfo II =
03157             Legal->getInductionVars()->lookup(OldInduction);
03158         Constant *Step =
03159             ConstantInt::getSigned(CI->getType(), II.StepValue->getSExtValue());
03160         for (unsigned Part = 0; Part < UF; ++Part)
03161           Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
03162         propagateMetadata(Entry, it);
03163         break;
03164       }
03165       // Vectorize casts.
03166       Type *DestTy = (VF == 1) ? CI->getType() :
03167                                  VectorType::get(CI->getType(), VF);
03168 
03169       VectorParts &A = getVectorValue(it->getOperand(0));
03170       for (unsigned Part = 0; Part < UF; ++Part)
03171         Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
03172       propagateMetadata(Entry, it);
03173       break;
03174     }
03175 
03176     case Instruction::Call: {
03177       // Ignore dbg intrinsics.
03178       if (isa<DbgInfoIntrinsic>(it))
03179         break;
03180       setDebugLocFromInst(Builder, it);
03181 
03182       Module *M = BB->getParent()->getParent();
03183       CallInst *CI = cast<CallInst>(it);
03184       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
03185       assert(ID && "Not an intrinsic call!");
03186       switch (ID) {
03187       case Intrinsic::assume:
03188       case Intrinsic::lifetime_end:
03189       case Intrinsic::lifetime_start:
03190         scalarizeInstruction(it);
03191         break;
03192       default:
03193         bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1);
03194         for (unsigned Part = 0; Part < UF; ++Part) {
03195           SmallVector<Value *, 4> Args;
03196           for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
03197             if (HasScalarOpd && i == 1) {
03198               Args.push_back(CI->getArgOperand(i));
03199               continue;
03200             }
03201             VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
03202             Args.push_back(Arg[Part]);
03203           }
03204           Type *Tys[] = {CI->getType()};
03205           if (VF > 1)
03206             Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF);
03207 
03208           Function *F = Intrinsic::getDeclaration(M, ID, Tys);
03209           Entry[Part] = Builder.CreateCall(F, Args);
03210         }
03211 
03212         propagateMetadata(Entry, it);
03213         break;
03214       }
03215       break;
03216     }
03217 
03218     default:
03219       // All other instructions are unsupported. Scalarize them.
03220       scalarizeInstruction(it);
03221       break;
03222     }// end of switch.
03223   }// end of for_each instr.
03224 }
03225 
03226 void InnerLoopVectorizer::updateAnalysis() {
03227   // Forget the original basic block.
03228   SE->forgetLoop(OrigLoop);
03229 
03230   // Update the dominator tree information.
03231   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
03232          "Entry does not dominate exit.");
03233 
03234   for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
03235     DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
03236   DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
03237 
03238   // Due to if predication of stores we might create a sequence of "if(pred)
03239   // a[i] = ...;" blocks.
03240   for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) {
03241     if (i == 0)
03242       DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
03243     else if (isPredicatedBlock(i)) {
03244       DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]);
03245     } else {
03246       DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]);
03247     }
03248   }
03249 
03250   DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);
03251   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
03252   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
03253   DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
03254 
03255   DEBUG(DT->verifyDomTree());
03256 }
03257 
03258 /// \brief Check whether it is safe to if-convert this phi node.
03259 ///
03260 /// Phi nodes with constant expressions that can trap are not safe to if
03261 /// convert.
03262 static bool canIfConvertPHINodes(BasicBlock *BB) {
03263   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
03264     PHINode *Phi = dyn_cast<PHINode>(I);
03265     if (!Phi)
03266       return true;
03267     for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p)
03268       if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p)))
03269         if (C->canTrap())
03270           return false;
03271   }
03272   return true;
03273 }
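      // E.g. a PHI with an incoming constant expression such as
      //   sdiv (i32 1, i32 0)
      // can trap if speculated, so a block containing it is not if-converted.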
03274 
03275 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
03276   if (!EnableIfConversion) {
03277     emitAnalysis(VectorizationReport() << "if-conversion is disabled");
03278     return false;
03279   }
03280 
03281   assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
03282 
03283   // A list of pointers that we can safely read from and write to.
03284   SmallPtrSet<Value *, 8> SafePointers;
03285 
03286   // Collect safe addresses.
03287   for (Loop::block_iterator BI = TheLoop->block_begin(),
03288          BE = TheLoop->block_end(); BI != BE; ++BI) {
03289     BasicBlock *BB = *BI;
03290 
03291     if (blockNeedsPredication(BB))
03292       continue;
03293 
03294     for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
03295       if (LoadInst *LI = dyn_cast<LoadInst>(I))
03296         SafePointers.insert(LI->getPointerOperand());
03297       else if (StoreInst *SI = dyn_cast<StoreInst>(I))
03298         SafePointers.insert(SI->getPointerOperand());
03299     }
03300   }
03301 
03302   // Collect the blocks that need predication.
03303   BasicBlock *Header = TheLoop->getHeader();
03304   for (Loop::block_iterator BI = TheLoop->block_begin(),
03305          BE = TheLoop->block_end(); BI != BE; ++BI) {
03306     BasicBlock *BB = *BI;
03307 
03308     // We don't support switch statements inside loops.
03309     if (!isa<BranchInst>(BB->getTerminator())) {
03310       emitAnalysis(VectorizationReport(BB->getTerminator())
03311                    << "loop contains a switch statement");
03312       return false;
03313     }
03314 
03315     // We must be able to predicate all blocks that need to be predicated.
03316     if (blockNeedsPredication(BB)) {
03317       if (!blockCanBePredicated(BB, SafePointers)) {
03318         emitAnalysis(VectorizationReport(BB->getTerminator())
03319                      << "control flow cannot be substituted for a select");
03320         return false;
03321       }
03322     } else if (BB != Header && !canIfConvertPHINodes(BB)) {
03323       emitAnalysis(VectorizationReport(BB->getTerminator())
03324                    << "control flow cannot be substituted for a select");
03325       return false;
03326     }
03327   }
03328 
03329   // We can if-convert this loop.
03330   return true;
03331 }
03332 
03333 bool LoopVectorizationLegality::canVectorize() {
03334   // We must have a loop in canonical form. Loops with indirectbr in them cannot
03335   // be canonicalized.
03336   if (!TheLoop->getLoopPreheader()) {
03337     emitAnalysis(
03338         VectorizationReport() <<
03339         "loop control flow is not understood by vectorizer");
03340     return false;
03341   }
03342 
03343   // We can only vectorize innermost loops.
03344   if (!TheLoop->getSubLoopsVector().empty()) {
03345     emitAnalysis(VectorizationReport() << "loop is not the innermost loop");
03346     return false;
03347   }
03348 
03349   // We must have a single backedge.
03350   if (TheLoop->getNumBackEdges() != 1) {
03351     emitAnalysis(
03352         VectorizationReport() <<
03353         "loop control flow is not understood by vectorizer");
03354     return false;
03355   }
03356 
03357   // We must have a single exiting block.
03358   if (!TheLoop->getExitingBlock()) {
03359     emitAnalysis(
03360         VectorizationReport() <<
03361         "loop control flow is not understood by vectorizer");
03362     return false;
03363   }
03364 
03365   // We only handle bottom-tested loops, i.e. loops in which the condition is
03366   // checked at the end of each iteration. With that we can assume that all
03367   // instructions in the loop are executed the same number of times.
03368   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
03369     emitAnalysis(
03370         VectorizationReport() <<
03371         "loop control flow is not understood by vectorizer");
03372     return false;
03373   }
03374 
03375   // We need to have a loop header.
03376   DEBUG(dbgs() << "LV: Found a loop: " <<
03377         TheLoop->getHeader()->getName() << '\n');
03378 
03379   // Check if we can if-convert non-single-bb loops.
03380   unsigned NumBlocks = TheLoop->getNumBlocks();
03381   if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
03382     DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
03383     return false;
03384   }
03385 
03386   // ScalarEvolution needs to be able to find the exit count.
03387   const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
03388   if (ExitCount == SE->getCouldNotCompute()) {
03389     emitAnalysis(VectorizationReport() <<
03390                  "could not determine number of loop iterations");
03391     DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
03392     return false;
03393   }
03394 
03395   // Check if we can vectorize the instructions and CFG in this loop.
03396   if (!canVectorizeInstrs()) {
03397     DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
03398     return false;
03399   }
03400 
03401   // Go over each instruction and look at memory deps.
03402   if (!canVectorizeMemory()) {
03403     DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
03404     return false;
03405   }
03406 
03407   // Collect all of the variables that remain uniform after vectorization.
03408   collectLoopUniforms();
03409 
03410   DEBUG(dbgs() << "LV: We can vectorize this loop" <<
03411         (LAI->getRuntimePointerCheck()->Need ? " (with a runtime bound check)" :
03412          "")
03413         <<"!\n");
03414 
03415   // Okay! We can vectorize. At this point we don't have any other mem analysis
03416   // which may limit our maximum vectorization factor, so just return true with
03417   // no restrictions.
03418   return true;
03419 }
03420 
03421 static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
03422   if (Ty->isPointerTy())
03423     return DL.getIntPtrType(Ty);
03424 
03425   // It is possible that chars or shorts overflow when we ask for the loop's
03426   // trip count; work around this by widening such types to at least 32 bits.
03427   if (Ty->getScalarSizeInBits() < 32)
03428     return Type::getInt32Ty(Ty->getContext());
03429 
03430   return Ty;
03431 }
03432 
03433 static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
03434   Ty0 = convertPointerToIntegerType(DL, Ty0);
03435   Ty1 = convertPointerToIntegerType(DL, Ty1);
03436   if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
03437     return Ty0;
03438   return Ty1;
03439 }
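      // E.g. on a target with 64-bit pointers, an i8* induction converts to
      // i64 and an i16 induction widens to i32, so getWiderType picks i64.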
03440 
03441 /// \brief Check whether the instruction has users outside the loop and is not
03442 /// an identified reduction variable.
03443 static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
03444                                SmallPtrSetImpl<Value *> &Reductions) {
03445   // Reduction instructions are allowed to have exit users. All other
03446   // instructions must not have external users.
03447   if (!Reductions.count(Inst))
03448     // Check that all of the users of the instruction are inside the loop.
03449     for (User *U : Inst->users()) {
03450       Instruction *UI = cast<Instruction>(U);
03451       // This user may be a reduction exit value.
03452       if (!TheLoop->contains(UI)) {
03453         DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
03454         return true;
03455       }
03456     }
03457   return false;
03458 }
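      // E.g. in "for (...) { sum += a[i]; t = 2*a[i]; } use(t);", 't' has a
      // user outside the loop and is not a recognized reduction, so this
      // helper reports it.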
03459 
03460 bool LoopVectorizationLegality::canVectorizeInstrs() {
03461   BasicBlock *PreHeader = TheLoop->getLoopPreheader();
03462   BasicBlock *Header = TheLoop->getHeader();
03463 
03464   // Look for the attribute signaling the absence of NaNs.
03465   Function &F = *Header->getParent();
03466   if (F.hasFnAttribute("no-nans-fp-math"))
03467     HasFunNoNaNAttr =
03468         F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
03469 
03470   // For each block in the loop.
03471   for (Loop::block_iterator bb = TheLoop->block_begin(),
03472        be = TheLoop->block_end(); bb != be; ++bb) {
03473 
03474     // Scan the instructions in the block and look for hazards.
03475     for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
03476          ++it) {
03477 
03478       if (PHINode *Phi = dyn_cast<PHINode>(it)) {
03479         Type *PhiTy = Phi->getType();
03480         // Check that this PHI type is allowed.
03481         if (!PhiTy->isIntegerTy() &&
03482             !PhiTy->isFloatingPointTy() &&
03483             !PhiTy->isPointerTy()) {
03484           emitAnalysis(VectorizationReport(it)
03485                        << "loop control flow is not understood by vectorizer");
03486           DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
03487           return false;
03488         }
03489 
03490         // If this PHINode is not in the header block, then we know that we
03491         // can convert it to select during if-conversion. No need to check if
03492         // the PHIs in this block are induction or reduction variables.
03493         if (*bb != Header) {
03494           // Check that this instruction has no outside users or is an
03495           // identified reduction value with an outside user.
03496           if (!hasOutsideLoopUser(TheLoop, it, AllowedExit))
03497             continue;
03498           emitAnalysis(VectorizationReport(it) <<
03499                        "value could not be identified as "
03500                        "an induction or reduction variable");
03501           return false;
03502         }
03503 
03504         // We only allow if-converted PHIs with exactly two incoming values.
03505         if (Phi->getNumIncomingValues() != 2) {
03506           emitAnalysis(VectorizationReport(it)
03507                        << "control flow not understood by vectorizer");
03508           DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
03509           return false;
03510         }
03511 
03512         // This is the value coming from the preheader.
03513         Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
03514         ConstantInt *StepValue = nullptr;
03515         // Check if this is an induction variable.
03516         InductionKind IK = isInductionVariable(Phi, StepValue);
03517 
03518         if (IK_NoInduction != IK) {
03519           // Get the widest type.
03520           if (!WidestIndTy)
03521             WidestIndTy = convertPointerToIntegerType(*DL, PhiTy);
03522           else
03523             WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy);
03524 
03525           // Int inductions are special because we only allow one IV.
03526           if (IK == IK_IntInduction && StepValue->isOne()) {
03527             // Use the phi node with the widest type as induction. Use the last
03528             // one if there are multiple (no good reason for doing this other
03529             // than it is expedient).
03530             if (!Induction || PhiTy == WidestIndTy)
03531               Induction = Phi;
03532           }
03533 
03534           DEBUG(dbgs() << "LV: Found an induction variable.\n");
03535           Inductions[Phi] = InductionInfo(StartValue, IK, StepValue);
03536 
03537           // Until we explicitly handle the case of an induction variable with
03538           // an outside loop user we have to give up vectorizing this loop.
03539           if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
03540             emitAnalysis(VectorizationReport(it) <<
03541                          "use of induction value outside of the "
03542                          "loop is not handled by vectorizer");
03543             return false;
03544           }
03545 
03546           continue;
03547         }
03548 
03549         if (AddReductionVar(Phi, RK_IntegerAdd)) {
03550           DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
03551           continue;
03552         }
03553         if (AddReductionVar(Phi, RK_IntegerMult)) {
03554           DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
03555           continue;
03556         }
03557         if (AddReductionVar(Phi, RK_IntegerOr)) {
03558           DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
03559           continue;
03560         }
03561         if (AddReductionVar(Phi, RK_IntegerAnd)) {
03562           DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
03563           continue;
03564         }
03565         if (AddReductionVar(Phi, RK_IntegerXor)) {
03566           DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
03567           continue;
03568         }
03569         if (AddReductionVar(Phi, RK_IntegerMinMax)) {
03570           DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n");
03571           continue;
03572         }
03573         if (AddReductionVar(Phi, RK_FloatMult)) {
03574           DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n");
03575           continue;
03576         }
03577         if (AddReductionVar(Phi, RK_FloatAdd)) {
03578           DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n");
03579           continue;
03580         }
03581         if (AddReductionVar(Phi, RK_FloatMinMax)) {
03582           DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< *Phi <<
03583                 "\n");
03584           continue;
03585         }
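              // E.g. "%s = phi [0, %ph], [%s.next, %latch]" with
              // "%s.next = add %s, %x" inside the loop matches RK_IntegerAdd
              // and is recorded above as an ADD reduction PHI.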
03586 
03587         emitAnalysis(VectorizationReport(it) <<
03588                      "value that could not be identified as "
03589                      "reduction is used outside the loop");
03590         DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
03591         return false;
03592       }// end of PHI handling
03593 
03594       // We still don't handle general function calls. However, we can ignore
03595       // dbg intrinsic calls, and we do handle certain intrinsic and libm functions.
03596       CallInst *CI = dyn_cast<CallInst>(it);
03597       if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
03598         emitAnalysis(VectorizationReport(it) <<
03599                      "call instruction cannot be vectorized");
03600         DEBUG(dbgs() << "LV: Found a call site.\n");
03601         return false;
03602       }
03603 
03604       // Intrinsics such as powi, cttz and ctlz are legal to vectorize if the
03605       // second argument is the same for each iteration (i.e. loop invariant).
03606       if (CI &&
03607           hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
03608         if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) {
03609           emitAnalysis(VectorizationReport(it)
03610                        << "intrinsic instruction cannot be vectorized");
03611           DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
03612           return false;
03613         }
03614       }
03615 
03616       // Check that the instruction return type is vectorizable.
03617       // Also, we can't vectorize extractelement instructions.
03618       if ((!VectorType::isValidElementType(it->getType()) &&
03619            !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
03620         emitAnalysis(VectorizationReport(it)
03621                      << "instruction return type cannot be vectorized");
03622         DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
03623         return false;
03624       }
03625 
03626       // Check that the stored type is vectorizable.
03627       if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
03628         Type *T = ST->getValueOperand()->getType();
03629         if (!VectorType::isValidElementType(T)) {
03630           emitAnalysis(VectorizationReport(ST) <<
03631                        "store instruction cannot be vectorized");
03632           return false;
03633         }
03634         if (EnableMemAccessVersioning)
03635           collectStridedAccess(ST);
03636       }
03637 
03638       if (EnableMemAccessVersioning)
03639         if (LoadInst *LI = dyn_cast<LoadInst>(it))
03640           collectStridedAccess(LI);
03641 
03642       // Reduction instructions are allowed to have exit users.
03643       // All other instructions must not have external users.
03644       if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
03645         emitAnalysis(VectorizationReport(it) <<
03646                      "value cannot be used outside the loop");
03647         return false;
03648       }
03649 
03650     } // next instr.
03651 
03652   }
03653 
03654   if (!Induction) {
03655     DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
03656     if (Inductions.empty()) {
03657       emitAnalysis(VectorizationReport()
03658                    << "loop induction variable could not be identified");
03659       return false;
03660     }
03661   }
03662 
03663   return true;
03664 }
03665 
03666 /// \brief Strip a GEP whose indices, except for the induction operand, are
03667 /// all loop invariant, and return the induction operand of the GEP pointer.
03668 static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE,
03669                                  const DataLayout *DL, Loop *Lp) {
03670   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
03671   if (!GEP)
03672     return Ptr;
03673 
03674   unsigned InductionOperand = getGEPInductionOperand(DL, GEP);
03675 
03676   // Check that all of the gep indices are uniform except for our induction
03677   // operand.
03678   for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
03679     if (i != InductionOperand &&
03680         !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
03681       return Ptr;
03682   return GEP->getOperand(InductionOperand);
03683 }
03684 
03685 /// \brief Look for the unique cast use of the passed value, if any.
03686 static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
03687   Value *UniqueCast = nullptr;
03688   for (User *U : Ptr->users()) {
03689     CastInst *CI = dyn_cast<CastInst>(U);
03690     if (CI && CI->getType() == Ty) {
03691       if (!UniqueCast)
03692         UniqueCast = CI;
03693       else
03694         return nullptr;
03695     }
03696   }
03697   return UniqueCast;
03698 }
03699 
03700 /// \brief Get the stride of a pointer access in a loop.
03701 /// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride
03702 /// Value, or null if none is found.
03703 static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
03704                                    const DataLayout *DL, Loop *Lp) {
03705   const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
03706   if (!PtrTy || PtrTy->isAggregateType())
03707     return nullptr;
03708 
03709   // Try to strip a GEP instruction to make the pointer (actually, the index
03710   // at this point) easier to analyze. If OrigPtr is equal to Ptr we are
03711   // analyzing the pointer; otherwise, we are analyzing the index.
03712   Value *OrigPtr = Ptr;
03713 
03714   // The size of the pointer access.
03715   int64_t PtrAccessSize = 1;
03716 
03717   Ptr = stripGetElementPtr(Ptr, SE, DL, Lp);
03718   const SCEV *V = SE->getSCEV(Ptr);
03719 
03720   if (Ptr != OrigPtr)
03721     // Strip off casts.
03722     while (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V))
03723       V = C->getOperand();
03724 
03725   const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);
03726   if (!S)
03727     return nullptr;
03728 
03729   V = S->getStepRecurrence(*SE);
03730   if (!V)
03731     return nullptr;
03732 
03733   // Strip off the size of access multiplication if we are still analyzing the
03734   // pointer.
03735   if (OrigPtr == Ptr) {
03736     PtrAccessSize = DL->getTypeAllocSize(PtrTy->getElementType());
03737     if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
03738       if (M->getOperand(0)->getSCEVType() != scConstant)
03739         return nullptr;
03740 
03741       const APInt &APStepVal =
03742           cast<SCEVConstant>(M->getOperand(0))->getValue()->getValue();
03743 
03744       // Huge step value - give up.
03745       if (APStepVal.getBitWidth() > 64)
03746         return nullptr;
03747 
03748       int64_t StepVal = APStepVal.getSExtValue();
03749       if (PtrAccessSize != StepVal)
03750         return nullptr;
03751       V = M->getOperand(1);
03752     }
03753   }
03754 
03755   // Strip off casts.
03756   Type *StrippedOffRecurrenceCast = nullptr;
03757   if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) {
03758     StrippedOffRecurrenceCast = C->getType();
03759     V = C->getOperand();
03760   }
03761 
03762   // Look for the loop invariant symbolic value.
03763   const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);
03764   if (!U)
03765     return nullptr;
03766 
03767   Value *Stride = U->getValue();
03768   if (!Lp->isLoopInvariant(Stride))
03769     return nullptr;
03770 
03771   // If we have stripped off the recurrence cast we have to make sure that we
03772   // return the value that is used in this loop so that we can replace it later.
03773   if (StrippedOffRecurrenceCast)
03774     Stride = getUniqueCastUse(Stride, Lp, StrippedOffRecurrenceCast);
03775 
03776   return Stride;
03777 }
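      // E.g. for "double *A; ... A[i * Stride]" the pointer's SCEV step is
      // (8 * %Stride); with the access size (8) stripped, the loop-invariant
      // %Stride is returned and can later be versioned against the constant 1.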
03778 
03779 void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
03780   Value *Ptr = nullptr;
03781   if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
03782     Ptr = LI->getPointerOperand();
03783   else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess))
03784     Ptr = SI->getPointerOperand();
03785   else
03786     return;
03787 
03788   Value *Stride = getStrideFromPointer(Ptr, SE, DL, TheLoop);
03789   if (!Stride)
03790     return;
03791 
03792   DEBUG(dbgs() << "LV: Found a strided access that we can version");
03793   DEBUG(dbgs() << "  Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
03794   Strides[Ptr] = Stride;
03795   StrideSet.insert(Stride);
03796 }
03797 
03798 void LoopVectorizationLegality::collectLoopUniforms() {
03799   // We now know that the loop is vectorizable!
03800   // Collect variables that will remain uniform after vectorization.
03801   std::vector<Value*> Worklist;
03802   BasicBlock *Latch = TheLoop->getLoopLatch();
03803 
03804   // Start with the conditional branch and walk up the block.
03805   Worklist.push_back(Latch->getTerminator()->getOperand(0));
03806 
03807   // Also add all consecutive pointer values; these values will be uniform
03808   // after vectorization (and subsequent cleanup) and, until revectorization is
03809   // supported, all dependencies must also be uniform.
03810   for (Loop::block_iterator B = TheLoop->block_begin(),
03811        BE = TheLoop->block_end(); B != BE; ++B)
03812     for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end();
03813          I != IE; ++I)
03814       if (I->getType()->isPointerTy() && isConsecutivePtr(I))
03815         Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
03816 
03817   while (!Worklist.empty()) {
03818     Instruction *I = dyn_cast<Instruction>(Worklist.back());
03819     Worklist.pop_back();
03820 
03821     // Look at instructions inside this loop.
03822     // Stop when reaching PHI nodes.
03823     // TODO: we need to follow values all over the loop, not only in this block.
03824     if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
03825       continue;
03826 
03827     // This is a known uniform.
03828     Uniforms.insert(I);
03829 
03830     // Insert all operands.
03831     Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
03832   }
03833 }
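
// Illustrative example (assumption, not from the source): in a loop whose
// latch ends in
//
//   %next = add nuw nsw i64 %iv, 1
//   %cond = icmp eq i64 %next, %n
//   br i1 %cond, label %exit, label %body
//
// the walk above marks %cond and %next as uniform: after vectorization every
// lane would compute the same scalar values, so they can be kept scalar.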
03834 
03835 bool LoopVectorizationLegality::canVectorizeMemory() {
03836   LAI = &LAA->getInfo(TheLoop, Strides);
03837   auto &OptionalReport = LAI->getReport();
03838   if (OptionalReport)
03839     emitAnalysis(VectorizationReport(*OptionalReport));
03840   return LAI->canVectorizeMemory();
03841 }
03842 
03843 static bool hasMultipleUsesOf(Instruction *I,
03844                               SmallPtrSetImpl<Instruction *> &Insts) {
03845   unsigned NumUses = 0;
  for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) {
03847     if (Insts.count(dyn_cast<Instruction>(*Use)))
03848       ++NumUses;
03849     if (NumUses > 1)
03850       return true;
03851   }
03852 
03853   return false;
03854 }
03855 
03856 static bool areAllUsesIn(Instruction *I, SmallPtrSetImpl<Instruction *> &Set) {
  for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use)
03858     if (!Set.count(dyn_cast<Instruction>(*Use)))
03859       return false;
03860   return true;
03861 }
03862 
03863 bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
03864                                                 ReductionKind Kind) {
03865   if (Phi->getNumIncomingValues() != 2)
03866     return false;
03867 
03868   // Reduction variables are only found in the loop header block.
03869   if (Phi->getParent() != TheLoop->getHeader())
03870     return false;
03871 
03872   // Obtain the reduction start value from the value that comes from the loop
03873   // preheader.
03874   Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());
03875 
03876   // ExitInstruction is the single value which is used outside the loop.
03877   // We only allow for a single reduction value to be used outside the loop.
  // This includes users of the reduction variables, which form a cycle
  // that ends in the phi node.
03880   Instruction *ExitInstruction = nullptr;
03881   // Indicates that we found a reduction operation in our scan.
03882   bool FoundReduxOp = false;
03883 
03884   // We start with the PHI node and scan for all of the users of this
03885   // instruction. All users must be instructions that can be used as reduction
03886   // variables (such as ADD). We must have a single out-of-block user. The cycle
03887   // must include the original PHI.
03888   bool FoundStartPHI = false;
03889 
  // To recognize min/max patterns formed by an icmp/select sequence, we count
  // the number of instructions we saw from the recognized min/max pattern,
  // to make sure we see exactly those two instructions and no more.
03893   unsigned NumCmpSelectPatternInst = 0;
03894   ReductionInstDesc ReduxDesc(false, nullptr);
03895 
03896   SmallPtrSet<Instruction *, 8> VisitedInsts;
03897   SmallVector<Instruction *, 8> Worklist;
03898   Worklist.push_back(Phi);
03899   VisitedInsts.insert(Phi);
03900 
03901   // A value in the reduction can be used:
03902   //  - By the reduction:
03903   //      - Reduction operation:
03904   //        - One use of reduction value (safe).
03905   //        - Multiple use of reduction value (not safe).
03906   //      - PHI:
03907   //        - All uses of the PHI must be the reduction (safe).
03908   //        - Otherwise, not safe.
03909   //  - By one instruction outside of the loop (safe).
03910   //  - By further instructions outside of the loop (not safe).
03911   //  - By an instruction that is not part of the reduction (not safe).
03912   //    This is either:
03913   //      * An instruction type other than PHI or the reduction operation.
03914   //      * A PHI in the header other than the initial PHI.
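  //
  // For example (illustrative only), 'sum' below forms a valid reduction
  // cycle: the header phi feeds a single add, and only the add's final value
  // is used outside the loop:
  //
  //   int sum = 0;
  //   for (int i = 0; i < n; ++i)
  //     sum += a[i];
  //   use(sum);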
03915   while (!Worklist.empty()) {
03916     Instruction *Cur = Worklist.back();
03917     Worklist.pop_back();
03918 
03919     // No Users.
03920     // If the instruction has no users then this is a broken chain and can't be
03921     // a reduction variable.
03922     if (Cur->use_empty())
03923       return false;
03924 
03925     bool IsAPhi = isa<PHINode>(Cur);
03926 
03927     // A header PHI use other than the original PHI.
03928     if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent())
03929       return false;
03930 
    // Reductions of non-commutative instructions such as Div and Sub are only
    // possible if the LHS is the reduction variable.
03933     if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) &&
03934         !isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) &&
03935         !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0))))
03936       return false;
03937 
03938     // Any reduction instruction must be of one of the allowed kinds.
03939     ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc);
03940     if (!ReduxDesc.IsReduction)
03941       return false;
03942 
03943     // A reduction operation must only have one use of the reduction value.
03944     if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax &&
03945         hasMultipleUsesOf(Cur, VisitedInsts))
03946       return false;
03947 
03948     // All inputs to a PHI node must be a reduction value.
    if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
03950       return false;
03951 
03952     if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(Cur) ||
03953                                      isa<SelectInst>(Cur)))
03954       ++NumCmpSelectPatternInst;
03955     if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) ||
03956                                    isa<SelectInst>(Cur)))
03957       ++NumCmpSelectPatternInst;
03958 
    // Check whether we found a reduction operator.
03960     FoundReduxOp |= !IsAPhi;
03961 
03962     // Process users of current instruction. Push non-PHI nodes after PHI nodes
03963     // onto the stack. This way we are going to have seen all inputs to PHI
03964     // nodes once we get to them.
03965     SmallVector<Instruction *, 8> NonPHIs;
03966     SmallVector<Instruction *, 8> PHIs;
03967     for (User *U : Cur->users()) {
03968       Instruction *UI = cast<Instruction>(U);
03969 
03970       // Check if we found the exit user.
03971       BasicBlock *Parent = UI->getParent();
03972       if (!TheLoop->contains(Parent)) {
        // Exit if we find multiple outside users or if the header phi node is
        // being used. In the latter case the user consumes the value of the
        // previous iteration, and we would lose "VF-1" iterations of the
        // reduction operation if we vectorized.
03977         if (ExitInstruction != nullptr || Cur == Phi)
03978           return false;
03979 
        // The instruction used by an outside user must be the last instruction
        // before we feed back to the reduction phi. Otherwise, we lose VF-1
        // operations on the value.
        if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end())
          return false;
03985 
03986         ExitInstruction = Cur;
03987         continue;
03988       }
03989 
03990       // Process instructions only once (termination). Each reduction cycle
03991       // value must only be used once, except by phi nodes and min/max
03992       // reductions which are represented as a cmp followed by a select.
03993       ReductionInstDesc IgnoredVal(false, nullptr);
03994       if (VisitedInsts.insert(UI).second) {
03995         if (isa<PHINode>(UI))
03996           PHIs.push_back(UI);
03997         else
03998           NonPHIs.push_back(UI);
03999       } else if (!isa<PHINode>(UI) &&
04000                  ((!isa<FCmpInst>(UI) &&
04001                    !isa<ICmpInst>(UI) &&
04002                    !isa<SelectInst>(UI)) ||
04003                   !isMinMaxSelectCmpPattern(UI, IgnoredVal).IsReduction))
04004         return false;
04005 
04006       // Remember that we completed the cycle.
04007       if (UI == Phi)
04008         FoundStartPHI = true;
04009     }
04010     Worklist.append(PHIs.begin(), PHIs.end());
04011     Worklist.append(NonPHIs.begin(), NonPHIs.end());
04012   }
04013 
  // For a min/max reduction we must have seen exactly two pattern
  // instructions (the cmp and the select); anything else means we saw one
  // but not the other, or more than just the pair.
04016   if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) &&
04017       NumCmpSelectPatternInst != 2)
04018     return false;
04019 
04020   if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
04021     return false;
04022 
04023   // We found a reduction var if we have reached the original phi node and we
04024   // only have a single instruction with out-of-loop users.
04025 
04026   // This instruction is allowed to have out-of-loop users.
04027   AllowedExit.insert(ExitInstruction);
04028 
04029   // Save the description of this reduction variable.
04030   ReductionDescriptor RD(RdxStart, ExitInstruction, Kind,
04031                          ReduxDesc.MinMaxKind);
04032   Reductions[Phi] = RD;
  // We've closed the reduction cycle: the chain returns to the original phi
  // and exactly one value escapes the loop.
04035 
04036   return true;
04037 }
04038 
04039 /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
04040 /// pattern corresponding to a min(X, Y) or max(X, Y).
04041 LoopVectorizationLegality::ReductionInstDesc
04042 LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I,
04043                                                     ReductionInstDesc &Prev) {
  assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) &&
         "Expected a cmp or select instruction");
04047   Instruction *Cmp = nullptr;
04048   SelectInst *Select = nullptr;
04049 
04050   // We must handle the select(cmp()) as a single instruction. Advance to the
04051   // select.
04052   if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) {
04053     if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->user_begin())))
04054       return ReductionInstDesc(false, I);
04055     return ReductionInstDesc(Select, Prev.MinMaxKind);
04056   }
04057 
04058   // Only handle single use cases for now.
04059   if (!(Select = dyn_cast<SelectInst>(I)))
04060     return ReductionInstDesc(false, I);
04061   if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) &&
04062       !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0))))
04063     return ReductionInstDesc(false, I);
04064   if (!Cmp->hasOneUse())
04065     return ReductionInstDesc(false, I);
04066 
04067   Value *CmpLeft;
04068   Value *CmpRight;
04069 
04070   // Look for a min/max pattern.
04071   if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
04072     return ReductionInstDesc(Select, MRK_UIntMin);
04073   else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
04074     return ReductionInstDesc(Select, MRK_UIntMax);
04075   else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
04076     return ReductionInstDesc(Select, MRK_SIntMax);
04077   else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
04078     return ReductionInstDesc(Select, MRK_SIntMin);
04079   else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
04080     return ReductionInstDesc(Select, MRK_FloatMin);
04081   else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
04082     return ReductionInstDesc(Select, MRK_FloatMax);
04083   else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
04084     return ReductionInstDesc(Select, MRK_FloatMin);
04085   else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
04086     return ReductionInstDesc(Select, MRK_FloatMax);
04087 
04088   return ReductionInstDesc(false, I);
04089 }
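
// Illustrative example (not from the source): the IR sequence the matcher
// above recognizes as one signed-max reduction step:
//
//   %cmp = icmp sgt i32 %max.phi, %x
//   %max = select i1 %cmp, i32 %max.phi, i32 %x   ; reported as MRK_SIntMax
//
// The cmp must have the select as its only user, so the pair is treated as a
// single reduction instruction.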
04090 
04091 LoopVectorizationLegality::ReductionInstDesc
04092 LoopVectorizationLegality::isReductionInstr(Instruction *I,
04093                                             ReductionKind Kind,
04094                                             ReductionInstDesc &Prev) {
04095   bool FP = I->getType()->isFloatingPointTy();
04096   bool FastMath = FP && I->hasUnsafeAlgebra();
04097   switch (I->getOpcode()) {
04098   default:
04099     return ReductionInstDesc(false, I);
  case Instruction::PHI:
    if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd &&
               Kind != RK_FloatMinMax))
      return ReductionInstDesc(false, I);
    return ReductionInstDesc(I, Prev.MinMaxKind);
04105   case Instruction::Sub:
04106   case Instruction::Add:
04107     return ReductionInstDesc(Kind == RK_IntegerAdd, I);
04108   case Instruction::Mul:
04109     return ReductionInstDesc(Kind == RK_IntegerMult, I);
04110   case Instruction::And:
04111     return ReductionInstDesc(Kind == RK_IntegerAnd, I);
04112   case Instruction::Or:
04113     return ReductionInstDesc(Kind == RK_IntegerOr, I);
04114   case Instruction::Xor:
04115     return ReductionInstDesc(Kind == RK_IntegerXor, I);
04116   case Instruction::FMul:
04117     return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I);
04118   case Instruction::FSub:
04119   case Instruction::FAdd:
04120     return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I);
04121   case Instruction::FCmp:
04122   case Instruction::ICmp:
04123   case Instruction::Select:
04124     if (Kind != RK_IntegerMinMax &&
04125         (!HasFunNoNaNAttr || Kind != RK_FloatMinMax))
04126       return ReductionInstDesc(false, I);
04127     return isMinMaxSelectCmpPattern(I, Prev);
04128   }
04129 }
04130 
04131 LoopVectorizationLegality::InductionKind
04132 LoopVectorizationLegality::isInductionVariable(PHINode *Phi,
04133                                                ConstantInt *&StepValue) {
04134   Type *PhiTy = Phi->getType();
  // We only handle integer and pointer induction variables.
04136   if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
04137     return IK_NoInduction;
04138 
04139   // Check that the PHI is consecutive.
04140   const SCEV *PhiScev = SE->getSCEV(Phi);
04141   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
04142   if (!AR) {
04143     DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
04144     return IK_NoInduction;
04145   }
04146 
04147   const SCEV *Step = AR->getStepRecurrence(*SE);
04148   // Calculate the pointer stride and check if it is consecutive.
04149   const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
04150   if (!C)
04151     return IK_NoInduction;
04152 
04153   ConstantInt *CV = C->getValue();
04154   if (PhiTy->isIntegerTy()) {
04155     StepValue = CV;
04156     return IK_IntInduction;
04157   }
04158 
04159   assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
04160   Type *PointerElementType = PhiTy->getPointerElementType();
04161   // The pointer stride cannot be determined if the pointer element type is not
04162   // sized.
04163   if (!PointerElementType->isSized())
04164     return IK_NoInduction;
04165 
04166   int64_t Size = static_cast<int64_t>(DL->getTypeAllocSize(PointerElementType));
04167   int64_t CVSize = CV->getSExtValue();
04168   if (CVSize % Size)
04169     return IK_NoInduction;
04170   StepValue = ConstantInt::getSigned(CV->getType(), CVSize / Size);
04171   return IK_PtrInduction;
04172 }
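
// Illustrative example (assumption): for a pointer induction over 32-bit
// elements,
//
//   for (int *p = a; p != e; ++p) ...
//
// the SCEV step is 4 bytes per iteration; dividing by the element size
// normalizes StepValue to 1 and the phi is classified as IK_PtrInduction.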
04173 
04174 bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
04175   Value *In0 = const_cast<Value*>(V);
04176   PHINode *PN = dyn_cast_or_null<PHINode>(In0);
04177   if (!PN)
04178     return false;
04179 
04180   return Inductions.count(PN);
04181 }
04182 
04183 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB)  {
04184   return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
04185 }
04186 
04187 bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
04188                                            SmallPtrSetImpl<Value *> &SafePtrs) {
04190   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
04191     // Check that we don't have a constant expression that can trap as operand.
04192     for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
04193          OI != OE; ++OI) {
04194       if (Constant *C = dyn_cast<Constant>(*OI))
04195         if (C->canTrap())
04196           return false;
04197     }
04198     // We might be able to hoist the load.
04199     if (it->mayReadFromMemory()) {
04200       LoadInst *LI = dyn_cast<LoadInst>(it);
04201       if (!LI)
04202         return false;
04203       if (!SafePtrs.count(LI->getPointerOperand())) {
04204         if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) {
04205           MaskedOp.insert(LI);
04206           continue;
04207         }
04208         return false;
04209       }
04210     }
04211 
    // Stores are predicated only in basic blocks with a single predecessor
    // and for provably safe pointers, up to a fixed budget; beyond that we
    // require a legal masked store.
    if (it->mayWriteToMemory()) {
      StoreInst *SI = dyn_cast<StoreInst>(it);
      if (!SI)
        return false;
04219 
04220       bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
      bool isSinglePredecessor =
          SI->getParent()->getSinglePredecessor() != nullptr;

04223       if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
04224           !isSinglePredecessor) {
04225         // Build a masked store if it is legal for the target, otherwise scalarize
04226         // the block.
04227         bool isLegalMaskedOp =
04228           isLegalMaskedStore(SI->getValueOperand()->getType(),
04229                              SI->getPointerOperand());
04230         if (isLegalMaskedOp) {
04231           --NumPredStores;
04232           MaskedOp.insert(SI);
04233           continue;
04234         }
04235         return false;
04236       }
04237     }
04238     if (it->mayThrow())
04239       return false;
04240 
04241     // The instructions below can trap.
04242     switch (it->getOpcode()) {
04243     default: continue;
04244     case Instruction::UDiv:
04245     case Instruction::SDiv:
04246     case Instruction::URem:
04247     case Instruction::SRem:
04248       return false;
04249     }
04250   }
04251 
04252   return true;
04253 }
04254 
04255 LoopVectorizationCostModel::VectorizationFactor
04256 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
  // Width 1 means no vectorization.
04258   VectorizationFactor Factor = { 1U, 0U };
04259   if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
04260     emitAnalysis(VectorizationReport() <<
04261                  "runtime pointer checks needed. Enable vectorization of this "
04262                  "loop with '#pragma clang loop vectorize(enable)' when "
04263                  "compiling with -Os");
04264     DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
04265     return Factor;
04266   }
04267 
04268   if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
04269     emitAnalysis(VectorizationReport() <<
04270                  "store that is conditionally executed prevents vectorization");
04271     DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
04272     return Factor;
04273   }
04274 
04275   // Find the trip count.
04276   unsigned TC = SE->getSmallConstantTripCount(TheLoop);
04277   DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
04278 
04279   unsigned WidestType = getWidestType();
04280   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
04281   unsigned MaxSafeDepDist = -1U;
04282   if (Legal->getMaxSafeDepDistBytes() != -1U)
04283     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
  WidestRegister = std::min(WidestRegister, MaxSafeDepDist);
04286   unsigned MaxVectorSize = WidestRegister / WidestType;
04287   DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
04288   DEBUG(dbgs() << "LV: The Widest register is: "
04289           << WidestRegister << " bits.\n");
04290 
04291   if (MaxVectorSize == 0) {
04292     DEBUG(dbgs() << "LV: The target has no vector registers.\n");
04293     MaxVectorSize = 1;
04294   }
04295 
04296   assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
04297          " into one vector!");
04298 
04299   unsigned VF = MaxVectorSize;
04300 
04301   // If we optimize the program for size, avoid creating the tail loop.
04302   if (OptForSize) {
    // If we are unable to calculate the trip count, or it is smaller than
    // two, don't try to vectorize.
    if (TC < 2) {
      emitAnalysis(VectorizationReport() <<
                   "unable to calculate the loop count due to complex "
                   "control flow");
04308       DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
04309       return Factor;
04310     }
04311 
04312     // Find the maximum SIMD width that can fit within the trip count.
04313     VF = TC % MaxVectorSize;
04314 
04315     if (VF == 0)
04316       VF = MaxVectorSize;
04317 
04318     // If the trip count that we found modulo the vectorization factor is not
04319     // zero then we require a tail.
04320     if (VF < 2) {
04321       emitAnalysis(VectorizationReport() <<
04322                    "cannot optimize for size and vectorize at the "
04323                    "same time. Enable vectorization of this loop "
04324                    "with '#pragma clang loop vectorize(enable)' "
04325                    "when compiling with -Os");
04326       DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
04327       return Factor;
04328     }
04329   }
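
  // Worked example (illustrative): when optimizing for size with TC = 12 and
  // MaxVectorSize = 8, the code above picks VF = 12 % 8 = 4, so three vector
  // iterations cover the trip count exactly; with TC = 16 the remainder is 0
  // and VF is reset to 8.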
04330 
04331   int UserVF = Hints->getWidth();
04332   if (UserVF != 0) {
04333     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
04334     DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
04335 
04336     Factor.Width = UserVF;
04337     return Factor;
04338   }
04339 
04340   float Cost = expectedCost(1);
04341 #ifndef NDEBUG
04342   const float ScalarCost = Cost;
04343 #endif /* NDEBUG */
04344   unsigned Width = 1;
04345   DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
04346 
04347   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
04348   // Ignore scalar width, because the user explicitly wants vectorization.
04349   if (ForceVectorization && VF > 1) {
04350     Width = 2;
04351     Cost = expectedCost(Width) / (float)Width;
04352   }
04353 
  for (unsigned i = 2; i <= VF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
04358     float VectorCost = expectedCost(i) / (float)i;
04359     DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
04360           (int)VectorCost << ".\n");
04361     if (VectorCost < Cost) {
04362       Cost = VectorCost;
04363       Width = i;
04364     }
04365   }
04366 
04367   DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
04368         << "LV: Vectorization seems to be not beneficial, "
04369         << "but was forced by a user.\n");
04370   DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n");
04371   Factor.Width = Width;
04372   Factor.Cost = Width * Cost;
04373   return Factor;
04374 }
04375 
04376 unsigned LoopVectorizationCostModel::getWidestType() {
04377   unsigned MaxWidth = 8;
04378 
04379   // For each block.
04380   for (Loop::block_iterator bb = TheLoop->block_begin(),
04381        be = TheLoop->block_end(); bb != be; ++bb) {
04382     BasicBlock *BB = *bb;
04383 
04384     // For each instruction in the loop.
04385     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
04386       Type *T = it->getType();
04387 
04388       // Ignore ephemeral values.
04389       if (EphValues.count(it))
04390         continue;
04391 
04392       // Only examine Loads, Stores and PHINodes.
04393       if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
04394         continue;
04395 
04396       // Examine PHI nodes that are reduction variables.
04397       if (PHINode *PN = dyn_cast<PHINode>(it))
04398         if (!Legal->getReductionVars()->count(PN))
04399           continue;
04400 
04401       // Examine the stored values.
04402       if (StoreInst *ST = dyn_cast<StoreInst>(it))
04403         T = ST->getValueOperand()->getType();
04404 
04405       // Ignore loaded pointer types and stored pointer types that are not
04406       // consecutive. However, we do want to take consecutive stores/loads of
04407       // pointer vectors into account.
04408       if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
04409         continue;
04410 
04411       MaxWidth = std::max(MaxWidth,
04412                           (unsigned)DL->getTypeSizeInBits(T->getScalarType()));
04413     }
04414   }
04415 
04416   return MaxWidth;
04417 }
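
// Illustrative example (assumption): in a loop that loads i8 values and
// accumulates them into an i32 reduction phi, the scan above returns 32, so
// a 128-bit register yields MaxVectorSize = 128 / 32 = 4 in the caller.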
04418 
04419 unsigned
04420 LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
04421                                                unsigned VF,
04422                                                unsigned LoopCost) {
04424   // -- The unroll heuristics --
04425   // We unroll the loop in order to expose ILP and reduce the loop overhead.
04426   // There are many micro-architectural considerations that we can't predict
04427   // at this level. For example, frontend pressure (on decode or fetch) due to
04428   // code size, or the number and capabilities of the execution ports.
04429   //
04430   // We use the following heuristics to select the unroll factor:
04431   // 1. If the code has reductions, then we unroll in order to break the cross
04432   // iteration dependency.
04433   // 2. If the loop is really small, then we unroll in order to reduce the loop
04434   // overhead.
04435   // 3. We don't unroll if we think that we will spill registers to memory due
04436   // to the increased register pressure.
04437 
04438   // Use the user preference, unless 'auto' is selected.
04439   int UserUF = Hints->getInterleave();
04440   if (UserUF != 0)
04441     return UserUF;
04442 
04443   // When we optimize for size, we don't unroll.
04444   if (OptForSize)
04445     return 1;
04446 
  // If there is a maximum safe dependence distance, the vectorization factor
  // was already chosen to exploit it, so do not unroll further.
04448   if (Legal->getMaxSafeDepDistBytes() != -1U)
04449     return 1;
04450 
04451   // Do not unroll loops with a relatively small trip count.
04452   unsigned TC = SE->getSmallConstantTripCount(TheLoop);
04453   if (TC > 1 && TC < TinyTripCountUnrollThreshold)
04454     return 1;
04455 
04456   unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
04457   DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters <<
04458         " registers\n");
04459 
04460   if (VF == 1) {
04461     if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
04462       TargetNumRegisters = ForceTargetNumScalarRegs;
04463   } else {
04464     if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
04465       TargetNumRegisters = ForceTargetNumVectorRegs;
04466   }
04467 
04468   LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
  // We divide by these values below, so clamp them to at least one: assume
  // there is at least one instruction that uses at least one register.
04471   R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
04472   R.NumInstructions = std::max(R.NumInstructions, 1U);
04473 
04474   // We calculate the unroll factor using the following formula.
04475   // Subtract the number of loop invariants from the number of available
04476   // registers. These registers are used by all of the unrolled instances.
04477   // Next, divide the remaining registers by the number of registers that is
04478   // required by the loop, in order to estimate how many parallel instances
04479   // fit without causing spills. All of this is rounded down if necessary to be
04480   // a power of two. We want power of two unroll factors to simplify any
04481   // addressing operations or alignment considerations.
04482   unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
04483                               R.MaxLocalUsers);
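
  // Worked example (illustrative): with 16 target registers, 2 loop-invariant
  // values, and a peak of 3 simultaneously-live in-loop values, this yields
  // UF = PowerOf2Floor((16 - 2) / 3) = PowerOf2Floor(4) = 4.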
04484 
04485   // Don't count the induction variable as unrolled.
04486   if (EnableIndVarRegisterHeur)
04487     UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
04488                        std::max(1U, (R.MaxLocalUsers - 1)));
04489 
04490   // Clamp the unroll factor ranges to reasonable factors.
04491   unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor();
04492 
04493   // Check if the user has overridden the unroll max.
04494   if (VF == 1) {
04495     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
04496       MaxInterleaveSize = ForceTargetMaxScalarInterleaveFactor;
04497   } else {
04498     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
04499       MaxInterleaveSize = ForceTargetMaxVectorInterleaveFactor;
04500   }
04501 
04502   // If we did not calculate the cost for VF (because the user selected the VF)
04503   // then we calculate the cost of VF here.
04504   if (LoopCost == 0)
04505     LoopCost = expectedCost(VF);
04506 
  // Clamp the calculated UF to be between 1 and the maximum unroll factor
  // that the target allows.
04509   if (UF > MaxInterleaveSize)
04510     UF = MaxInterleaveSize;
04511   else if (UF < 1)
04512     UF = 1;
04513 
04514   // Unroll if we vectorized this loop and there is a reduction that could
04515   // benefit from unrolling.
04516   if (VF > 1 && Legal->getReductionVars()->size()) {
04517     DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
04518     return UF;
04519   }
04520 
04521   // Note that if we've already vectorized the loop we will have done the
04522   // runtime check and so unrolling won't require further checks.
04523   bool UnrollingRequiresRuntimePointerCheck =
04524       (VF == 1 && Legal->getRuntimePointerCheck()->Need);
04525 
04526   // We want to unroll small loops in order to reduce the loop overhead and
04527   // potentially expose ILP opportunities.
04528   DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
04529   if (!UnrollingRequiresRuntimePointerCheck &&
04530       LoopCost < SmallLoopCost) {
04531     // We assume that the cost overhead is 1 and we use the cost model
04532     // to estimate the cost of the loop and unroll until the cost of the
04533     // loop overhead is about 5% of the cost of the loop.
04534     unsigned SmallUF = std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
04535 
04536     // Unroll until store/load ports (estimated by max unroll factor) are
04537     // saturated.
04538     unsigned NumStores = Legal->getNumStores();
04539     unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresUF = UF / (NumStores ? NumStores : 1);
    unsigned LoadsUF = UF / (NumLoads ? NumLoads : 1);
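
    // Worked example (illustrative): with UF = 8, two stores, and one load,
    // StoresUF = 8 / 2 = 4 and LoadsUF = 8 / 1 = 8; if their maximum exceeds
    // SmallUF, it is returned below to saturate the memory ports.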
04542 
04543     // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), unrolling can increase the critical path length if the
    // loop we're unrolling is inside another loop. Limit the factor, by
    // default to 2, so the critical path only gets increased by one reduction
    // operation.
04547     if (Legal->getReductionVars()->size() &&
04548         TheLoop->getLoopDepth() > 1) {
04549       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionUF);
04550       SmallUF = std::min(SmallUF, F);
04551       StoresUF = std::min(StoresUF, F);
04552       LoadsUF = std::min(LoadsUF, F);
04553     }
04554 
04555     if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) {
04556       DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n");
04557       return std::max(StoresUF, LoadsUF);
04558     }
04559 
04560     DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
04561     return SmallUF;
04562   }
04563 
04564   DEBUG(dbgs() << "LV: Not Unrolling.\n");
04565   return 1;
04566 }
04567 
04568 LoopVectorizationCostModel::RegisterUsage
04569 LoopVectorizationCostModel::calculateRegisterUsage() {
04570   // This function calculates the register usage by measuring the highest number
04571   // of values that are alive at a single location. Obviously, this is a very
  // rough estimate. We scan the loop in topological order and
04573   // assign a number to each instruction. We use RPO to ensure that defs are
04574   // met before their users. We assume that each instruction that has in-loop
04575   // users starts an interval. We record every time that an in-loop value is
04576   // used, so we have a list of the first and last occurrences of each
04577   // instruction. Next, we transpose this data structure into a multi map that
04578   // holds the list of intervals that *end* at a specific location. This multi
04579   // map allows us to perform a linear search. We scan the instructions linearly
04580   // and record each time that a new interval starts, by placing it in a set.
04581   // If we find this value in the multi-map then we remove it from the set.
04582   // The max register usage is the maximum size of the set.
04583   // We also search for instructions that are defined outside the loop, but are
04584   // used inside the loop. We need this number separately from the max-interval
04585   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
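  //
  // Illustrative example (assumption): with intervals A = [0,4] and
  // B = [1,3], the measurement at index 2 sees both A and B open, so
  // MaxLocalUsers would be 2.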
04587   LoopBlocksDFS DFS(TheLoop);
04588   DFS.perform(LI);
04589 
04590   RegisterUsage R;
04591   R.NumInstructions = 0;
04592 
04593   // Each 'key' in the map opens a new interval. The values
04594   // of the map are the index of the 'last seen' usage of the
04595   // instruction that is the key.
04596   typedef DenseMap<Instruction*, unsigned> IntervalMap;
  // Maps an index to the instruction at that position in the RPO walk.
  DenseMap<unsigned, Instruction*> IdxToInstr;
04599   // Marks the end of each interval.
04600   IntervalMap EndPoint;
  // Saves the set of instructions that have uses inside the loop.
04602   SmallSet<Instruction*, 8> Ends;
04603   // Saves the list of values that are used in the loop but are
04604   // defined outside the loop, such as arguments and constants.
04605   SmallPtrSet<Value*, 8> LoopInvariants;
04606 
04607   unsigned Index = 0;
04608   for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
04609        be = DFS.endRPO(); bb != be; ++bb) {
04610     R.NumInstructions += (*bb)->size();
04611     for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
04612          ++it) {
04613       Instruction *I = it;
04614       IdxToInstr[Index++] = I;
04615 
04616       // Save the end location of each USE.
04617       for (unsigned i = 0; i < I->getNumOperands(); ++i) {
04618         Value *U = I->getOperand(i);
04619         Instruction *Instr = dyn_cast<Instruction>(U);
04620 
04621         // Ignore non-instruction values such as arguments, constants, etc.
04622         if (!Instr) continue;
04623 
04624         // If this instruction is outside the loop then record it and continue.
04625         if (!TheLoop->contains(Instr)) {
04626           LoopInvariants.insert(Instr);
04627           continue;
04628         }
04629 
04630         // Overwrite previous end points.
04631         EndPoint[Instr] = Index;
04632         Ends.insert(Instr);
04633       }
04634     }
04635   }
04636 
04637   // Saves the list of intervals that end with the index in 'key'.
04638   typedef SmallVector<Instruction*, 2> InstrList;
04639   DenseMap<unsigned, InstrList> TransposeEnds;
04640 
04641   // Transpose the EndPoints to a list of values that end at each index.
04642   for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
04643        it != e; ++it)
04644     TransposeEnds[it->second].push_back(it->first);
04645 
04646   SmallSet<Instruction*, 8> OpenIntervals;
  unsigned MaxUsage = 0;

  DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
04651   for (unsigned int i = 0; i < Index; ++i) {
04652     Instruction *I = IdxToInstr[i];
04653     // Ignore instructions that are never used within the loop.
04654     if (!Ends.count(I)) continue;
04655 
04656     // Ignore ephemeral values.
04657     if (EphValues.count(I))
04658       continue;
04659 
04660     // Remove all of the instructions that end at this location.
04661     InstrList &List = TransposeEnds[i];
04662     for (unsigned int j=0, e = List.size(); j < e; ++j)
04663       OpenIntervals.erase(List[j]);
04664 
    // Count the number of live intervals.
04666     MaxUsage = std::max(MaxUsage, OpenIntervals.size());
04667 
04668     DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
04669           OpenIntervals.size() << '\n');
04670 
04671     // Add the current instruction to the list of open intervals.
04672     OpenIntervals.insert(I);
04673   }
04674 
04675   unsigned Invariant = LoopInvariants.size();
04676   DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n');
04677   DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
04678   DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');
04679 
04680   R.LoopInvariantRegs = Invariant;
04681   R.MaxLocalUsers = MaxUsage;
04682   return R;
04683 }
04684 
04685 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
04686   unsigned Cost = 0;
04687 
04688   // For each block.
04689   for (Loop::block_iterator bb = TheLoop->block_begin(),
04690        be = TheLoop->block_end(); bb != be; ++bb) {
04691     unsigned BlockCost = 0;
04692     BasicBlock *BB = *bb;
04693 
04694     // For each instruction in the old loop.
04695     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
04696       // Skip dbg intrinsics.
04697       if (isa<DbgInfoIntrinsic>(it))
04698         continue;
04699 
04700       // Ignore ephemeral values.
04701       if (EphValues.count(it))
04702         continue;
04703 
04704       unsigned C = getInstructionCost(it, VF);
04705 
04706       // Check if we should override the cost.
04707       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
04708         C = ForceTargetInstructionCost;
04709 
04710       BlockCost += C;
04711       DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
04712             VF << " For instruction: " << *it << '\n');
04713     }
04714 
04715     // We assume that if-converted blocks have a 50% chance of being executed.
    // When the code is scalar, some of the blocks are avoided due to control
    // flow; when the code is vectorized, we execute all code paths.
04718     if (VF == 1 && Legal->blockNeedsPredication(*bb))
04719       BlockCost /= 2;
04720 
04721     Cost += BlockCost;
04722   }
04723 
04724   return Cost;
04725 }
04726 
04727 /// \brief Check whether the address computation for a non-consecutive memory
04728 /// access looks like an unlikely candidate for being merged into the indexing
04729 /// mode.
04730 ///
04731 /// We look for a GEP which has one index that is an induction variable and all
04732 /// other indices are loop invariant. If the stride of this access is also
04733 /// within a small bound we decide that this address computation can likely be
04734 /// merged into the addressing mode.
04735 /// In all other cases, we identify the address computation as complex.
04736 static bool isLikelyComplexAddressComputation(Value *Ptr,
04737                                               LoopVectorizationLegality *Legal,
04738                                               ScalarEvolution *SE,
04739                                               const Loop *TheLoop) {
04740   GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
04741   if (!Gep)
04742     return true;
04743 
04744   // We are looking for a gep with all loop invariant indices except for one
04745   // which should be an induction variable.
04746   unsigned NumOperands = Gep->getNumOperands();
04747   for (unsigned i = 1; i < NumOperands; ++i) {
04748     Value *Opd = Gep->getOperand(i);
04749     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
04750         !Legal->isInductionVariable(Opd))
04751       return true;
04752   }
04753 
04754   // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step
04755   // can likely be merged into the address computation.
04756   unsigned MaxMergeDistance = 64;
04757 
04758   const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
04759   if (!AddRec)
04760     return true;
04761 
04762   // Check the step is constant.
04763   const SCEV *Step = AddRec->getStepRecurrence(*SE);
04764   // Calculate the pointer stride and check if it is consecutive.
04765   const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
04766   if (!C)
04767     return true;
04768 
04769   const APInt &APStepVal = C->getValue()->getValue();
04770 
04771   // Huge step value - give up.
04772   if (APStepVal.getBitWidth() > 64)
04773     return true;
04774 
04775   int64_t StepVal = APStepVal.getSExtValue();
04776 
04777   return StepVal > MaxMergeDistance;
04778 }
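
// Illustrative example (assumption): for 'A[i]', where 'i' is the induction
// phi, the single GEP index is an induction variable and the access has a
// small constant step, so the address computation is likely folded into the
// addressing mode; for 'A[B[i]]' the index 'B[i]' is neither loop-invariant
// nor an induction phi, so the computation is classified as complex.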
04779 
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
  return Legal->hasStride(I->getOperand(0)) ||
         Legal->hasStride(I->getOperand(1));
}
04785 
04786 unsigned
04787 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
04788   // If we know that this instruction will remain uniform, check the cost of
04789   // the scalar version.
04790   if (Legal->isUniformAfterVectorization(I))
04791     VF = 1;
04792 
04793   Type *RetTy = I->getType();
04794   Type *VectorTy = ToVectorTy(RetTy, VF);
04795 
04796   // TODO: We need to estimate the cost of intrinsic calls.
04797   switch (I->getOpcode()) {
04798   case Instruction::GetElementPtr:
04799     // We mark this instruction as zero-cost because the cost of GEPs in
04800     // vectorized code depends on whether the corresponding memory instruction
04801     // is scalarized or not. Therefore, we handle GEPs with the memory
04802     // instruction cost.
04803     return 0;
04804   case Instruction::Br: {
04805     return TTI.getCFInstrCost(I->getOpcode());
04806   }
04807   case Instruction::PHI:
04808     //TODO: IF-converted IFs become selects.
04809     return 0;
04810   case Instruction::Add:
04811   case Instruction::FAdd:
04812   case Instruction::Sub:
04813   case Instruction::FSub:
04814   case Instruction::Mul:
04815   case Instruction::FMul:
04816   case Instruction::UDiv:
04817   case Instruction::SDiv:
04818   case Instruction::FDiv:
04819   case Instruction::URem:
04820   case Instruction::SRem:
04821   case Instruction::FRem:
04822   case Instruction::Shl:
04823   case Instruction::LShr:
04824   case Instruction::AShr:
04825   case Instruction::And:
04826   case Instruction::Or:
04827   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
04829     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
04830       return 0;
04831     // Certain instructions can be cheaper to vectorize if they have a constant
04832     // second vector operand. One example of this are shifts on x86.
04833     TargetTransformInfo::OperandValueKind Op1VK =
04834       TargetTransformInfo::OK_AnyValue;
04835     TargetTransformInfo::OperandValueKind Op2VK =
04836       TargetTransformInfo::OK_AnyValue;
04837     TargetTransformInfo::OperandValueProperties Op1VP =
04838         TargetTransformInfo::OP_None;
04839     TargetTransformInfo::OperandValueProperties Op2VP =
04840         TargetTransformInfo::OP_None;
04841     Value *Op2 = I->getOperand(1);
04842 
    // Check for a splat of a constant, or for a non-uniform vector of
    // constants.
    if (ConstantInt *CInt = dyn_cast<ConstantInt>(Op2)) {
      if (CInt->getValue().isPowerOf2())
        Op2VP = TargetTransformInfo::OP_PowerOf2;
      Op2VK = TargetTransformInfo::OK_UniformConstantValue;
    } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
      Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
      if (Constant *SplatValue = cast<Constant>(Op2)->getSplatValue()) {
        ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
        if (CInt && CInt->getValue().isPowerOf2())
          Op2VP = TargetTransformInfo::OP_PowerOf2;
        Op2VK = TargetTransformInfo::OK_UniformConstantValue;
      }
    }
04859 
04860     return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
04861                                       Op1VP, Op2VP);
04862   }
04863   case Instruction::Select: {
04864     SelectInst *SI = cast<SelectInst>(I);
04865     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
04866     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
04867     Type *CondTy = SI->getCondition()->getType();
04868     if (!ScalarCond)
04869       CondTy = VectorType::get(CondTy, VF);
04870 
04871     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
04872   }
04873   case Instruction::ICmp:
04874   case Instruction::FCmp: {
04875     Type *ValTy = I->getOperand(0)->getType();
04876     VectorTy = ToVectorTy(ValTy, VF);
04877     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
04878   }
04879   case Instruction::Store:
04880   case Instruction::Load: {
04881     StoreInst *SI = dyn_cast<StoreInst>(I);
04882     LoadInst *LI = dyn_cast<LoadInst>(I);
04883     Type *ValTy = (SI ? SI->getValueOperand()->getType() :
04884                    LI->getType());
04885     VectorTy = ToVectorTy(ValTy, VF);
04886 
04887     unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
04888     unsigned AS = SI ? SI->getPointerAddressSpace() :
04889       LI->getPointerAddressSpace();
04890     Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
04891     // We add the cost of address computation here instead of with the gep
04892     // instruction because only here we know whether the operation is
04893     // scalarized.
04894     if (VF == 1)
04895       return TTI.getAddressComputationCost(VectorTy) +
04896         TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
04897 
04898     // Scalarized loads/stores.
04899     int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
04900     bool Reverse = ConsecutiveStride < 0;
04901     unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
04902     unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF;
04903     if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
04904       bool IsComplexComputation =
04905         isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
04906       unsigned Cost = 0;
04907       // The cost of extracting from the value vector and pointer vector.
04908       Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
04909       for (unsigned i = 0; i < VF; ++i) {
04910         //  The cost of extracting the pointer operand.
04911         Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
04912         // In case of STORE, the cost of ExtractElement from the vector.
04913         // In case of LOAD, the cost of InsertElement into the returned
04914         // vector.
04915         Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :
04916                                             Instruction::InsertElement,
04917                                             VectorTy, i);
04918       }
04919 
04920       // The cost of the scalar loads/stores.
04921       Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
04922       Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
04923                                        Alignment, AS);
04924       return Cost;
04925     }
04926 
04927     // Wide load/stores.
04928     unsigned Cost = TTI.getAddressComputationCost(VectorTy);
04929     if (Legal->isMaskRequired(I))
04930       Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment,
04931                                         AS);
04932     else
04933       Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
04934 
04935     if (Reverse)
04936       Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
04937                                   VectorTy, 0);
04938     return Cost;
04939   }
04940   case Instruction::ZExt:
04941   case Instruction::SExt:
04942   case Instruction::FPToUI:
04943   case Instruction::FPToSI:
04944   case Instruction::FPExt:
04945   case Instruction::PtrToInt:
04946   case Instruction::IntToPtr:
04947   case Instruction::SIToFP:
04948   case Instruction::UIToFP:
04949   case Instruction::Trunc:
04950   case Instruction::FPTrunc:
04951   case Instruction::BitCast: {
    // We optimize the truncation of induction variables: the cost of these
    // is the same as the scalar operation.
04954     if (I->getOpcode() == Instruction::Trunc &&
04955         Legal->isInductionVariable(I->getOperand(0)))
04956       return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
04957                                   I->getOperand(0)->getType());
04958 
04959     Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
04960     return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
04961   }
04962   case Instruction::Call: {
04963     CallInst *CI = cast<CallInst>(I);
04964     Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
04965     assert(ID && "Not an intrinsic call!");
04966     Type *RetTy = ToVectorTy(CI->getType(), VF);
04967     SmallVector<Type*, 4> Tys;
04968     for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
04969       Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
04970     return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
04971   }
04972   default: {
04973     // We are scalarizing the instruction. Return the cost of the scalar
04974     // instruction, plus the cost of insert and extract into vector
04975     // elements, times the vector width.
04976     unsigned Cost = 0;
04977 
04978     if (!RetTy->isVoidTy() && VF != 1) {
04979       unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
04980                                                 VectorTy);
04981       unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
04982                                                 VectorTy);
04983 
04984       // The cost of inserting the results plus extracting each one of the
04985       // operands.
04986       Cost += VF * (InsCost + ExtCost * I->getNumOperands());
04987     }
04988 
04989     // The cost of executing VF copies of the scalar instruction. This opcode
04990     // is unknown. Assume that it is the same as 'mul'.
04991     Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
04992     return Cost;
04993   }
04994   }// end of switch.
04995 }
04996 
04997 Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
04998   if (Scalar->isVoidTy() || VF == 1)
04999     return Scalar;
05000   return VectorType::get(Scalar, VF);
05001 }
05002 
05003 char LoopVectorize::ID = 0;
05004 static const char lv_name[] = "Loop Vectorization";
05005 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
05006 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
05007 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
05008 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
05009 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)
05010 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
05011 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
05012 INITIALIZE_PASS_DEPENDENCY(LCSSA)
05013 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
05014 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
05015 INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
05016 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
05017 
05018 namespace llvm {
05019   Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
05020     return new LoopVectorize(NoUnrolling, AlwaysVectorize);
05021   }
05022 }
05023 
05024 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
05025   // Check for a store.
05026   if (StoreInst *ST = dyn_cast<StoreInst>(Inst))
05027     return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;
05028 
05029   // Check for a load.
05030   if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
05031     return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;
05032 
05033   return false;
05034 }
05035 
void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
05038                                              bool IfPredicateStore) {
05039   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
  // Holds vector parameters, or scalars in the case of uniform values.
05041   SmallVector<VectorParts, 4> Params;
05042 
05043   setDebugLocFromInst(Builder, Instr);
05044 
05045   // Find all of the vectorized parameters.
05046   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
05047     Value *SrcOp = Instr->getOperand(op);
05048 
05049     // If we are accessing the old induction variable, use the new one.
05050     if (SrcOp == OldInduction) {
05051       Params.push_back(getVectorValue(SrcOp));
05052       continue;
05053     }
05054 
05055     // Try using previously calculated values.
05056     Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
05057 
05058     // If the src is an instruction that appeared earlier in the basic block
05059     // then it should already be vectorized.
05060     if (SrcInst && OrigLoop->contains(SrcInst)) {
05061       assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
05062       // The parameter is a vector value from earlier.
05063       Params.push_back(WidenMap.get(SrcInst));
05064     } else {
05065       // The parameter is a scalar from outside the loop. Maybe even a constant.
05066       VectorParts Scalars;
05067       Scalars.append(UF, SrcOp);
05068       Params.push_back(Scalars);
05069     }
05070   }
05071 
05072   assert(Params.size() == Instr->getNumOperands() &&
05073          "Invalid number of operands");
05074 
  // Does this instruction return a value?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Value *UndefVec = IsVoidRetTy ? nullptr
                                : UndefValue::get(Instr->getType());
05080   // Create a new entry in the WidenMap and initialize it to Undef or Null.
05081   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
05082 
05083   Instruction *InsertPt = Builder.GetInsertPoint();
05084   BasicBlock *IfBlock = Builder.GetInsertBlock();
05085   BasicBlock *CondBlock = nullptr;
05086 
05087   VectorParts Cond;
05088   Loop *VectorLp = nullptr;
05089   if (IfPredicateStore) {
05090     assert(Instr->getParent()->getSinglePredecessor() &&
05091            "Only support single predecessor blocks");
05092     Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
05093                           Instr->getParent());
05094     VectorLp = LI->getLoopFor(IfBlock);
05095     assert(VectorLp && "Must have a loop for this block");
05096   }
05097 
05098   // For each vector unroll 'part':
05099   for (unsigned Part = 0; Part < UF; ++Part) {
05100     // For each scalar that we create:
05101 
05102     // Start an "if (pred) a[i] = ..." block.
05103     Value *Cmp = nullptr;
05104     if (IfPredicateStore) {
05105       if (Cond[Part]->getType()->isVectorTy())
05106         Cond[Part] =
05107             Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
05108       Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
05109                                ConstantInt::get(Cond[Part]->getType(), 1));
05110       CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
05111       LoopVectorBody.push_back(CondBlock);
05112       VectorLp->addBasicBlockToLoop(CondBlock, *LI);
05113       // Update Builder with newly created basic block.
05114       Builder.SetInsertPoint(InsertPt);
05115     }
05116 
    Instruction *Cloned = Instr->clone();
    if (!IsVoidRetTy)
      Cloned->setName(Instr->getName() + ".cloned");
    // Replace the operands of the cloned instruction with extracted scalars.
    for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
      Value *Op = Params[op][Part];
      Cloned->setOperand(op, Op);
    }

    // Place the cloned scalar in the new loop.
    Builder.Insert(Cloned);

    // If the original scalar returns a value we need to place it in a vector
    // so that future users will be able to use it.
    if (!IsVoidRetTy)
      VecResults[Part] = Cloned;

    // End if-block.
    if (IfPredicateStore) {
      BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
      LoopVectorBody.push_back(NewIfBlock);
      VectorLp->addBasicBlockToLoop(NewIfBlock, *LI);
      Builder.SetInsertPoint(InsertPt);
      Instruction *OldBr = IfBlock->getTerminator();
      BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
      OldBr->eraseFromParent();
      IfBlock = NewIfBlock;
    }
  }
05146 }
05147 
05148 void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
05149   StoreInst *SI = dyn_cast<StoreInst>(Instr);
05150   bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));
05151 
05152   return scalarizeInstruction(Instr, IfPredicateStore);
05153 }
05154 
05155 Value *InnerLoopUnroller::reverseVector(Value *Vec) {
05156   return Vec;
05157 }
05158 
05159 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) {
05160   return V;
05161 }
05162 
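// Illustrative example (assumption): when unrolling by UF = 4 with VF = 1 and
// Step = 1, getStepVector below is invoked with StartIdx values 0..3 and
// produces the four scalar induction values 'i + 0' through 'i + 3', one per
// unrolled part.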
05163 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) {
05164   // When unrolling and the VF is 1, we only need to add a simple scalar.
05165   Type *ITy = Val->getType();
05166   assert(!ITy->isVectorTy() && "Val must be a scalar");
05167   Constant *C = ConstantInt::get(ITy, StartIdx);
05168   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
05169 }