LLVM API Documentation

LoopVectorize.cpp
Go to the documentation of this file.
00001 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
00011 // and generates target-independent LLVM-IR.
00012 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
00013 // of instructions in order to estimate the profitability of vectorization.
00014 //
00015 // The loop vectorizer combines consecutive loop iterations into a single
00016 // 'wide' iteration. After this transformation the index is incremented
00017 // by the SIMD vector width, and not by one.
00018 //
00019 // This pass has four parts:
00020 // 1. The main loop pass that drives the different parts.
00021 // 2. LoopVectorizationLegality - A unit that checks for the legality
00022 //    of the vectorization.
00023 // 3. InnerLoopVectorizer - A unit that performs the actual
00024 //    widening of instructions.
00025 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
00026 //    of vectorization. It decides on the optimal vector width, which
00027 //    can be one, if vectorization is not profitable.
00028 //
00029 //===----------------------------------------------------------------------===//
00030 //
00031 // The reduction-variable vectorization is based on the paper:
00032 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
00033 //
00034 // Variable uniformity checks are inspired by:
00035 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
00036 //
00037 // Other ideas/concepts are from:
00038 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
00039 //
00040 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
00041 //  Vectorizing Compilers.
00042 //
00043 //===----------------------------------------------------------------------===//
00044 
00045 #define LV_NAME "loop-vectorize"
00046 #define DEBUG_TYPE LV_NAME
00047 
00048 #include "llvm/Transforms/Vectorize.h"
00049 #include "llvm/ADT/DenseMap.h"
00050 #include "llvm/ADT/MapVector.h"
00051 #include "llvm/ADT/SmallPtrSet.h"
00052 #include "llvm/ADT/SmallSet.h"
00053 #include "llvm/ADT/SmallVector.h"
00054 #include "llvm/ADT/StringExtras.h"
00055 #include "llvm/Analysis/AliasAnalysis.h"
00056 #include "llvm/Analysis/AliasSetTracker.h"
00057 #include "llvm/Analysis/Dominators.h"
00058 #include "llvm/Analysis/LoopInfo.h"
00059 #include "llvm/Analysis/LoopIterator.h"
00060 #include "llvm/Analysis/LoopPass.h"
00061 #include "llvm/Analysis/ScalarEvolution.h"
00062 #include "llvm/Analysis/ScalarEvolutionExpander.h"
00063 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
00064 #include "llvm/Analysis/TargetTransformInfo.h"
00065 #include "llvm/Analysis/ValueTracking.h"
00066 #include "llvm/Analysis/Verifier.h"
00067 #include "llvm/IR/Constants.h"
00068 #include "llvm/IR/DataLayout.h"
00069 #include "llvm/IR/DerivedTypes.h"
00070 #include "llvm/IR/Function.h"
00071 #include "llvm/IR/IRBuilder.h"
00072 #include "llvm/IR/Instructions.h"
00073 #include "llvm/IR/IntrinsicInst.h"
00074 #include "llvm/IR/LLVMContext.h"
00075 #include "llvm/IR/Module.h"
00076 #include "llvm/IR/Type.h"
00077 #include "llvm/IR/Value.h"
00078 #include "llvm/Pass.h"
00079 #include "llvm/Support/CommandLine.h"
00080 #include "llvm/Support/Debug.h"
00081 #include "llvm/Support/PatternMatch.h"
00082 #include "llvm/Support/raw_ostream.h"
00083 #include "llvm/Support/ValueHandle.h"
00084 #include "llvm/Target/TargetLibraryInfo.h"
00085 #include "llvm/Transforms/Scalar.h"
00086 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
00087 #include "llvm/Transforms/Utils/Local.h"
00088 #include <algorithm>
00089 #include <map>
00090 
00091 using namespace llvm;
00092 using namespace llvm::PatternMatch;
00093 
00094 static cl::opt<unsigned>
00095 VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
00096                     cl::desc("Sets the SIMD width. Zero is autoselect."));
00097 
00098 static cl::opt<unsigned>
00099 VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden,
00100                     cl::desc("Sets the vectorization unroll count. "
00101                              "Zero is autoselect."));
00102 
00103 static cl::opt<bool>
00104 EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
00105                    cl::desc("Enable if-conversion during vectorization."));
00106 
00107 /// We don't vectorize loops with a known constant trip count below this number.
00108 static cl::opt<unsigned>
00109 TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
00110                              cl::Hidden,
00111                              cl::desc("Don't vectorize loops with a constant "
00112                                       "trip count that is smaller than this "
00113                                       "value."));
00114 
00115 /// We don't unroll loops with a known constant trip count below this number.
00116 static const unsigned TinyTripCountUnrollThreshold = 128;
00117 
00118 /// When performing memory disambiguation checks at runtime do not make more
00119 /// than this number of comparisons.
00120 static const unsigned RuntimeMemoryCheckThreshold = 8;
00121 
00122 /// We use metadata with this name to indicate that a scalar loop was
00123 /// vectorized and that we don't need to re-vectorize it if we run into it
00124 /// again.
00125 static const char*
00126 AlreadyVectorizedMDName = "llvm.vectorizer.already_vectorized";
00127 
00128 namespace {
00129 
00130 // Forward declarations.
00131 class LoopVectorizationLegality;
00132 class LoopVectorizationCostModel;
00133 
00134 /// InnerLoopVectorizer vectorizes loops which contain only one basic
00135 /// block to a specified vectorization factor (VF).
00136 /// This class performs the widening of scalars into vectors, or multiple
00137 /// scalars. This class also implements the following features:
00138 /// * It inserts an epilogue loop for handling loops that don't have iteration
00139 ///   counts that are known to be a multiple of the vectorization factor.
00140 /// * It handles the code generation for reduction variables.
00141 /// * Scalarization (implementation using scalars) of un-vectorizable
00142 ///   instructions.
00143 /// InnerLoopVectorizer does not perform any vectorization-legality
00144 /// checks, and relies on the caller to check for the different legality
00145 /// aspects. The InnerLoopVectorizer relies on the
00146 /// LoopVectorizationLegality class to provide information about the induction
00147 /// and reduction variables that were found for a given vectorization factor.
00148 class InnerLoopVectorizer {
00149 public:
00150   InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
00151                       DominatorTree *DT, DataLayout *DL,
00152                       const TargetLibraryInfo *TLI, unsigned VecWidth,
00153                       unsigned UnrollFactor)
00154       : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
00155         VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
00156         OldInduction(0), WidenMap(UnrollFactor) {}
00157 
00158   // Perform the actual loop widening (vectorization).
00159   void vectorize(LoopVectorizationLegality *Legal) {
00160     // Create a new empty loop. Unlink the old loop and connect the new one.
00161     createEmptyLoop(Legal);
00162     // Widen each instruction in the old loop to a new one in the new loop.
00163     // Use the Legality module to find the induction and reduction variables.
00164     vectorizeLoop(Legal);
00165     // Register the new loop and update the analysis passes.
00166     updateAnalysis();
00167   }
00168 
00169 private:
00170   /// A small list of PHINodes.
00171   typedef SmallVector<PHINode*, 4> PhiVector;
00172   /// When we unroll loops we have multiple vector values for each scalar.
00173   /// This data structure holds the unrolled and vectorized values that
00174   /// originated from one scalar instruction.
00175   typedef SmallVector<Value*, 2> VectorParts;
00176 
00177   /// Add code that checks at runtime if the accessed arrays overlap.
00178   /// Returns the comparator value or NULL if no check is needed.
00179   Instruction *addRuntimeCheck(LoopVectorizationLegality *Legal,
00180                                Instruction *Loc);
00181   /// Create an empty loop, based on the loop ranges of the old loop.
00182   void createEmptyLoop(LoopVectorizationLegality *Legal);
00183   /// Copy and widen the instructions from the old loop.
00184   void vectorizeLoop(LoopVectorizationLegality *Legal);
00185 
00186   /// A helper function that computes the predicate of the block BB, assuming
00187   /// that the header block of the loop is set to True. It returns the *entry*
00188   /// mask for the block BB.
00189   VectorParts createBlockInMask(BasicBlock *BB);
00190   /// A helper function that computes the predicate of the edge between SRC
00191   /// and DST.
00192   VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
00193 
00194   /// A helper function to vectorize a single BB within the innermost loop.
00195   void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
00196                             PhiVector *PV);
00197 
00198   /// Insert the new loop to the loop hierarchy and pass manager
00199   /// and update the analysis passes.
00200   void updateAnalysis();
00201 
00202   /// This instruction is un-vectorizable. Implement it as a sequence
00203   /// of scalars.
00204   void scalarizeInstruction(Instruction *Instr);
00205 
00206   /// Vectorize Load and Store instructions.
00207   void vectorizeMemoryInstruction(Instruction *Instr,
00208                                   LoopVectorizationLegality *Legal);
00209 
00210   /// Create a broadcast instruction. This method generates a broadcast
00211   /// instruction (shuffle) for loop invariant values and for the induction
00212   /// value. If this is the induction variable then we extend it to N, N+1, ...
00213   /// this is needed because each iteration in the loop corresponds to a SIMD
00214   /// element.
00215   Value *getBroadcastInstrs(Value *V);
00216 
00217   /// This function adds a sequence of consecutive integers to the elements of
00218   /// Val. If Negate is set, negative numbers are added, e.g. (0, -1, -2, ...).
00219   /// The sequence starts at StartIdx.
00220   Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
00221 
00222   /// When we go over instructions in the basic block we rely on previous
00223   /// values within the current basic block or on loop invariant values.
00224   /// When we widen (vectorize) values we place them in the map. If the values
00225   /// are not within the map, they have to be loop invariant, so we simply
00226   /// broadcast them into a vector.
00227   VectorParts &getVectorValue(Value *V);
00228 
00229   /// Generate a shuffle sequence that will reverse the vector Vec.
00230   Value *reverseVector(Value *Vec);
00231 
00232   /// This is a helper class that holds the vectorizer state. It maps scalar
00233   /// instructions to vector instructions. When the code is 'unrolled',
00234   /// a single scalar value is mapped to multiple vector parts. The parts
00235   /// are stored in the VectorParts type.
00236   struct ValueMap {
00237     /// C'tor.  UnrollFactor controls the number of vectors ('parts') that
00238     /// are mapped.
00239     ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
00240 
00241     /// \return True if 'Key' is saved in the Value Map.
00242     bool has(Value *Key) const { return MapStorage.count(Key); }
00243 
00244     /// Initializes a new entry in the map. Sets all of the vector parts to the
00245     /// same value in 'Val'.
00246     /// \return A reference to a vector with splat values.
00247     VectorParts &splat(Value *Key, Value *Val) {
00248       VectorParts &Entry = MapStorage[Key];
00249       Entry.assign(UF, Val);
00250       return Entry;
00251     }
00252 
00253     /// \return A reference to the value that is stored at 'Key'.
00254     VectorParts &get(Value *Key) {
00255       VectorParts &Entry = MapStorage[Key];
00256       if (Entry.empty())
00257         Entry.resize(UF);
00258       assert(Entry.size() == UF);
00259       return Entry;
00260     }
00261 
00262   private:
00263     /// The unroll factor. Each entry in the map stores this number of vector
00264     /// elements.
00265     unsigned UF;
00266 
00267     /// Map storage. We use std::map and not DenseMap because insertions to a
00268     /// dense map invalidate its iterators.
00269     std::map<Value *, VectorParts> MapStorage;
00270   };
00271 
00272   /// The original loop.
00273   Loop *OrigLoop;
00274   /// Scev analysis to use.
00275   ScalarEvolution *SE;
00276   /// Loop Info.
00277   LoopInfo *LI;
00278   /// Dominator Tree.
00279   DominatorTree *DT;
00280   /// Data Layout.
00281   DataLayout *DL;
00282   /// Target Library Info.
00283   const TargetLibraryInfo *TLI;
00284 
00285   /// The vectorization SIMD factor to use. Each vector will have this many
00286   /// vector elements.
00287   unsigned VF;
00288   /// The vectorization unroll factor to use. Each scalar is vectorized to this
00289   /// many different vector instructions.
00290   unsigned UF;
00291 
00292   /// The builder that we use.
00293   IRBuilder<> Builder;
00294 
00295   // --- Vectorization state ---
00296 
00297   /// The vector-loop preheader.
00298   BasicBlock *LoopVectorPreHeader;
00299   /// The scalar-loop preheader.
00300   BasicBlock *LoopScalarPreHeader;
00301   /// Middle Block between the vector and the scalar.
00302   BasicBlock *LoopMiddleBlock;
00303   /// The ExitBlock of the scalar loop.
00304   BasicBlock *LoopExitBlock;
00305   /// The vector loop body.
00306   BasicBlock *LoopVectorBody;
00307   /// The scalar loop body.
00308   BasicBlock *LoopScalarBody;
00309   /// A list of all bypass blocks. The first block is the entry of the loop.
00310   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
00311 
00312   /// The new Induction variable which was added to the new block.
00313   PHINode *Induction;
00314   /// The induction variable of the old basic block.
00315   PHINode *OldInduction;
00316   /// Holds the extended (to the widest induction type) start index.
00317   Value *ExtendedIdx;
00318   /// Maps scalars to widened vectors.
00319   ValueMap WidenMap;
00320 };
00321 
00322 /// \brief Check if conditionally executed loads are hoistable.
00323 ///
00324 /// This class has two functions: isHoistableLoad and canHoistAllLoads.
00325 /// isHoistableLoad should be called on all load instructions that are executed
00326 /// conditionally. After all conditional loads are processed, the client should
00327 /// call canHoistAllLoads to determine if all of the conditionally executed
00328 /// loads have an unconditional memory access to the same address in the loop.
00329 class LoadHoisting {
00330   typedef SmallPtrSet<Value *, 8> MemorySet;
00331 
00332   Loop *TheLoop;
00333   DominatorTree *DT;
00334   MemorySet CondLoadAddrSet;
00335 
00336 public:
00337   LoadHoisting(Loop *L, DominatorTree *D) : TheLoop(L), DT(D) {}
00338 
00339   /// \brief Check if the instruction is a load with an identifiable address.
00340   bool isHoistableLoad(Instruction *L);
00341 
00342   /// \brief Check if all of the conditional loads are hoistable because there
00343   /// exists an unconditional memory access to the same address in the loop.
00344   bool canHoistAllLoads();
00345 };
00346 
00347 bool LoadHoisting::isHoistableLoad(Instruction *L) {
00348   // Only loads qualify; remember the pointer operand of each accepted load.
00349   if (LoadInst *Load = dyn_cast<LoadInst>(L)) {
00350     CondLoadAddrSet.insert(Load->getPointerOperand());
00351     return true;
00352   }
00353   return false;
00354 }
00355 
00356 static void addMemAccesses(BasicBlock *BB, SmallPtrSet<Value *, 8> &Set) {
00357   // Record the pointer operand of every load and every store found in BB.
00358   for (BasicBlock::iterator It = BB->begin(), End = BB->end(); It != End; ++It)
00359     if (LoadInst *Load = dyn_cast<LoadInst>(It))
00360       Set.insert(Load->getPointerOperand());
00361     else if (StoreInst *Store = dyn_cast<StoreInst>(It))
00362       Set.insert(Store->getPointerOperand());
00363 }
00364 
00365 bool LoadHoisting::canHoistAllLoads() {
00366   // No conditional load was registered, so there is nothing to check.
00367   if (CondLoadAddrSet.empty())
00368     return true;
00369 
00370   BasicBlock *Latch = TheLoop->getLoopLatch();
00371   std::vector<BasicBlock*> &Blocks = TheLoop->getBlocksVector();
00372   MemorySet AlwaysAccessed;
00373 
00374   // Gather the addresses accessed in the unconditional blocks, i.e. the
00375   // latch itself and every block that dominates it.
00376   for (std::vector<BasicBlock*>::iterator BI = Blocks.begin(),
00377        BE = Blocks.end(); BI != BE; ++BI) {
00378     BasicBlock *BB = *BI;
00379     if (BB == Latch || DT->dominates(BB, Latch))
00380       addMemAccesses(BB, AlwaysAccessed);
00381   }
00382 
00383   // Every conditional load address needs a matching unconditional access;
00384   // the first address without one means the loads cannot all be hoisted.
00385   MemorySet::iterator AI = CondLoadAddrSet.begin();
00386   MemorySet::iterator AE = CondLoadAddrSet.end();
00387   while (AI != AE) {
00388     if (!AlwaysAccessed.count(*AI))
00389       return false;
00390     ++AI;
00391   }
00392   return true;
00393 }
00394 
00395 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
00396 /// to what vectorization factor.
00397 /// This class does not look at the profitability of vectorization, only the
00398 /// legality. This class has two main kinds of checks:
00399 /// * Memory checks - The code in canVectorizeMemory checks if vectorization
00400 ///   will change the order of memory accesses in a way that will change the
00401 ///   correctness of the program.
00402 /// * Scalar checks - The code in canVectorizeInstrs and canVectorizeMemory
00403 ///   checks for a number of different conditions, such as the availability of
00404 ///   a single induction variable, that all types are supported and
00405 ///   vectorize-able, etc. This reflects the capabilities of InnerLoopVectorizer.
00406 /// This class is also used by InnerLoopVectorizer for identifying the
00407 /// induction variable and the different reduction variables.
00408 class LoopVectorizationLegality {
00409 public:
00410   LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
00411                             DominatorTree *DT, TargetTransformInfo* TTI,
00412                             AliasAnalysis *AA, TargetLibraryInfo *TLI)
00413       : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI),
00414         Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
00415         LoadSpeculation(L, DT) {}
00416 
00417   /// This enum represents the kinds of reductions that we support.
00418   enum ReductionKind {
00419     RK_NoReduction, ///< Not a reduction.
00420     RK_IntegerAdd,  ///< Sum of integers.
00421     RK_IntegerMult, ///< Product of integers.
00422     RK_IntegerOr,   ///< Bitwise or logical OR of numbers.
00423     RK_IntegerAnd,  ///< Bitwise or logical AND of numbers.
00424     RK_IntegerXor,  ///< Bitwise or logical XOR of numbers.
00425     RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()).
00426     RK_FloatAdd,    ///< Sum of floats.
00427     RK_FloatMult,   ///< Product of floats.
00428     RK_FloatMinMax  ///< Min/max implemented in terms of select(cmp()).
00429   };
00430 
00431   /// This enum represents the kinds of inductions that we support.
00432   enum InductionKind {
00433     IK_NoInduction,         ///< Not an induction variable.
00434     IK_IntInduction,        ///< Integer induction variable. Step = 1.
00435     IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1.
00436     IK_PtrInduction,        ///< Pointer induction var. Step = sizeof(elem).
00437     IK_ReversePtrInduction  ///< Reverse ptr indvar. Step = - sizeof(elem).
00438   };
00439 
00440   /// This enum represents the kinds of min/max reductions that we support.
00441   enum MinMaxReductionKind {
00442     MRK_Invalid,
00443     MRK_UIntMin,
00444     MRK_UIntMax,
00445     MRK_SIntMin,
00446     MRK_SIntMax,
00447     MRK_FloatMin,
00448     MRK_FloatMax
00449   };
00450 
00451   /// This POD struct holds information about reduction variables.
00452   struct ReductionDescriptor {
00453     ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
00454       Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}
00455 
00456     ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K,
00457                         MinMaxReductionKind MK)
00458         : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {}
00459 
00460     // The starting value of the reduction.
00461     // It does not have to be zero!
00462     TrackingVH<Value> StartValue;
00463     // The instruction whose value is used outside the loop.
00464     Instruction *LoopExitInstr;
00465     // The kind of the reduction.
00466     ReductionKind Kind;
00467     // If this is a min/max reduction, the kind of min/max reduction.
00468     MinMaxReductionKind MinMaxKind;
00469   };
00470 
00471   /// This POD struct holds information about a potential reduction operation.
00472   struct ReductionInstDesc {
00473     ReductionInstDesc(bool IsRedux, Instruction *I) :
00474       IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {}
00475 
00476     ReductionInstDesc(Instruction *I, MinMaxReductionKind K) :
00477       IsReduction(true), PatternLastInst(I), MinMaxKind(K) {}
00478 
00479     // Is this instruction a reduction candidate.
00480     bool IsReduction;
00481     // The last instruction in a min/max pattern (select of the select(icmp())
00482     // pattern), or the current reduction instruction otherwise.
00483     Instruction *PatternLastInst;
00484     // If this is a min/max pattern the comparison predicate.
00485     MinMaxReductionKind MinMaxKind;
00486   };
00487 
00488   // This POD struct holds information about the memory runtime legality
00489   // check that a group of pointers do not overlap.
00490   struct RuntimePointerCheck {
00491     RuntimePointerCheck() : Need(false) {}
00492 
00493     /// Reset the state of the pointer runtime information.
00494     void reset() {
00495       Need = false;
00496       Pointers.clear();
00497       Starts.clear();
00498       Ends.clear();
00499     }
00500 
00501     /// Insert a pointer and calculate the start and end SCEVs.
00502     void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr);
00503 
00504     /// This flag indicates if we need to add the runtime check.
00505     bool Need;
00506     /// Holds the pointers that we need to check.
00507     SmallVector<TrackingVH<Value>, 2> Pointers;
00508     /// Holds the pointer value at the beginning of the loop.
00509     SmallVector<const SCEV*, 2> Starts;
00510     /// Holds the pointer value at the end of the loop.
00511     SmallVector<const SCEV*, 2> Ends;
00512     /// Holds the information if this pointer is used for writing to memory.
00513     SmallVector<bool, 2> IsWritePtr;
00514   };
00515 
00516   /// A POD for saving information about induction variables.
00517   struct InductionInfo {
00518     InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
00519     InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
00520     /// Start value.
00521     TrackingVH<Value> StartValue;
00522     /// Induction kind.
00523     InductionKind IK;
00524   };
00525 
00526   /// ReductionList contains the reduction descriptors for all
00527   /// of the reductions that were found in the loop.
00528   typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
00529 
00530   /// InductionList saves induction variables and maps them to the
00531   /// induction descriptor.
00532   typedef MapVector<PHINode*, InductionInfo> InductionList;
00533 
00534   /// Alias(Multi)Map stores the values (GEPs or underlying objects) and their
00535   /// respective Store/Load instruction(s) to calculate aliasing.
00536   typedef MapVector<Value*, Instruction* > AliasMap;
00537   typedef DenseMap<Value*, std::vector<Instruction*> > AliasMultiMap;
00538 
00539   /// Returns true if it is legal to vectorize this loop.
00540   /// This does not mean that it is profitable to vectorize this
00541   /// loop, only that it is legal to do so.
00542   bool canVectorize();
00543 
00544   /// Returns the Induction variable.
00545   PHINode *getInduction() { return Induction; }
00546 
00547   /// Returns the reduction variables found in the loop.
00548   ReductionList *getReductionVars() { return &Reductions; }
00549 
00550   /// Returns the induction variables found in the loop.
00551   InductionList *getInductionVars() { return &Inductions; }
00552 
00553   /// Returns the widest induction type.
00554   Type *getWidestInductionType() { return WidestIndTy; }
00555 
00556   /// Returns True if V is an induction variable in this loop.
00557   bool isInductionVariable(const Value *V);
00558 
00559   /// Return true if the block BB needs to be predicated in order for the loop
00560   /// to be vectorized.
00561   bool blockNeedsPredication(BasicBlock *BB);
00562 
00563   /// Check if this pointer is consecutive when vectorizing. This happens
00564   /// when the last index of the GEP is the induction variable, or when the
00565   /// pointer itself is an induction variable.
00566   /// This check allows us to vectorize A[idx] into a wide load/store.
00567   /// Returns:
00568   /// 0 - Stride is unknown or non consecutive.
00569   /// 1 - Address is consecutive.
00570   /// -1 - Address is consecutive, and decreasing.
00571   int isConsecutivePtr(Value *Ptr);
00572 
00573   /// Returns true if the value V is uniform within the loop.
00574   bool isUniform(Value *V);
00575 
00576   /// Returns true if this instruction will remain scalar after vectorization.
00577   bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }
00578 
00579   /// Returns the information that we collected about runtime memory check.
00580   RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }
00581 
00582   /// This function returns the identity element (or neutral element) for
00583   /// the operation K.
00584   static Constant *getReductionIdentity(ReductionKind K, Type *Tp);
00585 private:
00586   /// Check if a single basic block loop is vectorizable.
00587   /// At this point we know that this is a loop with a constant trip count
00588   /// and we only need to check individual instructions.
00589   bool canVectorizeInstrs();
00590 
00591   /// When we vectorize loops we may change the order in which
00592   /// we read and write from memory. This method checks if it is
00593   /// legal to vectorize the code, considering only memory constraints.
00594   /// Returns true if the loop is vectorizable.
00595   bool canVectorizeMemory();
00596 
00597   /// Return true if we can vectorize this loop using the IF-conversion
00598   /// transformation.
00599   bool canVectorizeWithIfConvert();
00600 
00601   /// Collect the variables that need to stay uniform after vectorization.
00602   void collectLoopUniforms();
00603 
00604   /// Return true if all of the instructions in the block can be speculatively
00605   /// executed.
00606   bool blockCanBePredicated(BasicBlock *BB);
00607 
00608   /// Returns True, if 'Phi' is the kind of reduction variable for type
00609   /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
00610   bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
00611   /// Returns a struct describing if the instruction 'I' can be a reduction
00612   /// variable of type 'Kind'. If the reduction is a min/max pattern of
00613   /// select(icmp()) this function advances the instruction pointer 'I' from the
00614   /// compare instruction to the select instruction and stores this pointer in
00615   /// 'PatternLastInst' member of the returned struct.
00616   ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind,
00617                                      ReductionInstDesc &Desc);
00618   /// Returns a struct describing if 'I' is a Select(ICmp(X, Y), X, Y)
00619   /// instruction pattern corresponding to a min(X, Y) or max(X, Y).
00620   static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I,
00621                                                     ReductionInstDesc &Prev);
00622   /// Returns the induction kind of Phi. This function may return NoInduction
00623   /// if the PHI is not an induction variable.
00624   InductionKind isInductionVariable(PHINode *Phi);
00625   /// Return true if we can compute the address bounds of Ptr within the loop.
00626   bool hasComputableBounds(Value *Ptr);
00627   /// Return true if there is a chance of write reorder.
00628   bool hasPossibleGlobalWriteReorder(Value *Object,
00629                                      Instruction *Inst,
00630                                      AliasMultiMap &WriteObjects,
00631                                      unsigned MaxByteWidth);
00632   /// Return the AA location for a load or a store.
00633   AliasAnalysis::Location getLoadStoreLocation(Instruction *Inst);
00634 
00635 
00636   /// The loop that we evaluate.
00637   Loop *TheLoop;
00638   /// Scev analysis.
00639   ScalarEvolution *SE;
00640   /// DataLayout analysis.
00641   DataLayout *DL;
00642   /// Dominators.
00643   DominatorTree *DT;
00644   /// Target Info.
00645   TargetTransformInfo *TTI;
00646   /// Alias Analysis.
00647   AliasAnalysis *AA;
00648   /// Target Library Info.
00649   TargetLibraryInfo *TLI;
00650 
00651   //  ---  vectorization state --- //
00652 
00653   /// Holds the integer induction variable. This is the counter of the
00654   /// loop.
00655   PHINode *Induction;
00656   /// Holds the reduction variables.
00657   ReductionList Reductions;
00658   /// Holds all of the induction variables that we found in the loop.
00659   /// Notice that inductions don't need to start at zero and that induction
00660   /// variables can be pointers.
00661   InductionList Inductions;
00662   /// Holds the widest induction type encountered.
00663   Type *WidestIndTy;
00664 
00665   /// Allowed outside users. This holds the reduction
00666   /// vars which can be accessed from outside the loop.
00667   SmallPtrSet<Value*, 4> AllowedExit;
00668   /// This set holds the variables which are known to be uniform after
00669   /// vectorization.
00670   SmallPtrSet<Instruction*, 4> Uniforms;
00671   /// We need to check that all of the pointers in this list are disjoint
00672   /// at runtime.
00673   RuntimePointerCheck PtrRtCheck;
00674   /// Can we assume the absence of NaNs.
00675   bool HasFunNoNaNAttr;
00676 
00677   /// Utility to determine whether loads can be speculated.
00678   LoadHoisting LoadSpeculation;
00679 };
00680 
00681 /// LoopVectorizationCostModel - estimates the expected speedups due to
00682 /// vectorization.
00683 /// In many cases vectorization is not profitable. This can happen because of
00684 /// a number of reasons. In this class we mainly attempt to predict the
00685 /// expected speedup/slowdowns due to the supported instruction set. We use the
00686 /// TargetTransformInfo to query the different backends for the cost of
00687 /// different operations.
00688 class LoopVectorizationCostModel {
00689 public:
        /// Records the loop, the analyses and the legality result that the cost
        /// queries below consult. Nothing is computed at construction time.
00690   LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
00691                              LoopVectorizationLegality *Legal,
00692                              const TargetTransformInfo &TTI,
00693                              DataLayout *DL, const TargetLibraryInfo *TLI)
00694       : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {}
00695 
00696   /// Information about vectorization costs
00697   struct VectorizationFactor {
00698     unsigned Width; // Vector width with best cost
00699     unsigned Cost; // Cost of the loop with that width
00700   };
00701   /// \return The most profitable vectorization factor and the cost of that VF.
00702   /// This method checks every power of two up to VF. If UserVF is not ZERO
00703   /// then this vectorization factor will be selected if vectorization is
00704   /// possible.
00705   VectorizationFactor selectVectorizationFactor(bool OptForSize,
00706                                                 unsigned UserVF);
00707 
00708   /// \return The size (in bits) of the widest type in the code that
00709   /// needs to be vectorized. We ignore values that remain scalar such as
00710   /// 64 bit loop indices.
00711   unsigned getWidestType();
00712 
00713   /// \return The most profitable unroll factor.
00714   /// If UserUF is non-zero then this method finds the best unroll-factor
00715   /// based on register pressure and other parameters.
00716   /// VF and LoopCost are the selected vectorization factor and the cost of the
00717   /// selected VF.
        /// NOTE(review): by analogy with UserVF above, a non-zero UserUF is
        /// presumably a user override and the heuristic runs when it is zero;
        /// confirm against the definition before relying on the wording above.
00718   unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF, unsigned VF,
00719                               unsigned LoopCost);
00720 
00721   /// \brief A struct that represents some properties of the register usage
00722   /// of a loop.
00723   struct RegisterUsage {
00724     /// Holds the number of loop invariant values that are used in the loop.
00725     unsigned LoopInvariantRegs;
00726     /// Holds the maximum number of concurrent live intervals in the loop.
00727     unsigned MaxLocalUsers;
00728     /// Holds the number of instructions in the loop.
00729     unsigned NumInstructions;
00730   };
00731 
00732   /// \return Information about the register usage of the loop.
00733   RegisterUsage calculateRegisterUsage();
00734 
00735 private:
00736   /// Returns the expected execution cost. The unit of the cost does
00737   /// not matter because we use the 'cost' units to compare different
00738   /// vector widths. The cost that is returned is *not* normalized by
00739   /// the factor width.
00740   unsigned expectedCost(unsigned VF);
00741 
00742   /// Returns the execution time cost of an instruction for a given vector
00743   /// width. Vector width of one means scalar.
00744   unsigned getInstructionCost(Instruction *I, unsigned VF);
00745 
00746   /// A helper function for converting Scalar types to vector types.
00747   /// If the incoming type is void, we return void. If the VF is 1, we return
00748   /// the scalar type.
00749   static Type* ToVectorTy(Type *Scalar, unsigned VF);
00750 
00751   /// Returns whether the instruction is a load or store and will be emitted
00752   /// as a vector operation.
00753   bool isConsecutiveLoadOrStore(Instruction *I);
00754 
00755   /// The loop that we evaluate.
00756   Loop *TheLoop;
00757   /// Scev analysis.
00758   ScalarEvolution *SE;
00759   /// Loop Info analysis.
00760   LoopInfo *LI;
00761   /// Vectorization legality.
00762   LoopVectorizationLegality *Legal;
00763   /// Vector target information.
00764   const TargetTransformInfo &TTI;
00765   /// Target data layout information.
00766   DataLayout *DL;
00767   /// Target Library Info.
00768   const TargetLibraryInfo *TLI;
00769 };
00770 
00771 /// The LoopVectorize Pass.
      /// For every innermost loop it checks legality, asks the cost model for a
      /// vectorization factor and an unroll factor, and then widens the loop with
      /// InnerLoopVectorizer.
00772 struct LoopVectorize : public LoopPass {
00773   /// Pass identification, replacement for typeid
00774   static char ID;
00775 
00776   explicit LoopVectorize() : LoopPass(ID) {
00777     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
00778   }
00779 
        // Analyses used by the pass. They are (re)fetched at the top of every
        // runOnLoop call. DL, AA and TLI are obtained with
        // getAnalysisIfAvailable and may therefore be null.
00780   ScalarEvolution *SE;
00781   DataLayout *DL;
00782   LoopInfo *LI;
00783   TargetTransformInfo *TTI;
00784   DominatorTree *DT;
00785   AliasAnalysis *AA;
00786   TargetLibraryInfo *TLI;
00787 
        /// Try to vectorize the loop \p L.
        /// \return true if the loop was vectorized. Returns false without
        /// touching the IR when the loop is not innermost, no DataLayout is
        /// available, the legality check fails, the function carries the
        /// NoImplicitFloat attribute, or the selected vector width is one.
00788   virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
00789     // We only vectorize innermost loops.
00790     if (!L->empty())
00791       return false;
00792 
00793     SE = &getAnalysis<ScalarEvolution>();
00794     DL = getAnalysisIfAvailable<DataLayout>();
00795     LI = &getAnalysis<LoopInfo>();
00796     TTI = &getAnalysis<TargetTransformInfo>();
00797     DT = &getAnalysis<DominatorTree>();
00798     AA = getAnalysisIfAvailable<AliasAnalysis>();
00799     TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
00800 
00801     if (DL == NULL) {
            // Terminate the message with a newline so the next debug line does
            // not get appended to it.
00802       DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout\n");
00803       return false;
00804     }
00805 
00806     DEBUG(dbgs() << "LV: Checking a loop in \"" <<
00807           L->getHeader()->getParent()->getName() << "\"\n");
00808 
00809     // Check if it is legal to vectorize the loop.
00810     LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA, TLI);
00811     if (!LVL.canVectorize()) {
00812       DEBUG(dbgs() << "LV: Not vectorizing.\n");
00813       return false;
00814     }
00815 
00816     // Use the cost model.
00817     LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI);
00818 
00819     // Check the function attributes to find out if this function should be
00820     // optimized for size.
00821     Function *F = L->getHeader()->getParent();
00822     Attribute::AttrKind SzAttr = Attribute::OptimizeForSize;
00823     Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat;
00824     unsigned FnIndex = AttributeSet::FunctionIndex;
00825     bool OptForSize = F->getAttributes().hasAttribute(FnIndex, SzAttr);
00826     bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr);
00827 
00828     if (NoFloat) {
            // The two literals are concatenated by the compiler; the trailing
            // space keeps "NoImplicitFloat" and "attribute" apart.
00829       DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat "
00830             "attribute is used.\n");
00831       return false;
00832     }
00833 
00834     // Select the optimal vectorization factor.
00835     LoopVectorizationCostModel::VectorizationFactor VF;
00836     VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
00837     // Select the unroll factor.
00838     unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll,
00839                                         VF.Width, VF.Cost);
00840 
          // A width of one means the cost model prefers the scalar loop.
00841     if (VF.Width == 1) {
00842       DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
00843       return false;
00844     }
00845 
00846     DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<<
00847           F->getParent()->getModuleIdentifier()<<"\n");
00848     DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
00849 
00850     // If we decided that it is *legal* to vectorize the loop then do it.
00851     InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
00852     LB.vectorize(&LVL);
00853 
00854     DEBUG(verifyFunction(*L->getHeader()->getParent()));
00855     return true;
00856   }
00857 
        /// Declare the analyses this pass requires (LoopSimplify, LCSSA,
        /// DominatorTree, LoopInfo, ScalarEvolution, TargetTransformInfo) and
        /// the ones it preserves (LoopInfo, DominatorTree).
00858   virtual void getAnalysisUsage(AnalysisUsage &AU) const {
00859     LoopPass::getAnalysisUsage(AU);
00860     AU.addRequiredID(LoopSimplifyID);
00861     AU.addRequiredID(LCSSAID);
00862     AU.addRequired<DominatorTree>();
00863     AU.addRequired<LoopInfo>();
00864     AU.addRequired<ScalarEvolution>();
00865     AU.addRequired<TargetTransformInfo>();
00866     AU.addPreserved<LoopInfo>();
00867     AU.addPreserved<DominatorTree>();
00868   }
00869 
00870 };
00871 
00872 } // end anonymous namespace
00873 
00874 //===----------------------------------------------------------------------===//
00875 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
00876 // LoopVectorizationCostModel.
00877 //===----------------------------------------------------------------------===//
00878 
      /// Record \p Ptr as a pointer that needs a runtime overlap check.
      /// Appends one entry to each of the parallel lists Pointers, Starts, Ends
      /// and IsWritePtr. The pointer's SCEV must be an add-recurrence.
00879 void
00880 LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
00881                                                        Loop *Lp, Value *Ptr,
00882                                                        bool WritePtr) {
00883   const SCEV *Sc = SE->getSCEV(Ptr);
00884   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
00885   assert(AR && "Invalid addrec expression");
        // The end of the accessed range is the recurrence evaluated at the
        // exit count of the loop latch.
00886   const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch());
00887   const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
00888   Pointers.push_back(Ptr);
00889   Starts.push_back(AR->getStart());
00890   Ends.push_back(ScEnd);
00891   IsWritePtr.push_back(WritePtr);
00892 }
00893 
      /// Create a vector with VF lanes that all hold the scalar \p V.
      /// When \p V is invariant in the original loop (and is not an instruction
      /// of the new vector body) the splat is emitted in the new preheader;
      /// otherwise it is emitted at the current insertion point.
      /// \return The splat value, named "broadcast".
00894 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
00895   // Save the current insertion location.
00896   Instruction *Loc = Builder.GetInsertPoint();
00897 
00898   // We need to place the broadcast of invariant variables outside the loop.
00899   Instruction *Instr = dyn_cast<Instruction>(V);
        // Instructions that live in the new vector body are never hoisted, even
        // if the original loop reports them as invariant.
00900   bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
00901   bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
00902 
00903   // Place the code for broadcasting invariant variables in the new preheader.
00904   if (Invariant)
00905     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
00906 
00907   // Broadcast the scalar into all locations in the vector.
00908   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
00909 
00910   // Restore the builder insertion point.
00911   if (Invariant)
00912     Builder.SetInsertPoint(Loc);
00913 
00914   return Shuf;
00915 }
00916 
      /// Add a vector of consecutive per-lane offsets to the integer vector
      /// \p Val. Lane i receives StartIdx + i, or StartIdx - i when \p Negate
      /// is set.
      /// \return The resulting add, named "induction".
00917 Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx,
00918                                                  bool Negate) {
00919   assert(Val->getType()->isVectorTy() && "Must be a vector");
00920   assert(Val->getType()->getScalarType()->isIntegerTy() &&
00921          "Elem must be an integer");
00922   // Create the types.
00923   Type *ITy = Val->getType()->getScalarType();
00924   VectorType *Ty = cast<VectorType>(Val->getType());
00925   int VLen = Ty->getNumElements();
00926   SmallVector<Constant*, 8> Indices;
00927 
00928   // Create one constant per lane: consecutive numbers starting at StartIdx,
        // counting up, or counting down when Negate is set.
00929   for (int i = 0; i < VLen; ++i) {
00930     int64_t Idx = Negate ? (-i) : i;
          // Negate is also passed as the last ConstantInt::get argument
          // (presumably the signedness flag - confirm against the ConstantInt API).
00931     Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate));
00932   }
00933 
00934   // Add the consecutive indices to the vector value.
00935   Constant *Cv = ConstantVector::get(Indices);
00936   assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
00937   return Builder.CreateAdd(Val, Cv, "induction");
00938 }
00939 
00940 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
00941   assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
00942   // Make sure that the pointer does not point to structs.
00943   if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType())
00944     return 0;
00945 
00946   // If this value is a pointer induction variable we know it is consecutive.
00947   PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
00948   if (Phi && Inductions.count(Phi)) {
00949     InductionInfo II = Inductions[Phi];
00950     if (IK_PtrInduction == II.IK)
00951       return 1;
00952     else if (IK_ReversePtrInduction == II.IK)
00953       return -1;
00954   }
00955 
00956   GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
00957   if (!Gep)
00958     return 0;
00959 
00960   unsigned NumOperands = Gep->getNumOperands();
00961   Value *LastIndex = Gep->getOperand(NumOperands - 1);
00962 
00963   Value *GpPtr = Gep->getPointerOperand();
00964   // If this GEP value is a consecutive pointer induction variable and all of
00965   // the indices are constant then we know it is consecutive. We can
00966   Phi = dyn_cast<PHINode>(GpPtr);
00967   if (Phi && Inductions.count(Phi)) {
00968 
00969     // Make sure that the pointer does not point to structs.
00970     PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
00971     if (GepPtrType->getElementType()->isAggregateType())
00972       return 0;
00973 
00974     // Make sure that all of the index operands are loop invariant.
00975     for (unsigned i = 1; i < NumOperands; ++i)
00976       if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
00977         return 0;
00978 
00979     InductionInfo II = Inductions[Phi];
00980     if (IK_PtrInduction == II.IK)
00981       return 1;
00982     else if (IK_ReversePtrInduction == II.IK)
00983       return -1;
00984   }
00985 
00986   // Check that all of the gep indices are uniform except for the last.
00987   for (unsigned i = 0; i < NumOperands - 1; ++i)
00988     if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
00989       return 0;
00990 
00991   // We can emit wide load/stores only if the last index is the induction
00992   // variable.
00993   const SCEV *Last = SE->getSCEV(LastIndex);
00994   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
00995     const SCEV *Step = AR->getStepRecurrence(*SE);
00996 
00997     // The memory is consecutive because the last index is consecutive
00998     // and all other indices are loop invariant.
00999     if (Step->isOne())
01000       return 1;
01001     if (Step->isAllOnesValue())
01002       return -1;
01003   }
01004 
01005   return 0;
01006 }
01007 
      /// \return true if the SCEV expression of \p V is invariant in TheLoop,
      /// i.e. the value is the same for every iteration.
01008 bool LoopVectorizationLegality::isUniform(Value *V) {
01009   return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
01010 }
01011 
      /// Look up the widened form of the scalar \p V.
      /// \return The VectorParts already stored in WidenMap for \p V, or, when
      /// there is no entry yet, a new entry in which every part is a broadcast
      /// of \p V.
01012 InnerLoopVectorizer::VectorParts&
01013 InnerLoopVectorizer::getVectorValue(Value *V) {
01014   assert(V != Induction && "The new induction variable should not be used.");
01015   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
01016 
01017   // If we have this scalar in the map, return it.
01018   if (WidenMap.has(V))
01019     return WidenMap.get(V);
01020 
01021   // If this scalar is unknown, assume that it is a constant or that it is
01022   // loop invariant. Broadcast V and save the value for future uses.
01023   Value *B = getBroadcastInstrs(V);
01024   return WidenMap.splat(V, B);
01025 }
01026 
      /// Reverse the order of the lanes of the vector \p Vec.
      /// \return A shufflevector, named "reverse", whose lane i holds lane
      /// VF - 1 - i of \p Vec.
01027 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
01028   assert(Vec->getType()->isVectorTy() && "Invalid type");
01029   SmallVector<Constant*, 8> ShuffleMask;
        // Build the mask <VF-1, VF-2, ..., 0>.
01030   for (unsigned i = 0; i < VF; ++i)
01031     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
01032 
        // The second shuffle operand is undef; the mask only selects from Vec.
01033   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
01034                                      ConstantVector::get(ShuffleMask),
01035                                      "reverse");
01036 }
01037 
01038 
01039 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
01040                                              LoopVectorizationLegality *Legal) {
01041   // Attempt to issue a wide load.
01042   LoadInst *LI = dyn_cast<LoadInst>(Instr);
01043   StoreInst *SI = dyn_cast<StoreInst>(Instr);
01044 
01045   assert((LI || SI) && "Invalid Load/Store instruction");
01046 
01047   Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
01048   Type *DataTy = VectorType::get(ScalarDataTy, VF);
01049   Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
01050   unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
01051 
01052   unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
01053   unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
01054 
01055   if (ScalarAllocatedSize != VectorElementSize)
01056     return scalarizeInstruction(Instr);
01057 
01058   // If the pointer is loop invariant or if it is non consecutive,
01059   // scalarize the load.
01060   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
01061   bool Reverse = ConsecutiveStride < 0;
01062   bool UniformLoad = LI && Legal->isUniform(Ptr);
01063   if (!ConsecutiveStride || UniformLoad)
01064     return scalarizeInstruction(Instr);
01065 
01066   Constant *Zero = Builder.getInt32(0);
01067   VectorParts &Entry = WidenMap.get(Instr);
01068 
01069   // Handle consecutive loads/stores.
01070   GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
01071   if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
01072     Value *PtrOperand = Gep->getPointerOperand();
01073     Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
01074     FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
01075 
01076     // Create the new GEP with the new induction variable.
01077     GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
01078     Gep2->setOperand(0, FirstBasePtr);
01079     Gep2->setName("gep.indvar.base");
01080     Ptr = Builder.Insert(Gep2);
01081   } else if (Gep) {
01082     assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
01083                                OrigLoop) && "Base ptr must be invariant");
01084 
01085     // The last index does not have to be the induction. It can be
01086     // consecutive and be a function of the index. For example A[I+1];
01087     unsigned NumOperands = Gep->getNumOperands();
01088 
01089     Value *LastGepOperand = Gep->getOperand(NumOperands - 1);
01090     VectorParts &GEPParts = getVectorValue(LastGepOperand);
01091     Value *LastIndex = GEPParts[0];
01092     LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
01093 
01094     // Create the new GEP with the new induction variable.
01095     GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
01096     Gep2->setOperand(NumOperands - 1, LastIndex);
01097     Gep2->setName("gep.indvar.idx");
01098     Ptr = Builder.Insert(Gep2);
01099   } else {
01100     // Use the induction element ptr.
01101     assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
01102     VectorParts &PtrVal = getVectorValue(Ptr);
01103     Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
01104   }
01105 
01106   // Handle Stores:
01107   if (SI) {
01108     assert(!Legal->isUniform(SI->getPointerOperand()) &&
01109            "We do not allow storing to uniform addresses");
01110 
01111     VectorParts &StoredVal = getVectorValue(SI->getValueOperand());
01112     for (unsigned Part = 0; Part < UF; ++Part) {
01113       // Calculate the pointer for the specific unroll-part.
01114       Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
01115 
01116       if (Reverse) {
01117         // If we store to reverse consecutive memory locations then we need
01118         // to reverse the order of elements in the stored value.
01119         StoredVal[Part] = reverseVector(StoredVal[Part]);
01120         // If the address is consecutive but reversed, then the
01121         // wide store needs to start at the last vector element.
01122         PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
01123         PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
01124       }
01125 
01126       Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
01127       Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment);
01128     }
01129   }
01130 
01131   for (unsigned Part = 0; Part < UF; ++Part) {
01132     // Calculate the pointer for the specific unroll-part.
01133     Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
01134 
01135     if (Reverse) {
01136       // If the address is consecutive but reversed, then the
01137       // wide store needs to start at the last vector element.
01138       PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
01139       PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
01140     }
01141 
01142     Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
01143     Value *LI = Builder.CreateLoad(VecPtr, "wide.load");
01144     cast<LoadInst>(LI)->setAlignment(Alignment);
01145     Entry[Part] = Reverse ? reverseVector(LI) :  LI;
01146   }
01147 }
01148 
      /// Replicate \p Instr once per vector lane instead of widening it.
      /// For each of the UF unroll parts and each of the VF lanes the instruction
      /// is cloned, its operands are replaced by the matching extracted lane (or
      /// by the unchanged scalar for values from outside the loop), and, when it
      /// produces a value, the result is inserted into the lane of a result
      /// vector stored in the WidenMap entry of \p Instr.
01149 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
01150   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
01151   // Holds vector parameters or scalars, in case of uniform vals.
01152   SmallVector<VectorParts, 4> Params;
01153 
01154   // Find all of the vectorized parameters.
01155   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
01156     Value *SrcOp = Instr->getOperand(op);
01157 
01158     // If we are accessing the old induction variable, use the new one.
01159     if (SrcOp == OldInduction) {
01160       Params.push_back(getVectorValue(SrcOp));
01161       continue;
01162     }
01163 
01164     // Try using previously calculated values.
01165     Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
01166 
01167     // If the src is an instruction that appeared earlier in the basic block
01168     // then it should already be vectorized.
01169     if (SrcInst && OrigLoop->contains(SrcInst)) {
01170       assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
01171       // The parameter is a vector value from earlier.
01172       Params.push_back(WidenMap.get(SrcInst));
01173     } else {
01174       // The parameter is a scalar from outside the loop. Maybe even a constant.
01175       VectorParts Scalars;
01176       Scalars.append(UF, SrcOp);
01177       Params.push_back(Scalars);
01178     }
01179   }
01180 
01181   assert(Params.size() == Instr->getNumOperands() &&
01182          "Invalid number of operands");
01183 
01184   // Does this instruction return a value ?
01185   bool IsVoidRetTy = Instr->getType()->isVoidTy();
01186 
01187   Value *UndefVec = IsVoidRetTy ? 0 :
01188     UndefValue::get(VectorType::get(Instr->getType(), VF));
01189   // Create a new entry in the WidenMap and initialize it to Undef or Null.
01190   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
01191 
01192   // For each vector unroll 'part':
01193   for (unsigned Part = 0; Part < UF; ++Part) {
01194     // For each scalar that we create:
01195     for (unsigned Width = 0; Width < VF; ++Width) {
01196       Instruction *Cloned = Instr->clone();
01197       if (!IsVoidRetTy)
01198         Cloned->setName(Instr->getName() + ".cloned");
01199       // Replace the operands of the cloned instructions with extracted scalars.
01200       for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
01201         Value *Op = Params[op][Part];
01202         // Param is a vector. Need to extract the right lane.
01203         if (Op->getType()->isVectorTy())
01204           Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
01205         Cloned->setOperand(op, Op);
01206       }
01207 
01208       // Place the cloned scalar in the new loop.
01209       Builder.Insert(Cloned);
01210 
01211       // If the original scalar returns a value we need to place it in a vector
01212       // so that future users will be able to use it.
01213       if (!IsVoidRetTy)
01214         VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
01215                                                        Builder.getInt32(Width));
01216     }
01217   }
01218 }
01219 
      /// Emit, before \p Loc, the code that tests at run time whether any two
      /// checked pointer ranges overlap. Pairs in which neither pointer is
      /// written are skipped.
      /// \return The instruction holding the combined "a conflict was found"
      /// condition, or null if no check is needed or no pair had to be compared.
01220 Instruction *
01221 InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
01222                                      Instruction *Loc) {
01223   LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
01224   Legal->getRuntimePointerCheck();
01225 
01226   if (!PtrRtCheck->Need)
01227     return NULL;
01228 
01229   Instruction *MemoryRuntimeCheck = 0;
01230   unsigned NumPointers = PtrRtCheck->Pointers.size();
        // Expanded start/end address of each checked pointer, indexed like
        // PtrRtCheck->Pointers.
01231   SmallVector<Value* , 2> Starts;
01232   SmallVector<Value* , 2> Ends;
01233 
01234   SCEVExpander Exp(*SE, "induction");
01235 
01236   // Use this type for pointer arithmetic.
01237   Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0);
01238 
01239   for (unsigned i = 0; i < NumPointers; ++i) {
01240     Value *Ptr = PtrRtCheck->Pointers[i];
01241     const SCEV *Sc = SE->getSCEV(Ptr);
01242 
          // A loop invariant pointer covers a single address: start == end.
01243     if (SE->isLoopInvariant(Sc, OrigLoop)) {
01244       DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
01245             *Ptr <<"\n");
01246       Starts.push_back(Ptr);
01247       Ends.push_back(Ptr);
01248     } else {
01249       DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
01250 
01251       Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
01252       Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
01253       Starts.push_back(Start);
01254       Ends.push_back(End);
01255     }
01256   }
01257 
01258   IRBuilder<> ChkBuilder(Loc);
01259 
        // Compare every unordered pair of pointers once.
01260   for (unsigned i = 0; i < NumPointers; ++i) {
01261     for (unsigned j = i+1; j < NumPointers; ++j) {
01262       // No need to check if two readonly pointers intersect.
01263       if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j])
01264         continue;
01265 
01266       Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
01267       Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
01268       Value *End0 =   ChkBuilder.CreateBitCast(Ends[i],   PtrArithTy, "bc");
01269       Value *End1 =   ChkBuilder.CreateBitCast(Ends[j],   PtrArithTy, "bc");
01270 
            // The ranges [Start0, End0] and [Start1, End1] overlap iff
            // Start0 <= End1 and Start1 <= End0 (unsigned comparison).
01271       Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
01272       Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
01273       Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
            // OR this pair's result into the running result of earlier pairs.
01274       if (MemoryRuntimeCheck)
01275         IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
01276                                          "conflict.rdx");
01277 
01278       MemoryRuntimeCheck = cast<Instruction>(IsConflict);
01279     }
01280   }
01281 
01282   return MemoryRuntimeCheck;
01283 }
01284 
01285 void
01286 InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
01287   /*
01288    In this function we generate a new loop. The new loop will contain
01289    the vectorized instructions while the old loop will continue to run the
01290    scalar remainder.
01291 
01292        [ ] <-- vector loop bypass (may consist of multiple blocks).
01293      /  |
01294     /   v
01295    |   [ ]     <-- vector pre header.
01296    |    |
01297    |    v
01298    |   [  ] \
01299    |   [  ]_|   <-- vector loop.
01300    |    |
01301     \   v
01302       >[ ]   <--- middle-block.
01303      /  |
01304     /   v
01305    |   [ ]     <--- new preheader.
01306    |    |
01307    |    v
01308    |   [ ] \
01309    |   [ ]_|   <-- old scalar loop to handle remainder.
01310     \   |
01311      \  v
01312       >[ ]     <-- exit block.
01313    ...
01314    */
01315 
01316   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
01317   BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
01318   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
01319   assert(ExitBlock && "Must have an exit block");
01320 
01321   // Mark the old scalar loop with metadata that tells us not to vectorize this
01322   // loop again if we run into it.
01323   MDNode *MD = MDNode::get(OldBasicBlock->getContext(), None);
01324   OldBasicBlock->getTerminator()->setMetadata(AlreadyVectorizedMDName, MD);
01325 
01326   // Some loops have a single integer induction variable, while other loops
01327   // don't. One example is c++ iterators that often have multiple pointer
01328   // induction variables. In the code below we also support a case where we
01329   // don't have a single induction variable.
01330   OldInduction = Legal->getInduction();
01331   Type *IdxTy = Legal->getWidestInductionType();
01332 
01333   // Find the loop boundaries.
01334   const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch());
01335   assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
01336 
01337   // Get the total trip count from the count by adding 1.
01338   ExitCount = SE->getAddExpr(ExitCount,
01339                              SE->getConstant(ExitCount->getType(), 1));
01340 
01341   // Expand the trip count and place the new instructions in the preheader.
01342   // Notice that the pre-header does not change, only the loop body.
01343   SCEVExpander Exp(*SE, "induction");
01344 
01345   // Count holds the overall loop count (N).
01346   Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
01347                                    BypassBlock->getTerminator());
01348 
01349   // The loop index does not have to start at Zero. Find the original start
01350   // value from the induction PHI node. If we don't have an induction variable
01351   // then we know that it starts at zero.
01352   Builder.SetInsertPoint(BypassBlock->getTerminator());
01353   Value *StartIdx = ExtendedIdx = OldInduction ?
01354     Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock),
01355                        IdxTy):
01356     ConstantInt::get(IdxTy, 0);
01357 
01358   assert(BypassBlock && "Invalid loop structure");
01359   LoopBypassBlocks.push_back(BypassBlock);
01360 
01361   // Split the single block loop into the two loop structure described above.
01362   BasicBlock *VectorPH =
01363   BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
01364   BasicBlock *VecBody =
01365   VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
01366   BasicBlock *MiddleBlock =
01367   VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
01368   BasicBlock *ScalarPH =
01369   MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
01370 
01371   // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
01372   // inside the loop.
01373   Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
01374 
01375   // Generate the induction variable.
01376   Induction = Builder.CreatePHI(IdxTy, 2, "index");
01377   // The loop step is equal to the vectorization factor (num of SIMD elements)
01378   // times the unroll factor (num of SIMD instructions).
01379   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
01380 
01381   // This is the IR builder that we use to add all of the logic for bypassing
01382   // the new vector loop.
01383   IRBuilder<> BypassBuilder(BypassBlock->getTerminator());
01384 
01385   // We may need to extend the index in case there is a type mismatch.
01386   // We know that the count starts at zero and does not overflow.
01387   if (Count->getType() != IdxTy) {
01388     // The exit count can be of pointer type. Convert it to the correct
01389     // integer type.
01390     if (ExitCount->getType()->isPointerTy())
01391       Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
01392     else
01393       Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
01394   }
01395 
01396   // Add the start index to the loop count to get the new end index.
01397   Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");
01398 
01399   // Now we need to generate the expression for N - (N % VF), which is
01400   // the part that the vectorized body will execute.
01401   Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
01402   Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
01403   Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
01404                                                      "end.idx.rnd.down");
01405 
01406   // Now, compare the new count to zero. If it is zero skip the vector loop and
01407   // jump to the scalar loop.
01408   Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx,
01409                                           "cmp.zero");
01410 
01411   BasicBlock *LastBypassBlock = BypassBlock;
01412 
01413   // Generate the code that checks in runtime if arrays overlap. We put the
01414   // checks into a separate block to make the more common case of few elements
01415   // faster.
01416   Instruction *MemRuntimeCheck = addRuntimeCheck(Legal,
01417                                                  BypassBlock->getTerminator());
01418   if (MemRuntimeCheck) {
01419     // Create a new block containing the memory check.
01420     BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemRuntimeCheck,
01421                                                           "vector.memcheck");
01422     LoopBypassBlocks.push_back(CheckBlock);
01423 
01424     // Replace the branch into the memory check block with a conditional branch
01425     // for the "few elements case".
01426     Instruction *OldTerm = BypassBlock->getTerminator();
01427     BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
01428     OldTerm->eraseFromParent();
01429 
01430     Cmp = MemRuntimeCheck;
01431     LastBypassBlock = CheckBlock;
01432   }
01433 
01434   LastBypassBlock->getTerminator()->eraseFromParent();
01435   BranchInst::Create(MiddleBlock, VectorPH, Cmp,
01436                      LastBypassBlock);
01437 
01438   // We are going to resume the execution of the scalar loop.
01439   // Go over all of the induction variables that we found and fix the
01440   // PHIs that are left in the scalar version of the loop.
01441   // The starting values of PHI nodes depend on the counter of the last
01442   // iteration in the vectorized loop.
01443   // If we come from a bypass edge then we need to start from the original
01444   // start value.
01445 
01446   // This variable saves the new starting index for the scalar loop.
01447   PHINode *ResumeIndex = 0;
01448   LoopVectorizationLegality::InductionList::iterator I, E;
01449   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
01450   // Set builder to point to last bypass block.
01451   BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
01452   for (I = List->begin(), E = List->end(); I != E; ++I) {
01453     PHINode *OrigPhi = I->first;
01454     LoopVectorizationLegality::InductionInfo II = I->second;
01455 
01456     Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType();
01457     PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val",
01458                                          MiddleBlock->getTerminator());
01459     // We might have extended the type of the induction variable but we need a
01460     // truncated version for the scalar loop.
01461     PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?
01462       PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
01463                       MiddleBlock->getTerminator()) : 0;
01464 
01465     Value *EndValue = 0;
01466     switch (II.IK) {
01467     case LoopVectorizationLegality::IK_NoInduction:
01468       llvm_unreachable("Unknown induction");
01469     case LoopVectorizationLegality::IK_IntInduction: {
01470       // Handle the integer induction counter.
01471       assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
01472 
01473       // We have the canonical induction variable.
01474       if (OrigPhi == OldInduction) {
01475         // Create a truncated version of the resume value for the scalar loop,
01476         // we might have promoted the type to a larger width.
01477         EndValue =
01478           BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
01479         // The new PHI merges the original incoming value, in case of a bypass,
01480         // or the value at the end of the vectorized loop.
01481         for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
01482           TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
01483         TruncResumeVal->addIncoming(EndValue, VecBody);
01484 
01485         // We know what the end value is.
01486         EndValue = IdxEndRoundDown;
01487         // We also know which PHI node holds it.
01488         ResumeIndex = ResumeVal;
01489         break;
01490       }
01491 
01492       // Not the canonical induction variable - add the vector loop count to the
01493       // start value.
01494       Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
01495                                                    II.StartValue->getType(),
01496                                                    "cast.crd");
01497       EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end");
01498       break;
01499     }
01500     case LoopVectorizationLegality::IK_ReverseIntInduction: {
01501       // Convert the CountRoundDown variable to the PHI size.
01502       Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
01503                                                    II.StartValue->getType(),
01504                                                    "cast.crd");
01505       // Handle reverse integer induction counter.
01506       EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end");
01507       break;
01508     }
01509     case LoopVectorizationLegality::IK_PtrInduction: {
01510       // For pointer induction variables, calculate the offset using
01511       // the end index.
01512       EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown,
01513                                          "ptr.ind.end");
01514       break;
01515     }
01516     case LoopVectorizationLegality::IK_ReversePtrInduction: {
01517       // The value at the end of the loop for the reverse pointer is calculated
01518       // by creating a GEP with a negative index starting from the start value.
01519       Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0);
01520       Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown,
01521                                               "rev.ind.end");
01522       EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx,
01523                                          "rev.ptr.ind.end");
01524       break;
01525     }
01526     }// end of case
01527 
01528     // The new PHI merges the original incoming value, in case of a bypass,
01529     // or the value at the end of the vectorized loop.
01530     for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) {
01531       if (OrigPhi == OldInduction)
01532         ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
01533       else
01534         ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
01535     }
01536     ResumeVal->addIncoming(EndValue, VecBody);
01537 
01538     // Fix the scalar body counter (PHI node).
01539     unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
01540     // The old inductions phi node in the scalar body needs the truncated value.
01541     if (OrigPhi == OldInduction)
01542       OrigPhi->setIncomingValue(BlockIdx, TruncResumeVal);
01543     else
01544       OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
01545   }
01546 
01547   // If we are generating a new induction variable then we also need to
01548   // generate the code that calculates the exit value. This value is not
01549   // simply the end of the counter because we may skip the vectorized body
01550   // in case of a runtime check.
01551   if (!OldInduction){
01552     assert(!ResumeIndex && "Unexpected resume value found");
01553     ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
01554                                   MiddleBlock->getTerminator());
01555     for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
01556       ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
01557     ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
01558   }
01559 
01560   // Make sure that we found the index where scalar loop needs to continue.
01561   assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
01562          "Invalid resume Index");
01563 
01564   // Add a check in the middle block to see if we have completed
01565   // all of the iterations in the first vector loop.
01566   // If (N - N%VF) == N, then we *don't* need to run the remainder.
01567   Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
01568                                 ResumeIndex, "cmp.n",
01569                                 MiddleBlock->getTerminator());
01570 
01571   BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
01572   // Remove the old terminator.
01573   MiddleBlock->getTerminator()->eraseFromParent();
01574 
01575   // Create i+1 and fill the PHINode.
01576   Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
01577   Induction->addIncoming(StartIdx, VectorPH);
01578   Induction->addIncoming(NextIdx, VecBody);
01579   // Create the compare.
01580   Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
01581   Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
01582 
01583   // Now we have two terminators. Remove the old one from the block.
01584   VecBody->getTerminator()->eraseFromParent();
01585 
01586   // Get ready to start creating new instructions into the vectorized body.
01587   Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
01588 
01589   // Create and register the new vector loop.
01590   Loop* Lp = new Loop();
01591   Loop *ParentLoop = OrigLoop->getParentLoop();
01592 
01593   // Insert the new loop into the loop nest and register the new basic blocks.
01594   if (ParentLoop) {
01595     ParentLoop->addChildLoop(Lp);
01596     for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
01597       ParentLoop->addBasicBlockToLoop(LoopBypassBlocks[I], LI->getBase());
01598     ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
01599     ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
01600     ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
01601   } else {
01602     LI->addTopLevelLoop(Lp);
01603   }
01604 
01605   Lp->addBasicBlockToLoop(VecBody, LI->getBase());
01606 
01607   // Save the state.
01608   LoopVectorPreHeader = VectorPH;
01609   LoopScalarPreHeader = ScalarPH;
01610   LoopMiddleBlock = MiddleBlock;
01611   LoopExitBlock = ExitBlock;
01612   LoopVectorBody = VecBody;
01613   LoopScalarBody = OldBasicBlock;
01614 }
01615 
01616 /// This function returns the identity element (or neutral element) for
01617 /// the operation K.
01618 Constant*
01619 LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) {
01620   switch (K) {
01621   case RK_IntegerXor:
01622   case RK_IntegerAdd:
01623   case RK_IntegerOr:
01624     // Adding, Xoring, Oring zero to a number does not change it.
01625     return ConstantInt::get(Tp, 0);
01626   case RK_IntegerMult:
01627     // Multiplying a number by 1 does not change it.
01628     return ConstantInt::get(Tp, 1);
01629   case RK_IntegerAnd:
01630     // AND-ing a number with an all-1 value does not change it.
01631     return ConstantInt::get(Tp, -1, true);
01632   case  RK_FloatMult:
01633     // Multiplying a number by 1 does not change it.
01634     return ConstantFP::get(Tp, 1.0L);
01635   case  RK_FloatAdd:
01636     // Adding zero to a number does not change it.
01637     return ConstantFP::get(Tp, 0.0L);
01638   default:
01639     llvm_unreachable("Unknown reduction kind");
01640   }
01641 }
01642 
01643 static Intrinsic::ID
01644 getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
01645   // If we have an intrinsic call, check if it is trivially vectorizable.
01646   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
01647     switch (II->getIntrinsicID()) {
01648     case Intrinsic::sqrt:
01649     case Intrinsic::sin:
01650     case Intrinsic::cos:
01651     case Intrinsic::exp:
01652     case Intrinsic::exp2:
01653     case Intrinsic::log:
01654     case Intrinsic::log10:
01655     case Intrinsic::log2:
01656     case Intrinsic::fabs:
01657     case Intrinsic::floor:
01658     case Intrinsic::ceil:
01659     case Intrinsic::trunc:
01660     case Intrinsic::rint:
01661     case Intrinsic::nearbyint:
01662     case Intrinsic::pow:
01663     case Intrinsic::fma:
01664     case Intrinsic::fmuladd:
01665       return II->getIntrinsicID();
01666     default:
01667       return Intrinsic::not_intrinsic;
01668     }
01669   }
01670 
01671   if (!TLI)
01672     return Intrinsic::not_intrinsic;
01673 
01674   LibFunc::Func Func;
01675   Function *F = CI->getCalledFunction();
01676   // We're going to make assumptions on the semantics of the functions, check
01677   // that the target knows that it's available in this environment.
01678   if (!F || !TLI->getLibFunc(F->getName(), Func))
01679     return Intrinsic::not_intrinsic;
01680 
01681   // Otherwise check if we have a call to a function that can be turned into a
01682   // vector intrinsic.
01683   switch (Func) {
01684   default:
01685     break;
01686   case LibFunc::sin:
01687   case LibFunc::sinf:
01688   case LibFunc::sinl:
01689     return Intrinsic::sin;
01690   case LibFunc::cos:
01691   case LibFunc::cosf:
01692   case LibFunc::cosl:
01693     return Intrinsic::cos;
01694   case LibFunc::exp:
01695   case LibFunc::expf:
01696   case LibFunc::expl:
01697     return Intrinsic::exp;
01698   case LibFunc::exp2:
01699   case LibFunc::exp2f:
01700   case LibFunc::exp2l:
01701     return Intrinsic::exp2;
01702   case LibFunc::log:
01703   case LibFunc::logf:
01704   case LibFunc::logl:
01705     return Intrinsic::log;
01706   case LibFunc::log10:
01707   case LibFunc::log10f:
01708   case LibFunc::log10l:
01709     return Intrinsic::log10;
01710   case LibFunc::log2:
01711   case LibFunc::log2f:
01712   case LibFunc::log2l:
01713     return Intrinsic::log2;
01714   case LibFunc::fabs:
01715   case LibFunc::fabsf:
01716   case LibFunc::fabsl:
01717     return Intrinsic::fabs;
01718   case LibFunc::floor:
01719   case LibFunc::floorf:
01720   case LibFunc::floorl:
01721     return Intrinsic::floor;
01722   case LibFunc::ceil:
01723   case LibFunc::ceilf:
01724   case LibFunc::ceill:
01725     return Intrinsic::ceil;
01726   case LibFunc::trunc:
01727   case LibFunc::truncf:
01728   case LibFunc::truncl:
01729     return Intrinsic::trunc;
01730   case LibFunc::rint:
01731   case LibFunc::rintf:
01732   case LibFunc::rintl:
01733     return Intrinsic::rint;
01734   case LibFunc::nearbyint:
01735   case LibFunc::nearbyintf:
01736   case LibFunc::nearbyintl:
01737     return Intrinsic::nearbyint;
01738   case LibFunc::pow:
01739   case LibFunc::powf:
01740   case LibFunc::powl:
01741     return Intrinsic::pow;
01742   }
01743 
01744   return Intrinsic::not_intrinsic;
01745 }
01746 
01747 /// This function translates the reduction kind to an LLVM binary operator.
01748 static unsigned
01749 getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
01750   switch (Kind) {
01751     case LoopVectorizationLegality::RK_IntegerAdd:
01752       return Instruction::Add;
01753     case LoopVectorizationLegality::RK_IntegerMult:
01754       return Instruction::Mul;
01755     case LoopVectorizationLegality::RK_IntegerOr:
01756       return Instruction::Or;
01757     case LoopVectorizationLegality::RK_IntegerAnd:
01758       return Instruction::And;
01759     case LoopVectorizationLegality::RK_IntegerXor:
01760       return Instruction::Xor;
01761     case LoopVectorizationLegality::RK_FloatMult:
01762       return Instruction::FMul;
01763     case LoopVectorizationLegality::RK_FloatAdd:
01764       return Instruction::FAdd;
01765     case LoopVectorizationLegality::RK_IntegerMinMax:
01766       return Instruction::ICmp;
01767     case LoopVectorizationLegality::RK_FloatMinMax:
01768       return Instruction::FCmp;
01769     default:
01770       llvm_unreachable("Unknown reduction operation");
01771   }
01772 }
01773 
01774 Value *createMinMaxOp(IRBuilder<> &Builder,
01775                       LoopVectorizationLegality::MinMaxReductionKind RK,
01776                       Value *Left,
01777                       Value *Right) {
01778   CmpInst::Predicate P = CmpInst::ICMP_NE;
01779   switch (RK) {
01780   default:
01781     llvm_unreachable("Unknown min/max reduction kind");
01782   case LoopVectorizationLegality::MRK_UIntMin:
01783     P = CmpInst::ICMP_ULT;
01784     break;
01785   case LoopVectorizationLegality::MRK_UIntMax:
01786     P = CmpInst::ICMP_UGT;
01787     break;
01788   case LoopVectorizationLegality::MRK_SIntMin:
01789     P = CmpInst::ICMP_SLT;
01790     break;
01791   case LoopVectorizationLegality::MRK_SIntMax:
01792     P = CmpInst::ICMP_SGT;
01793     break;
01794   case LoopVectorizationLegality::MRK_FloatMin:
01795     P = CmpInst::FCMP_OLT;
01796     break;
01797   case LoopVectorizationLegality::MRK_FloatMax:
01798     P = CmpInst::FCMP_OGT;
01799     break;
01800   }
01801 
01802   Value *Cmp;
01803   if (RK == LoopVectorizationLegality::MRK_FloatMin || RK == LoopVectorizationLegality::MRK_FloatMax)
01804     Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp");
01805   else
01806     Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");
01807 
01808   Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
01809   return Select;
01810 }
01811 
01812 void
01813 InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
01814   //===------------------------------------------------===//
01815   //
01816   // Notice: any optimization or new instruction that go
01817   // into the code below should be also be implemented in
01818   // the cost-model.
01819   //
01820   //===------------------------------------------------===//
01821   Constant *Zero = Builder.getInt32(0);
01822 
01823   // In order to support reduction variables we need to be able to vectorize
01824   // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
01825   // stages. First, we create a new vector PHI node with no incoming edges.
01826   // We use this value when we vectorize all of the instructions that use the
01827   // PHI. Next, after all of the instructions in the block are complete we
01828   // add the new incoming edges to the PHI. At this point all of the
01829   // instructions in the basic block are vectorized, so we can use them to
01830   // construct the PHI.
01831   PhiVector RdxPHIsToFix;
01832 
01833   // Scan the loop in a topological order to ensure that defs are vectorized
01834   // before users.
01835   LoopBlocksDFS DFS(OrigLoop);
01836   DFS.perform(LI);
01837 
01838   // Vectorize all of the blocks in the original loop.
01839   for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
01840        be = DFS.endRPO(); bb != be; ++bb)
01841     vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix);
01842 
01843   // At this point every instruction in the original loop is widened to
01844   // a vector form. We are almost done. Now, we need to fix the PHI nodes
01845   // that we vectorized. The PHI nodes are currently empty because we did
01846   // not want to introduce cycles. Notice that the remaining PHI nodes
01847   // that we need to fix are reduction variables.
01848 
01849   // Create the 'reduced' values for each of the induction vars.
01850   // The reduced values are the vector values that we scalarize and combine
01851   // after the loop is finished.
01852   for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
01853        it != e; ++it) {
01854     PHINode *RdxPhi = *it;
01855     assert(RdxPhi && "Unable to recover vectorized PHI");
01856 
01857     // Find the reduction variable descriptor.
01858     assert(Legal->getReductionVars()->count(RdxPhi) &&
01859            "Unable to find the reduction variable");
01860     LoopVectorizationLegality::ReductionDescriptor RdxDesc =
01861     (*Legal->getReductionVars())[RdxPhi];
01862 
01863     // We need to generate a reduction vector from the incoming scalar.
01864     // To do so, we need to generate the 'identity' vector and overide
01865     // one of the elements with the incoming scalar reduction. We need
01866     // to do it in the vector-loop preheader.
01867     Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());
01868 
01869     // This is the vector-clone of the value that leaves the loop.
01870     VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
01871     Type *VecTy = VectorExit[0]->getType();
01872 
01873     // Find the reduction identity variable. Zero for addition, or, xor,
01874     // one for multiplication, -1 for And.
01875     Value *Identity;
01876     Value *VectorStart;
01877     if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax ||
01878         RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) {
01879       // MinMax reduction have the start value as their identify.
01880       VectorStart = Identity = Builder.CreateVectorSplat(VF, RdxDesc.StartValue,
01881                                                          "minmax.ident");
01882     } else {
01883       Constant *Iden =
01884         LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
01885                                                         VecTy->getScalarType());
01886       Identity = ConstantVector::getSplat(VF, Iden);
01887 
01888       // This vector is the Identity vector where the first element is the
01889       // incoming scalar reduction.
01890       VectorStart = Builder.CreateInsertElement(Identity,
01891                                                 RdxDesc.StartValue, Zero);
01892     }
01893 
01894     // Fix the vector-loop phi.
01895     // We created the induction variable so we know that the
01896     // preheader is the first entry.
01897     BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
01898 
01899     // Reductions do not have to start at zero. They can start with
01900     // any loop invariant values.
01901     VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
01902     BasicBlock *Latch = OrigLoop->getLoopLatch();
01903     Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
01904     VectorParts &Val = getVectorValue(LoopVal);
01905     for (unsigned part = 0; part < UF; ++part) {
01906       // Make sure to add the reduction stat value only to the 
01907       // first unroll part.
01908       Value *StartVal = (part == 0) ? VectorStart : Identity;
01909       cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
01910       cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody);
01911     }
01912 
01913     // Before each round, move the insertion point right between
01914     // the PHIs and the values we are going to write.
01915     // This allows us to write both PHINodes and the extractelement
01916     // instructions.
01917     Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
01918 
01919     VectorParts RdxParts;
01920     for (unsigned part = 0; part < UF; ++part) {
01921       // This PHINode contains the vectorized reduction variable, or
01922       // the initial value vector, if we bypass the vector loop.
01923       VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
01924       PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
01925       Value *StartVal = (part == 0) ? VectorStart : Identity;
01926       for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
01927         NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
01928       NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
01929       RdxParts.push_back(NewPhi);
01930     }
01931 
01932     // Reduce all of the unrolled parts into a single vector.
01933     Value *ReducedPartRdx = RdxParts[0];
01934     unsigned Op = getReductionBinOp(RdxDesc.Kind);
01935     for (unsigned part = 1; part < UF; ++part) {
01936       if (Op != Instruction::ICmp && Op != Instruction::FCmp)
01937         ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op,
01938                                              RdxParts[part], ReducedPartRdx,
01939                                              "bin.rdx");
01940       else
01941         ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind,
01942                                         ReducedPartRdx, RdxParts[part]);
01943     }
01944 
01945     // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
01946     // and vector ops, reducing the set of values being computed by half each
01947     // round.
01948     assert(isPowerOf2_32(VF) &&
01949            "Reduction emission only supported for pow2 vectors!");
01950     Value *TmpVec = ReducedPartRdx;
01951     SmallVector<Constant*, 32> ShuffleMask(VF, 0);
01952     for (unsigned i = VF; i != 1; i >>= 1) {
01953       // Move the upper half of the vector to the lower half.
01954       for (unsigned j = 0; j != i/2; ++j)
01955         ShuffleMask[j] = Builder.getInt32(i/2 + j);
01956 
01957       // Fill the rest of the mask with undef.
01958       std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
01959                 UndefValue::get(Builder.getInt32Ty()));
01960 
01961       Value *Shuf =
01962         Builder.CreateShuffleVector(TmpVec,
01963                                     UndefValue::get(TmpVec->getType()),
01964                                     ConstantVector::get(ShuffleMask),
01965                                     "rdx.shuf");
01966 
01967       if (Op != Instruction::ICmp && Op != Instruction::FCmp)
01968         TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
01969                                      "bin.rdx");
01970       else
01971         TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
01972     }
01973 
01974     // The result is in the first element of the vector.
01975     Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
01976 
01977     // Now, we need to fix the users of the reduction variable
01978     // inside and outside of the scalar remainder loop.
01979     // We know that the loop is in LCSSA form. We need to update the
01980     // PHI nodes in the exit blocks.
01981     for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
01982          LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
01983       PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
01984       if (!LCSSAPhi) continue;
01985 
01986       // All PHINodes need to have a single entry edge, or two if
01987       // we already fixed them.
01988       assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
01989 
01990       // We found our reduction value exit-PHI. Update it with the
01991       // incoming bypass edge.
01992       if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
01993         // Add an edge coming from the bypass.
01994         LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
01995         break;
01996       }
01997     }// end of the LCSSA phi scan.
01998 
01999     // Fix the scalar loop reduction variable with the incoming reduction sum
02000     // from the vector body and from the backedge value.
02001     int IncomingEdgeBlockIdx =
02002     (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
02003     assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
02004     // Pick the other block.
02005     int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
02006     (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
02007     (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
02008   }// end of for each redux variable.
02009 
02010   // The Loop exit block may have single value PHI nodes where the incoming
02011   // value is 'undef'. While vectorizing we only handled real values that
02012   // were defined inside the loop. Here we handle the 'undef case'.
02013   // See PR14725.
02014   for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
02015        LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
02016     PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
02017     if (!LCSSAPhi) continue;
02018     if (LCSSAPhi->getNumIncomingValues() == 1)
02019       LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
02020                             LoopMiddleBlock);
02021   }
02022 }
02023 
02024 InnerLoopVectorizer::VectorParts
02025 InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
02026   assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
02027          "Invalid edge");
02028 
02029   VectorParts SrcMask = createBlockInMask(Src);
02030 
02031   // The terminator has to be a branch inst!
02032   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
02033   assert(BI && "Unexpected terminator found");
02034 
02035   if (BI->isConditional()) {
02036     VectorParts EdgeMask = getVectorValue(BI->getCondition());
02037 
02038     if (BI->getSuccessor(0) != Dst)
02039       for (unsigned part = 0; part < UF; ++part)
02040         EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
02041 
02042     for (unsigned part = 0; part < UF; ++part)
02043       EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
02044     return EdgeMask;
02045   }
02046 
02047   return SrcMask;
02048 }
02049 
02050 InnerLoopVectorizer::VectorParts
02051 InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
02052   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
02053 
02054   // Loop incoming mask is all-one.
02055   if (OrigLoop->getHeader() == BB) {
02056     Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
02057     return getVectorValue(C);
02058   }
02059 
02060   // This is the block mask. We OR all incoming edges, and with zero.
02061   Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
02062   VectorParts BlockMask = getVectorValue(Zero);
02063 
02064   // For each pred:
02065   for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
02066     VectorParts EM = createEdgeMask(*it, BB);
02067     for (unsigned part = 0; part < UF; ++part)
02068       BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
02069   }
02070 
02071   return BlockMask;
02072 }
02073 
02074 void
02075 InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
02076                                           BasicBlock *BB, PhiVector *PV) {
02077   // For each instruction in the old loop.
02078   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
02079     VectorParts &Entry = WidenMap.get(it);
02080     switch (it->getOpcode()) {
02081     case Instruction::Br:
02082       // Nothing to do for PHIs and BR, since we already took care of the
02083       // loop control flow instructions.
02084       continue;
02085     case Instruction::PHI:{
02086       PHINode* P = cast<PHINode>(it);
02087       // Handle reduction variables:
02088       if (Legal->getReductionVars()->count(P)) {
02089         for (unsigned part = 0; part < UF; ++part) {
02090           // This is phase one of vectorizing PHIs.
02091           Type *VecTy = VectorType::get(it->getType(), VF);
02092           Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
02093                                         LoopVectorBody-> getFirstInsertionPt());
02094         }
02095         PV->push_back(P);
02096         continue;
02097       }
02098 
02099       // Check for PHI nodes that are lowered to vector selects.
02100       if (P->getParent() != OrigLoop->getHeader()) {
02101         // We know that all PHIs in non header blocks are converted into
02102         // selects, so we don't have to worry about the insertion order and we
02103         // can just use the builder.
02104         // At this point we generate the predication tree. There may be
02105         // duplications since this is a simple recursive scan, but future
02106         // optimizations will clean it up.
02107 
02108         unsigned NumIncoming = P->getNumIncomingValues();
02109 
02110         // Generate a sequence of selects of the form:
02111         // SELECT(Mask3, In3,
02112         //      SELECT(Mask2, In2,
02113         //                   ( ...)))
02114         for (unsigned In = 0; In < NumIncoming; In++) {
02115           VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
02116                                             P->getParent());
02117           VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
02118 
02119           for (unsigned part = 0; part < UF; ++part) {
02120             // We might have single edge PHIs (blocks) - use an identity
02121             // 'select' for the first PHI operand.
02122             if (In == 0)
02123               Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
02124                                                  In0[part]);
02125             else
02126               // Select between the current value and the previous incoming edge
02127               // based on the incoming mask.
02128               Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
02129                                                  Entry[part], "predphi");
02130           }
02131         }
02132         continue;
02133       }
02134 
02135       // This PHINode must be an induction variable.
02136       // Make sure that we know about it.
02137       assert(Legal->getInductionVars()->count(P) &&
02138              "Not an induction variable");
02139 
02140       LoopVectorizationLegality::InductionInfo II =
02141         Legal->getInductionVars()->lookup(P);
02142 
02143       switch (II.IK) {
02144       case LoopVectorizationLegality::IK_NoInduction:
02145         llvm_unreachable("Unknown induction");
02146       case LoopVectorizationLegality::IK_IntInduction: {
02147         assert(P->getType() == II.StartValue->getType() && "Types must match");
02148         Type *PhiTy = P->getType();
02149         Value *Broadcasted;
02150         if (P == OldInduction) {
02151           // Handle the canonical induction variable. We might have had to
02152           // extend the type.
02153           Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
02154         } else {
02155           // Handle other induction variables that are now based on the
02156           // canonical one.
02157           Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
02158                                                    "normalized.idx");
02159           NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
02160           Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx,
02161                                           "offset.idx");
02162         }
02163         Broadcasted = getBroadcastInstrs(Broadcasted);
02164         // After broadcasting the induction variable we need to make the vector
02165         // consecutive by adding 0, 1, 2, etc.
02166         for (unsigned part = 0; part < UF; ++part)
02167           Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
02168         continue;
02169       }
02170       case LoopVectorizationLegality::IK_ReverseIntInduction:
02171       case LoopVectorizationLegality::IK_PtrInduction:
02172       case LoopVectorizationLegality::IK_ReversePtrInduction:
02173         // Handle reverse integer and pointer inductions.
02174         Value *StartIdx = ExtendedIdx;
02175         // This is the normalized GEP that starts counting at zero.
02176         Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
02177                                                  "normalized.idx");
02178 
02179         // Handle the reverse integer induction variable case.
02180         if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
02181           IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
02182           Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
02183                                                  "resize.norm.idx");
02184           Value *ReverseInd  = Builder.CreateSub(II.StartValue, CNI,
02185                                                  "reverse.idx");
02186 
02187           // This is a new value so do not hoist it out.
02188           Value *Broadcasted = getBroadcastInstrs(ReverseInd);
02189           // After broadcasting the induction variable we need to make the
02190           // vector consecutive by adding  ... -3, -2, -1, 0.
02191           for (unsigned part = 0; part < UF; ++part)
02192             Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
02193                                                true);
02194           continue;
02195         }
02196 
02197         // Handle the pointer induction variable case.
02198         assert(P->getType()->isPointerTy() && "Unexpected type.");
02199 
02200         // Is this a reverse induction ptr or a consecutive induction ptr.
02201         bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
02202                         II.IK);
02203 
02204         // This is the vector of results. Notice that we don't generate
02205         // vector geps because scalar geps result in better code.
02206         for (unsigned part = 0; part < UF; ++part) {
02207           Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
02208           for (unsigned int i = 0; i < VF; ++i) {
02209             int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
02210             Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
02211             Value *GlobalIdx;
02212             if (!Reverse)
02213               GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
02214             else
02215               GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
02216 
02217             Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
02218                                                "next.gep");
02219             VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
02220                                                  Builder.getInt32(i),
02221                                                  "insert.gep");
02222           }
02223           Entry[part] = VecVal;
02224         }
02225         continue;
02226       }
02227 
02228     }// End of PHI.
02229 
02230     case Instruction::Add:
02231     case Instruction::FAdd:
02232     case Instruction::Sub:
02233     case Instruction::FSub:
02234     case Instruction::Mul:
02235     case Instruction::FMul:
02236     case Instruction::UDiv:
02237     case Instruction::SDiv:
02238     case Instruction::FDiv:
02239     case Instruction::URem:
02240     case Instruction::SRem:
02241     case Instruction::FRem:
02242     case Instruction::Shl:
02243     case Instruction::LShr:
02244     case Instruction::AShr:
02245     case Instruction::And:
02246     case Instruction::Or:
02247     case Instruction::Xor: {
02248       // Just widen binops.
02249       BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
02250       VectorParts &A = getVectorValue(it->getOperand(0));
02251       VectorParts &B = getVectorValue(it->getOperand(1));
02252 
02253       // Use this vector value for all users of the original instruction.
02254       for (unsigned Part = 0; Part < UF; ++Part) {
02255         Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
02256 
02257         // Update the NSW, NUW and Exact flags. Notice: V can be an Undef.
02258         BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V);
02259         if (VecOp && isa<OverflowingBinaryOperator>(BinOp)) {
02260           VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
02261           VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
02262         }
02263         if (VecOp && isa<PossiblyExactOperator>(VecOp))
02264           VecOp->setIsExact(BinOp->isExact());
02265 
02266         Entry[Part] = V;
02267       }
02268       break;
02269     }
02270     case Instruction::Select: {
02271       // Widen selects.
02272       // If the selector is loop invariant we can create a select
02273       // instruction with a scalar condition. Otherwise, use vector-select.
02274       bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
02275                                                OrigLoop);
02276 
02277       // The condition can be loop invariant  but still defined inside the
02278       // loop. This means that we can't just use the original 'cond' value.
02279       // We have to take the 'vectorized' value and pick the first lane.
02280       // Instcombine will make this a no-op.
02281       VectorParts &Cond = getVectorValue(it->getOperand(0));
02282       VectorParts &Op0  = getVectorValue(it->getOperand(1));
02283       VectorParts &Op1  = getVectorValue(it->getOperand(2));
02284       Value *ScalarCond = Builder.CreateExtractElement(Cond[0],
02285                                                        Builder.getInt32(0));
02286       for (unsigned Part = 0; Part < UF; ++Part) {
02287         Entry[Part] = Builder.CreateSelect(
02288           InvariantCond ? ScalarCond : Cond[Part],
02289           Op0[Part],
02290           Op1[Part]);
02291       }
02292       break;
02293     }
02294 
02295     case Instruction::ICmp:
02296     case Instruction::FCmp: {
02297       // Widen compares. Generate vector compares.
02298       bool FCmp = (it->getOpcode() == Instruction::FCmp);
02299       CmpInst *Cmp = dyn_cast<CmpInst>(it);
02300       VectorParts &A = getVectorValue(it->getOperand(0));
02301       VectorParts &B = getVectorValue(it->getOperand(1));
02302       for (unsigned Part = 0; Part < UF; ++Part) {
02303         Value *C = 0;
02304         if (FCmp)
02305           C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
02306         else
02307           C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
02308         Entry[Part] = C;
02309       }
02310       break;
02311     }
02312 
02313     case Instruction::Store:
02314     case Instruction::Load:
02315         vectorizeMemoryInstruction(it, Legal);
02316         break;
02317     case Instruction::ZExt:
02318     case Instruction::SExt:
02319     case Instruction::FPToUI:
02320     case Instruction::FPToSI:
02321     case Instruction::FPExt:
02322     case Instruction::PtrToInt:
02323     case Instruction::IntToPtr:
02324     case Instruction::SIToFP:
02325     case Instruction::UIToFP:
02326     case Instruction::Trunc:
02327     case Instruction::FPTrunc:
02328     case Instruction::BitCast: {
02329       CastInst *CI = dyn_cast<CastInst>(it);
02330       /// Optimize the special case where the source is the induction
02331       /// variable. Notice that we can only optimize the 'trunc' case
02332       /// because: a. FP conversions lose precision, b. sext/zext may wrap,
02333       /// c. other casts depend on pointer size.
02334       if (CI->getOperand(0) == OldInduction &&
02335           it->getOpcode() == Instruction::Trunc) {
02336         Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
02337                                                CI->getType());
02338         Value *Broadcasted = getBroadcastInstrs(ScalarCast);
02339         for (unsigned Part = 0; Part < UF; ++Part)
02340           Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false);
02341         break;
02342       }
02343       /// Vectorize casts.
02344       Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
02345 
02346       VectorParts &A = getVectorValue(it->getOperand(0));
02347       for (unsigned Part = 0; Part < UF; ++Part)
02348         Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
02349       break;
02350     }
02351 
02352     case Instruction::Call: {
02353       // Ignore dbg intrinsics.
02354       if (isa<DbgInfoIntrinsic>(it))
02355         break;
02356 
02357       Module *M = BB->getParent()->getParent();
02358       CallInst *CI = cast<CallInst>(it);
02359       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
02360       assert(ID && "Not an intrinsic call!");
02361       for (unsigned Part = 0; Part < UF; ++Part) {
02362         SmallVector<Value*, 4> Args;
02363         for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
02364           VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
02365           Args.push_back(Arg[Part]);
02366         }
02367         Type *Tys[] = { VectorType::get(CI->getType()->getScalarType(), VF) };
02368         Function *F = Intrinsic::getDeclaration(M, ID, Tys);
02369         Entry[Part] = Builder.CreateCall(F, Args);
02370       }
02371       break;
02372     }
02373 
02374     default:
02375       // All other instructions are unsupported. Scalarize them.
02376       scalarizeInstruction(it);
02377       break;
02378     }// end of switch.
02379   }// end of for_each instr.
02380 }
02381 
02382 void InnerLoopVectorizer::updateAnalysis() {
02383   // Forget the original basic block.
02384   SE->forgetLoop(OrigLoop);
02385 
02386   // Update the dominator tree information.
02387   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
02388          "Entry does not dominate exit.");
02389 
02390   for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
02391     DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
02392   DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
02393   DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
02394   DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
02395   DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
02396   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
02397   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
02398 
02399   DEBUG(DT->verifyAnalysis());
02400 }
02401 
02402 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
02403   if (!EnableIfConversion)
02404     return false;
02405 
02406   assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
02407   std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();
02408 
02409   // Collect the blocks that need predication.
02410   for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
02411     BasicBlock *BB = LoopBlocks[i];
02412 
02413     // We don't support switch statements inside loops.
02414     if (!isa<BranchInst>(BB->getTerminator()))
02415       return false;
02416 
02417     // We must be able to predicate all blocks that need to be predicated.
02418     if (blockNeedsPredication(BB) && !blockCanBePredicated(BB))
02419       return false;
02420   }
02421 
02422   // Check that we can actually speculate the hoistable loads.
02423   if (!LoadSpeculation.canHoistAllLoads())
02424     return false;
02425 
02426   // We can if-convert this loop.
02427   return true;
02428 }
02429 
02430 bool LoopVectorizationLegality::canVectorize() {
02431   assert(TheLoop->getLoopPreheader() && "No preheader!!");
02432 
02433   // We can only vectorize innermost loops.
02434   if (TheLoop->getSubLoopsVector().size())
02435     return false;
02436 
02437   // We must have a single backedge.
02438   if (TheLoop->getNumBackEdges() != 1)
02439     return false;
02440 
02441   // We must have a single exiting block.
02442   if (!TheLoop->getExitingBlock())
02443     return false;
02444 
02445   unsigned NumBlocks = TheLoop->getNumBlocks();
02446 
02447   // Check if we can if-convert non single-bb loops.
02448   if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
02449     DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
02450     return false;
02451   }
02452 
02453   // We need to have a loop header.
02454   BasicBlock *Latch = TheLoop->getLoopLatch();
02455   DEBUG(dbgs() << "LV: Found a loop: " <<
02456         TheLoop->getHeader()->getName() << "\n");
02457 
02458   // ScalarEvolution needs to be able to find the exit count.
02459   const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch);
02460   if (ExitCount == SE->getCouldNotCompute()) {
02461     DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
02462     return false;
02463   }
02464 
02465   // Do not loop-vectorize loops with a tiny trip count.
02466   unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
02467   if (TC > 0u && TC < TinyTripCountVectorThreshold) {
02468     DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
02469           "This loop is not worth vectorizing.\n");
02470     return false;
02471   }
02472 
02473   // Check if we can vectorize the instructions and CFG in this loop.
02474   if (!canVectorizeInstrs()) {
02475     DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
02476     return false;
02477   }
02478 
02479   // Go over each instruction and look at memory deps.
02480   if (!canVectorizeMemory()) {
02481     DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
02482     return false;
02483   }
02484 
02485   // Collect all of the variables that remain uniform after vectorization.
02486   collectLoopUniforms();
02487 
02488   DEBUG(dbgs() << "LV: We can vectorize this loop" <<
02489         (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
02490         <<"!\n");
02491 
02492   // Okay! We can vectorize. At this point we don't have any other mem analysis
02493   // which may limit our maximum vectorization factor, so just return true with
02494   // no restrictions.
02495   return true;
02496 }
02497 
02498 static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) {
02499   if (Ty->isPointerTy())
02500     return DL.getIntPtrType(Ty->getContext());
02501   return Ty;
02502 }
02503 
02504 static Type* getWiderType(DataLayout &DL, Type *Ty0, Type *Ty1) {
02505   Ty0 = convertPointerToIntegerType(DL, Ty0);
02506   Ty1 = convertPointerToIntegerType(DL, Ty1);
02507   if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
02508     return Ty0;
02509   return Ty1;
02510 }
02511 
02512 bool LoopVectorizationLegality::canVectorizeInstrs() {
02513   BasicBlock *PreHeader = TheLoop->getLoopPreheader();
02514   BasicBlock *Header = TheLoop->getHeader();
02515 
02516   // If we marked the scalar loop as "already vectorized" then no need
02517   // to vectorize it again.
02518   if (Header->getTerminator()->getMetadata(AlreadyVectorizedMDName)) {
02519     DEBUG(dbgs() << "LV: This loop was vectorized before\n");
02520     return false;
02521   }
02522 
02523   // Look for the attribute signaling the absence of NaNs.
02524   Function &F = *Header->getParent();
02525   if (F.hasFnAttribute("no-nans-fp-math"))
02526     HasFunNoNaNAttr = F.getAttributes().getAttribute(
02527       AttributeSet::FunctionIndex,
02528       "no-nans-fp-math").getValueAsString() == "true";
02529 
02530   // For each block in the loop.
02531   for (Loop::block_iterator bb = TheLoop->block_begin(),
02532        be = TheLoop->block_end(); bb != be; ++bb) {
02533 
02534     // Scan the instructions in the block and look for hazards.
02535     for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
02536          ++it) {
02537 
02538       if (PHINode *Phi = dyn_cast<PHINode>(it)) {
02539         Type *PhiTy = Phi->getType();
02540         // Check that this PHI type is allowed.
02541         if (!PhiTy->isIntegerTy() &&
02542             !PhiTy->isFloatingPointTy() &&
02543             !PhiTy->isPointerTy()) {
02544           DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
02545           return false;
02546         }
02547 
02548         // If this PHINode is not in the header block, then we know that we
02549         // can convert it to select during if-conversion. No need to check if
02550         // the PHIs in this block are induction or reduction variables.
02551         if (*bb != Header)
02552           continue;
02553 
02554         // We only allow if-converted PHIs with more than two incoming values.
02555         if (Phi->getNumIncomingValues() != 2) {
02556           DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
02557           return false;
02558         }
02559 
02560         // This is the value coming from the preheader.
02561         Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
02562         // Check if this is an induction variable.
02563         InductionKind IK = isInductionVariable(Phi);
02564 
02565         if (IK_NoInduction != IK) {
02566           // Get the widest type.
02567           if (!WidestIndTy)
02568             WidestIndTy = convertPointerToIntegerType(*DL, PhiTy);
02569           else
02570             WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy);
02571 
02572           // Int inductions are special because we only allow one IV.
02573           if (IK == IK_IntInduction) {
02574             // Use the phi node with the widest type as induction. Use the last
02575             // one if there are multiple (no good reason for doing this other
02576             // than it is expedient).
02577             if (!Induction || PhiTy == WidestIndTy)
02578               Induction = Phi;
02579           }
02580 
02581           DEBUG(dbgs() << "LV: Found an induction variable.\n");
02582           Inductions[Phi] = InductionInfo(StartValue, IK);
02583           continue;
02584         }
02585 
02586         if (AddReductionVar(Phi, RK_IntegerAdd)) {
02587           DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
02588           continue;
02589         }
02590         if (AddReductionVar(Phi, RK_IntegerMult)) {
02591           DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
02592           continue;
02593         }
02594         if (AddReductionVar(Phi, RK_IntegerOr)) {
02595           DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
02596           continue;
02597         }
02598         if (AddReductionVar(Phi, RK_IntegerAnd)) {
02599           DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
02600           continue;
02601         }
02602         if (AddReductionVar(Phi, RK_IntegerXor)) {
02603           DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
02604           continue;
02605         }
02606         if (AddReductionVar(Phi, RK_IntegerMinMax)) {
02607           DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n");
02608           continue;
02609         }
02610         if (AddReductionVar(Phi, RK_FloatMult)) {
02611           DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n");
02612           continue;
02613         }
02614         if (AddReductionVar(Phi, RK_FloatAdd)) {
02615           DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n");
02616           continue;
02617         }
02618         if (AddReductionVar(Phi, RK_FloatMinMax)) {
02619           DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< *Phi <<"\n");
02620           continue;
02621         }
02622 
02623         DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
02624         return false;
02625       }// end of PHI handling
02626 
02627       // We still don't handle functions. However, we can ignore dbg intrinsic
02628       // calls and we do handle certain intrinsic and libm functions.
02629       CallInst *CI = dyn_cast<CallInst>(it);
02630       if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
02631         DEBUG(dbgs() << "LV: Found a call site.\n");
02632         return false;
02633       }
02634 
02635       // Check that the instruction return type is vectorizable.
02636       if (!VectorType::isValidElementType(it->getType()) &&
02637           !it->getType()->isVoidTy()) {
02638         DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
02639         return false;
02640       }
02641 
02642       // Check that the stored type is vectorizable.
02643       if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
02644         Type *T = ST->getValueOperand()->getType();
02645         if (!VectorType::isValidElementType(T))
02646           return false;
02647       }
02648 
02649       // Reduction instructions are allowed to have exit users.
02650       // All other instructions must not have external users.
02651       if (!AllowedExit.count(it))
02652         //Check that all of the users of the loop are inside the BB.
02653         for (Value::use_iterator I = it->use_begin(), E = it->use_end();
02654              I != E; ++I) {
02655           Instruction *U = cast<Instruction>(*I);
02656           // This user may be a reduction exit value.
02657           if (!TheLoop->contains(U)) {
02658             DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
02659             return false;
02660           }
02661         }
02662     } // next instr.
02663 
02664   }
02665 
02666   if (!Induction) {
02667     DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
02668     if (Inductions.empty())
02669       return false;
02670   }
02671 
02672   return true;
02673 }
02674 
02675 void LoopVectorizationLegality::collectLoopUniforms() {
02676   // We now know that the loop is vectorizable!
02677   // Collect variables that will remain uniform after vectorization.
02678   std::vector<Value*> Worklist;
02679   BasicBlock *Latch = TheLoop->getLoopLatch();
02680 
02681   // Start with the conditional branch and walk up the block.
02682   Worklist.push_back(Latch->getTerminator()->getOperand(0));
02683 
02684   while (Worklist.size()) {
02685     Instruction *I = dyn_cast<Instruction>(Worklist.back());
02686     Worklist.pop_back();
02687 
02688     // Look at instructions inside this loop.
02689     // Stop when reaching PHI nodes.
02690     // TODO: we need to follow values all over the loop, not only in this block.
02691     if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
02692       continue;
02693 
02694     // This is a known uniform.
02695     Uniforms.insert(I);
02696 
02697     // Insert all operands.
02698     Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
02699   }
02700 }
02701 
02702 AliasAnalysis::Location
02703 LoopVectorizationLegality::getLoadStoreLocation(Instruction *Inst) {
02704   if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
02705     return AA->getLocation(Store);
02706   else if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
02707     return AA->getLocation(Load);
02708 
02709   llvm_unreachable("Should be either load or store instruction");
02710 }
02711 
02712 bool
02713 LoopVectorizationLegality::hasPossibleGlobalWriteReorder(
02714                                                 Value *Object,
02715                                                 Instruction *Inst,
02716                                                 AliasMultiMap& WriteObjects,
02717                                                 unsigned MaxByteWidth) {
02718 
02719   AliasAnalysis::Location ThisLoc = getLoadStoreLocation(Inst);
02720 
02721   std::vector<Instruction*>::iterator
02722               it = WriteObjects[Object].begin(),
02723               end = WriteObjects[Object].end();
02724 
02725   for (; it != end; ++it) {
02726     Instruction* I = *it;
02727     if (I == Inst)
02728       continue;
02729 
02730     AliasAnalysis::Location ThatLoc = getLoadStoreLocation(I);
02731     if (AA->alias(ThisLoc.getWithNewSize(MaxByteWidth),
02732                   ThatLoc.getWithNewSize(MaxByteWidth)))
02733       return true;
02734   }
02735   return false;
02736 }
02737 
02738 bool LoopVectorizationLegality::canVectorizeMemory() {
02739 
02740   typedef SmallVector<Value*, 16> ValueVector;
02741   typedef SmallPtrSet<Value*, 16> ValueSet;
02742   // Holds the Load and Store *instructions*.
02743   ValueVector Loads;
02744   ValueVector Stores;
02745   PtrRtCheck.Pointers.clear();
02746   PtrRtCheck.Need = false;
02747 
02748   const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
02749 
02750   // For each block.
02751   for (Loop::block_iterator bb = TheLoop->block_begin(),
02752        be = TheLoop->block_end(); bb != be; ++bb) {
02753 
02754     // Scan the BB and collect legal loads and stores.
02755     for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
02756          ++it) {
02757 
02758       // If this is a load, save it. If this instruction can read from memory
02759       // but is not a load, then we quit. Notice that we don't handle function
02760       // calls that read or write.
02761       if (it->mayReadFromMemory()) {
02762         LoadInst *Ld = dyn_cast<LoadInst>(it);
02763         if (!Ld) return false;
02764         if (!Ld->isSimple() && !IsAnnotatedParallel) {
02765           DEBUG(dbgs() << "LV: Found a non-simple load.\n");
02766           return false;
02767         }
02768         Loads.push_back(Ld);
02769         continue;
02770       }
02771 
02772       // Save 'store' instructions. Abort if other instructions write to memory.
02773       if (it->mayWriteToMemory()) {
02774         StoreInst *St = dyn_cast<StoreInst>(it);
02775         if (!St) return false;
02776         if (!St->isSimple() && !IsAnnotatedParallel) {
02777           DEBUG(dbgs() << "LV: Found a non-simple store.\n");
02778           return false;
02779         }
02780         Stores.push_back(St);
02781       }
02782     } // next instr.
02783   } // next block.
02784 
02785   // Now we have two lists that hold the loads and the stores.
02786   // Next, we find the pointers that they use.
02787 
02788   // Check if we see any stores. If there are no stores, then we don't
02789   // care if the pointers are *restrict*.
02790   if (!Stores.size()) {
02791     DEBUG(dbgs() << "LV: Found a read-only loop!\n");
02792     return true;
02793   }
02794 
02795   // Holds the read and read-write *pointers* that we find. These maps hold
02796   // unique values for pointers (so no need for multi-map).
02797   AliasMap Reads;
02798   AliasMap ReadWrites;
02799 
02800   // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
02801   // multiple times on the same object. If the ptr is accessed twice, once
02802   // for read and once for write, it will only appear once (on the write
02803   // list). This is okay, since we are going to check for conflicts between
02804   // writes and between reads and writes, but not between reads and reads.
02805   ValueSet Seen;
02806 
02807   ValueVector::iterator I, IE;
02808   for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
02809     StoreInst *ST = cast<StoreInst>(*I);
02810     Value* Ptr = ST->getPointerOperand();
02811 
02812     if (isUniform(Ptr)) {
02813       DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
02814       return false;
02815     }
02816 
02817     // If we did *not* see this pointer before, insert it to
02818     // the read-write list. At this phase it is only a 'write' list.
02819     if (Seen.insert(Ptr))
02820       ReadWrites.insert(std::make_pair(Ptr, ST));
02821   }
02822 
02823   if (IsAnnotatedParallel) {
02824     DEBUG(dbgs()
02825           << "LV: A loop annotated parallel, ignore memory dependency "
02826           << "checks.\n");
02827     return true;
02828   }
02829 
02830   for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
02831     LoadInst *LD = cast<LoadInst>(*I);
02832     Value* Ptr = LD->getPointerOperand();
02833     // If we did *not* see this pointer before, insert it to the
02834     // read list. If we *did* see it before, then it is already in
02835     // the read-write list. This allows us to vectorize expressions
02836     // such as A[i] += x;  Because the address of A[i] is a read-write
02837     // pointer. This only works if the index of A[i] is consecutive.
02838     // If the address of i is unknown (for example A[B[i]]) then we may
02839     // read a few words, modify, and write a few words, and some of the
02840     // words may be written to the same address.
02841     if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr))
02842       Reads.insert(std::make_pair(Ptr, LD));
02843   }
02844 
02845   // If we write (or read-write) to a single destination and there are no
02846   // other reads in this loop then is it safe to vectorize.
02847   if (ReadWrites.size() == 1 && Reads.size() == 0) {
02848     DEBUG(dbgs() << "LV: Found a write-only loop!\n");
02849     return true;
02850   }
02851 
02852   unsigned NumReadPtrs = 0;
02853   unsigned NumWritePtrs = 0;
02854 
02855   // Find pointers with computable bounds. We are going to use this information
02856   // to place a runtime bound check.
02857   bool CanDoRT = true;
02858   AliasMap::iterator MI, ME;
02859   for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
02860     Value *V = (*MI).first;
02861     if (hasComputableBounds(V)) {
02862       PtrRtCheck.insert(SE, TheLoop, V, true);
02863       NumWritePtrs++;
02864       DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
02865     } else {
02866       CanDoRT = false;
02867       break;
02868     }
02869   }
02870   for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
02871     Value *V = (*MI).first;
02872     if (hasComputableBounds(V)) {
02873       PtrRtCheck.insert(SE, TheLoop, V, false);
02874       NumReadPtrs++;
02875       DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
02876     } else {
02877       CanDoRT = false;
02878       break;
02879     }
02880   }
02881 
02882   // Check that we did not collect too many pointers or found a
02883   // unsizeable pointer.
02884   unsigned NumComparisons = (NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1));
02885   DEBUG(dbgs() << "LV: We need to compare " << NumComparisons << " ptrs.\n");
02886   if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
02887     PtrRtCheck.reset();
02888     CanDoRT = false;
02889   }
02890 
02891   if (CanDoRT) {
02892     DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
02893   }
02894 
02895   bool NeedRTCheck = false;
02896 
02897   // Biggest vectorized access possible, vector width * unroll factor.
02898   // TODO: We're being very pessimistic here, find a way to know the
02899   // real access width before getting here.
02900   unsigned MaxByteWidth = (TTI->getRegisterBitWidth(true) / 8) *
02901                            TTI->getMaximumUnrollFactor();
02902   // Now that the pointers are in two lists (Reads and ReadWrites), we
02903   // can check that there are no conflicts between each of the writes and
02904   // between the writes to the reads.
02905   // Note that WriteObjects duplicates the stores (indexed now by underlying
02906   // objects) to avoid pointing to elements inside ReadWrites.
02907   // TODO: Maybe create a new type where they can interact without duplication.
02908   AliasMultiMap WriteObjects;
02909   ValueVector TempObjects;
02910 
02911   // Check that the read-writes do not conflict with other read-write
02912   // pointers.
02913   bool AllWritesIdentified = true;
02914   for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
02915     Value *Val = (*MI).first;
02916     Instruction *Inst = (*MI).second;
02917 
02918     GetUnderlyingObjects(Val, TempObjects, DL);
02919     for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
02920          UI != UE; ++UI) {
02921       if (!isIdentifiedObject(*UI)) {
02922         DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **UI <<"\n");
02923         NeedRTCheck = true;
02924         AllWritesIdentified = false;
02925       }
02926 
02927       // Never seen it before, can't alias.
02928       if (WriteObjects[*UI].empty()) {
02929         DEBUG(dbgs() << "LV: Adding Underlying value:" << **UI <<"\n");
02930         WriteObjects[*UI].push_back(Inst);
02931         continue;
02932       }
02933       // Direct alias found.
02934       if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
02935         DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
02936               << **UI <<"\n");
02937         return false;
02938       }
02939       DEBUG(dbgs() << "LV: Found a conflicting global value:"
02940             << **UI <<"\n");
02941       DEBUG(dbgs() << "LV: While examining store:" << *Inst <<"\n");
02942       DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");
02943 
02944       // If global alias, make sure they do alias.
02945       if (hasPossibleGlobalWriteReorder(*UI,
02946                                         Inst,
02947                                         WriteObjects,
02948                                         MaxByteWidth)) {
02949         DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **UI
02950                      << "\n");
02951         return false;
02952       }
02953 
02954       // Didn't alias, insert into map for further reference.
02955       WriteObjects[*UI].push_back(Inst);
02956     }
02957     TempObjects.clear();
02958   }
02959 
02960   /// Check that the reads don't conflict with the read-writes.
02961   for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
02962     Value *Val = (*MI).first;
02963     GetUnderlyingObjects(Val, TempObjects, DL);
02964     for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
02965          UI != UE; ++UI) {
02966       // If all of the writes are identified then we don't care if the read
02967       // pointer is identified or not.
02968       if (!AllWritesIdentified && !isIdentifiedObject(*UI)) {
02969         DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **UI <<"\n");
02970         NeedRTCheck = true;
02971       }
02972 
02973       // Never seen it before, can't alias.
02974       if (WriteObjects[*UI].empty())
02975         continue;
02976       // Direct alias found.
02977       if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
02978         DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
02979               << **UI <<"\n");
02980         return false;
02981       }
02982       DEBUG(dbgs() << "LV: Found a global value:  "
02983             << **UI <<"\n");
02984       Instruction *Inst = (*MI).second;
02985       DEBUG(dbgs() << "LV: While examining load:" << *Inst <<"\n");
02986       DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");
02987 
02988       // If global alias, make sure they do alias.
02989       if (hasPossibleGlobalWriteReorder(*UI,
02990                                         Inst,
02991                                         WriteObjects,
02992                                         MaxByteWidth)) {
02993         DEBUG(dbgs() << "LV: Found a possible read-write reorder:" << **UI
02994                      << "\n");
02995         return false;
02996       }
02997     }
02998     TempObjects.clear();
02999   }
03000 
03001   PtrRtCheck.Need = NeedRTCheck;
03002   if (NeedRTCheck && !CanDoRT) {
03003     DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
03004           "the array bounds.\n");
03005     PtrRtCheck.reset();
03006     return false;
03007   }
03008 
03009   DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") <<
03010         " need a runtime memory check.\n");
03011   return true;
03012 }
03013 
03014 static bool hasMultipleUsesOf(Instruction *I,
03015                               SmallPtrSet<Instruction *, 8> &Insts) {
03016   unsigned NumUses = 0;
03017   for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) {
03018     if (Insts.count(dyn_cast<Instruction>(*Use)))
03019       ++NumUses;
03020     if (NumUses > 1)
03021       return true;
03022   }
03023 
03024   return false;
03025 }
03026 
03027 static bool areAllUsesIn(Instruction *I, SmallPtrSet<Instruction *, 8> &Set) {
03028   for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use)
03029     if (!Set.count(dyn_cast<Instruction>(*Use)))
03030       return false;
03031   return true;
03032 }
03033 
03034 bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
03035                                                 ReductionKind Kind) {
03036   if (Phi->getNumIncomingValues() != 2)
03037     return false;
03038 
03039   // Reduction variables are only found in the loop header block.
03040   if (Phi->getParent() != TheLoop->getHeader())
03041     return false;
03042 
03043   // Obtain the reduction start value from the value that comes from the loop
03044   // preheader.
03045   Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());
03046 
03047   // ExitInstruction is the single value which is used outside the loop.
03048   // We only allow for a single reduction value to be used outside the loop.
03049   // This includes users of the reduction, variables (which form a cycle
03050   // which ends in the phi node).
03051   Instruction *ExitInstruction = 0;
03052   // Indicates that we found a reduction operation in our scan.
03053   bool FoundReduxOp = false;
03054 
03055   // We start with the PHI node and scan for all of the users of this
03056   // instruction. All users must be instructions that can be used as reduction
03057   // variables (such as ADD). We must have a single out-of-block user. The cycle
03058   // must include the original PHI.
03059   bool FoundStartPHI = false;
03060 
03061   // To recognize min/max patterns formed by a icmp select sequence, we store
03062   // the number of instruction we saw from the recognized min/max pattern,
03063   //  to make sure we only see exactly the two instructions.
03064   unsigned NumCmpSelectPatternInst = 0;
03065   ReductionInstDesc ReduxDesc(false, 0);
03066 
03067   SmallPtrSet<Instruction *, 8> VisitedInsts;
03068   SmallVector<Instruction *, 8> Worklist;
03069   Worklist.push_back(Phi);
03070   VisitedInsts.insert(Phi);
03071 
03072   // A value in the reduction can be used:
03073   //  - By the reduction:
03074   //      - Reduction operation:
03075   //        - One use of reduction value (safe).
03076   //        - Multiple use of reduction value (not safe).
03077   //      - PHI:
03078   //        - All uses of the PHI must be the reduction (safe).
03079   //        - Otherwise, not safe.
03080   //  - By one instruction outside of the loop (safe).
03081   //  - By further instructions outside of the loop (not safe).
03082   //  - By an instruction that is not part of the reduction (not safe).
03083   //    This is either:
03084   //      * An instruction type other than PHI or the reduction operation.
03085   //      * A PHI in the header other than the initial PHI.
03086   while (!Worklist.empty()) {
03087     Instruction *Cur = Worklist.back();
03088     Worklist.pop_back();
03089 
03090     // No Users.
03091     // If the instruction has no users then this is a broken chain and can't be
03092     // a reduction variable.
03093     if (Cur->use_empty())
03094       return false;
03095 
03096     bool IsAPhi = isa<PHINode>(Cur);
03097 
03098     // A header PHI use other than the original PHI.
03099     if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent())
03100       return false;
03101 
03102     // Reductions of instructions such as Div, and Sub is only possible if the
03103     // LHS is the reduction variable.
03104     if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) &&
03105         !isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) &&
03106         !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0))))
03107       return false;
03108 
03109     // Any reduction instruction must be of one of the allowed kinds.
03110     ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc);
03111     if (!ReduxDesc.IsReduction)
03112       return false;
03113 
03114     // A reduction operation must only have one use of the reduction value.
03115     if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax &&
03116         hasMultipleUsesOf(Cur, VisitedInsts))
03117       return false;
03118 
03119     // All inputs to a PHI node must be a reduction value.
03120     if(IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
03121       return false;
03122 
03123     if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(Cur) ||
03124                                      isa<SelectInst>(Cur)))
03125       ++NumCmpSelectPatternInst;
03126     if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) ||
03127                                    isa<SelectInst>(Cur)))
03128       ++NumCmpSelectPatternInst;
03129 
03130     // Check  whether we found a reduction operator.
03131     FoundReduxOp |= !IsAPhi;
03132 
03133     // Process users of current instruction. Push non PHI nodes after PHI nodes
03134     // onto the stack. This way we are going to have seen all inputs to PHI
03135     // nodes once we get to them.
03136     SmallVector<Instruction *, 8> NonPHIs;
03137     SmallVector<Instruction *, 8> PHIs;
03138     for (Value::use_iterator UI = Cur->use_begin(), E = Cur->use_end(); UI != E;
03139          ++UI) {
03140       Instruction *Usr = cast<Instruction>(*UI);
03141 
03142       // Check if we found the exit user.
03143       BasicBlock *Parent = Usr->getParent();
03144       if (!TheLoop->contains(Parent)) {
03145         // Exit if you find multiple outside users.
03146         if (ExitInstruction != 0)
03147           return false;
03148         ExitInstruction = Cur;
03149         continue;
03150       }
03151 
03152       // Process instructions only once (termination).
03153       if (VisitedInsts.insert(Usr)) {
03154         if (isa<PHINode>(Usr))
03155           PHIs.push_back(Usr);
03156         else
03157           NonPHIs.push_back(Usr);
03158       }
03159       // Remember that we completed the cycle.
03160       if (Usr == Phi)
03161         FoundStartPHI = true;
03162     }
03163     Worklist.append(PHIs.begin(), PHIs.end());
03164     Worklist.append(NonPHIs.begin(), NonPHIs.end());
03165   }
03166 
03167   // This means we have seen one but not the other instruction of the
03168   // pattern or more than just a select and cmp.
03169   if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) &&
03170       NumCmpSelectPatternInst != 2)
03171     return false;
03172 
03173   if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
03174     return false;
03175 
03176   // We found a reduction var if we have reached the original phi node and we
03177   // only have a single instruction with out-of-loop users.
03178 
03179   // This instruction is allowed to have out-of-loop users.
03180   AllowedExit.insert(ExitInstruction);
03181 
03182   // Save the description of this reduction variable.
03183   ReductionDescriptor RD(RdxStart, ExitInstruction, Kind,
03184                          ReduxDesc.MinMaxKind);
03185   Reductions[Phi] = RD;
03186   // We've ended the cycle. This is a reduction variable if we have an
03187   // outside user and it has a binary op.
03188 
03189   return true;
03190 }
03191 
03192 /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
03193 /// pattern corresponding to a min(X, Y) or max(X, Y).
03194 LoopVectorizationLegality::ReductionInstDesc
03195 LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I,
03196                                                     ReductionInstDesc &Prev) {
03197 
03198   assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) &&
03199          "Expect a select instruction");
03200   Instruction *Cmp = 0;
03201   SelectInst *Select = 0;
03202 
03203   // We must handle the select(cmp()) as a single instruction. Advance to the
03204   // select.
03205   if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) {
03206     if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->use_begin())))
03207       return ReductionInstDesc(false, I);
03208     return ReductionInstDesc(Select, Prev.MinMaxKind);
03209   }
03210 
03211   // Only handle single use cases for now.
03212   if (!(Select = dyn_cast<SelectInst>(I)))
03213     return ReductionInstDesc(false, I);
03214   if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) &&
03215       !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0))))
03216     return ReductionInstDesc(false, I);
03217   if (!Cmp->hasOneUse())
03218     return ReductionInstDesc(false, I);
03219 
03220   Value *CmpLeft;
03221   Value *CmpRight;
03222 
03223   // Look for a min/max pattern.
03224   if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
03225     return ReductionInstDesc(Select, MRK_UIntMin);
03226   else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
03227     return ReductionInstDesc(Select, MRK_UIntMax);
03228   else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
03229     return ReductionInstDesc(Select, MRK_SIntMax);
03230   else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
03231     return ReductionInstDesc(Select, MRK_SIntMin);
03232   else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
03233     return ReductionInstDesc(Select, MRK_FloatMin);
03234   else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
03235     return ReductionInstDesc(Select, MRK_FloatMax);
03236   else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
03237     return ReductionInstDesc(Select, MRK_FloatMin);
03238   else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
03239     return ReductionInstDesc(Select, MRK_FloatMax);
03240 
03241   return ReductionInstDesc(false, I);
03242 }
03243 
03244 LoopVectorizationLegality::ReductionInstDesc
03245 LoopVectorizationLegality::isReductionInstr(Instruction *I,
03246                                             ReductionKind Kind,
03247                                             ReductionInstDesc &Prev) {
03248   bool FP = I->getType()->isFloatingPointTy();
03249   bool FastMath = (FP && I->isCommutative() && I->isAssociative());
03250   switch (I->getOpcode()) {
03251   default:
03252     return ReductionInstDesc(false, I);
03253   case Instruction::PHI:
03254       if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd &&
03255                  Kind != RK_FloatMinMax))
03256         return ReductionInstDesc(false, I);
03257     return ReductionInstDesc(I, Prev.MinMaxKind);
03258   case Instruction::Sub:
03259   case Instruction::Add:
03260     return ReductionInstDesc(Kind == RK_IntegerAdd, I);
03261   case Instruction::Mul:
03262     return ReductionInstDesc(Kind == RK_IntegerMult, I);
03263   case Instruction::And:
03264     return ReductionInstDesc(Kind == RK_IntegerAnd, I);
03265   case Instruction::Or:
03266     return ReductionInstDesc(Kind == RK_IntegerOr, I);
03267   case Instruction::Xor:
03268     return ReductionInstDesc(Kind == RK_IntegerXor, I);
03269   case Instruction::FMul:
03270     return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I);
03271   case Instruction::FAdd:
03272     return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I);
03273   case Instruction::FCmp:
03274   case Instruction::ICmp:
03275   case Instruction::Select:
03276     if (Kind != RK_IntegerMinMax &&
03277         (!HasFunNoNaNAttr || Kind != RK_FloatMinMax))
03278       return ReductionInstDesc(false, I);
03279     return isMinMaxSelectCmpPattern(I, Prev);
03280   }
03281 }
03282 
03283 LoopVectorizationLegality::InductionKind
03284 LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
03285   Type *PhiTy = Phi->getType();
03286   // We only handle integer and pointer inductions variables.
03287   if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
03288     return IK_NoInduction;
03289 
03290   // Check that the PHI is consecutive.
03291   const SCEV *PhiScev = SE->getSCEV(Phi);
03292   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
03293   if (!AR) {
03294     DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
03295     return IK_NoInduction;
03296   }
03297   const SCEV *Step = AR->getStepRecurrence(*SE);
03298 
03299   // Integer inductions need to have a stride of one.
03300   if (PhiTy->isIntegerTy()) {
03301     if (Step->isOne())
03302       return IK_IntInduction;
03303     if (Step->isAllOnesValue())
03304       return IK_ReverseIntInduction;
03305     return IK_NoInduction;
03306   }
03307 
03308   // Calculate the pointer stride and check if it is consecutive.
03309   const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
03310   if (!C)
03311     return IK_NoInduction;
03312 
03313   assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
03314   uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
03315   if (C->getValue()->equalsInt(Size))
03316     return IK_PtrInduction;
03317   else if (C->getValue()->equalsInt(0 - Size))
03318     return IK_ReversePtrInduction;
03319 
03320   return IK_NoInduction;
03321 }
03322 
03323 bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
03324   Value *In0 = const_cast<Value*>(V);
03325   PHINode *PN = dyn_cast_or_null<PHINode>(In0);
03326   if (!PN)
03327     return false;
03328 
03329   return Inductions.count(PN);
03330 }
03331 
03332 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB)  {
03333   assert(TheLoop->contains(BB) && "Unknown block used");
03334 
03335   // Blocks that do not dominate the latch need predication.
03336   BasicBlock* Latch = TheLoop->getLoopLatch();
03337   return !DT->dominates(BB, Latch);
03338 }
03339 
03340 bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
03341   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
03342     // We might be able to hoist the load.
03343     if (it->mayReadFromMemory() && !LoadSpeculation.isHoistableLoad(it))
03344       return false;
03345 
03346     // We don't predicate stores at the moment.
03347     if (it->mayWriteToMemory() || it->mayThrow())
03348       return false;
03349 
03350     // The instructions below can trap.
03351     switch (it->getOpcode()) {
03352     default: continue;
03353     case Instruction::UDiv:
03354     case Instruction::SDiv:
03355     case Instruction::URem:
03356     case Instruction::SRem:
03357              return false;
03358     }
03359   }
03360 
03361   return true;
03362 }
03363 
03364 bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
03365   const SCEV *PhiScev = SE->getSCEV(Ptr);
03366   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
03367   if (!AR)
03368     return false;
03369 
03370   return AR->isAffine();
03371 }
03372 
03373 LoopVectorizationCostModel::VectorizationFactor
03374 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
03375                                                       unsigned UserVF) {
03376   // Width 1 means no vectorize
03377   VectorizationFactor Factor = { 1U, 0U };
03378   if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
03379     DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
03380     return Factor;
03381   }
03382 
03383   // Find the trip count.
03384   unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
03385   DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n");
03386 
03387   unsigned WidestType = getWidestType();
03388   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
03389   unsigned MaxVectorSize = WidestRegister / WidestType;
03390   DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
03391   DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n");
03392 
03393   if (MaxVectorSize == 0) {
03394     DEBUG(dbgs() << "LV: The target has no vector registers.\n");
03395     MaxVectorSize = 1;
03396   }
03397 
03398   assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements"
03399          " into one vector!");
03400 
03401   unsigned VF = MaxVectorSize;
03402 
03403   // If we optimize the program for size, avoid creating the tail loop.
03404   if (OptForSize) {
03405     // If we are unable to calculate the trip count then don't try to vectorize.
03406     if (TC < 2) {
03407       DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
03408       return Factor;
03409     }
03410 
03411     // Find the maximum SIMD width that can fit within the trip count.
03412     VF = TC % MaxVectorSize;
03413 
03414     if (VF == 0)
03415       VF = MaxVectorSize;
03416 
03417     // If the trip count that we found modulo the vectorization factor is not
03418     // zero then we require a tail.
03419     if (VF < 2) {
03420       DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
03421       return Factor;
03422     }
03423   }
03424 
03425   if (UserVF != 0) {
03426     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
03427     DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n");
03428 
03429     Factor.Width = UserVF;
03430     return Factor;
03431   }
03432 
03433   float Cost = expectedCost(1);
03434   unsigned Width = 1;
03435   DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
03436   for (unsigned i=2; i <= VF; i*=2) {
03437     // Notice that the vector loop needs to be executed less times, so
03438     // we need to divide the cost of the vector loops by the width of
03439     // the vector elements.
03440     float VectorCost = expectedCost(i) / (float)i;
03441     DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
03442           (int)VectorCost << ".\n");
03443     if (VectorCost < Cost) {
03444       Cost = VectorCost;
03445       Width = i;
03446     }
03447   }
03448 
03449   DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
03450   Factor.Width = Width;
03451   Factor.Cost = Width * Cost;
03452   return Factor;
03453 }
03454 
03455 unsigned LoopVectorizationCostModel::getWidestType() {
03456   unsigned MaxWidth = 8;
03457 
03458   // For each block.
03459   for (Loop::block_iterator bb = TheLoop->block_begin(),
03460        be = TheLoop->block_end(); bb != be; ++bb) {
03461     BasicBlock *BB = *bb;
03462 
03463     // For each instruction in the loop.
03464     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
03465       Type *T = it->getType();
03466 
03467       // Only examine Loads, Stores and PHINodes.
03468       if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
03469         continue;
03470 
03471       // Examine PHI nodes that are reduction variables.
03472       if (PHINode *PN = dyn_cast<PHINode>(it))
03473         if (!Legal->getReductionVars()->count(PN))
03474           continue;
03475 
03476       // Examine the stored values.
03477       if (StoreInst *ST = dyn_cast<StoreInst>(it))
03478         T = ST->getValueOperand()->getType();
03479 
03480       // Ignore loaded pointer types and stored pointer types that are not
03481       // consecutive. However, we do want to take consecutive stores/loads of
03482       // pointer vectors into account.
03483       if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
03484         continue;
03485 
03486       MaxWidth = std::max(MaxWidth,
03487                           (unsigned)DL->getTypeSizeInBits(T->getScalarType()));
03488     }
03489   }
03490 
03491   return MaxWidth;
03492 }
03493 
03494 unsigned
03495 LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
03496                                                unsigned UserUF,
03497                                                unsigned VF,
03498                                                unsigned LoopCost) {
03499 
03500   // -- The unroll heuristics --
03501   // We unroll the loop in order to expose ILP and reduce the loop overhead.
03502   // There are many micro-architectural considerations that we can't predict
03503   // at this level. For example frontend pressure (on decode or fetch) due to
03504   // code size, or the number and capabilities of the execution ports.
03505   //
03506   // We use the following heuristics to select the unroll factor:
03507   // 1. If the code has reductions the we unroll in order to break the cross
03508   // iteration dependency.
03509   // 2. If the loop is really small then we unroll in order to reduce the loop
03510   // overhead.
03511   // 3. We don't unroll if we think that we will spill registers to memory due
03512   // to the increased register pressure.
03513 
03514   // Use the user preference, unless 'auto' is selected.
03515   if (UserUF != 0)
03516     return UserUF;
03517 
03518   // When we optimize for size we don't unroll.
03519   if (OptForSize)
03520     return 1;
03521 
03522   // Do not unroll loops with a relatively small trip count.
03523   unsigned TC = SE->getSmallConstantTripCount(TheLoop,
03524                                               TheLoop->getLoopLatch());
03525   if (TC > 1 && TC < TinyTripCountUnrollThreshold)
03526     return 1;
03527 
03528   unsigned TargetVectorRegisters = TTI.getNumberOfRegisters(true);
03529   DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters <<
03530         " vector registers\n");
03531 
03532   LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
03533   // We divide by these constants so assume that we have at least one
03534   // instruction that uses at least one register.
03535   R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
03536   R.NumInstructions = std::max(R.NumInstructions, 1U);
03537 
03538   // We calculate the unroll factor using the following formula.
03539   // Subtract the number of loop invariants from the number of available
03540   // registers. These registers are used by all of the unrolled instances.
03541   // Next, divide the remaining registers by the number of registers that is
03542   // required by the loop, in order to estimate how many parallel instances
03543   // fit without causing spills.
03544   unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers;
03545 
03546   // Clamp the unroll factor ranges to reasonable factors.
03547   unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();
03548 
03549   // If we did not calculate the cost for VF (because the user selected the VF)
03550   // then we calculate the cost of VF here.
03551   if (LoopCost == 0)
03552     LoopCost = expectedCost(VF);
03553 
03554   // Clamp the calculated UF to be between the 1 and the max unroll factor
03555   // that the target allows.
03556   if (UF > MaxUnrollSize)
03557     UF = MaxUnrollSize;
03558   else if (UF < 1)
03559     UF = 1;
03560 
03561   if (Legal->getReductionVars()->size()) {
03562     DEBUG(dbgs() << "LV: Unrolling because of reductions. \n");
03563     return UF;
03564   }
03565 
03566   // We want to unroll tiny loops in order to reduce the loop overhead.
03567   // We assume that the cost overhead is 1 and we use the cost model
03568   // to estimate the cost of the loop and unroll until the cost of the
03569   // loop overhead is about 5% of the cost of the loop.
03570   DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<" \n");
03571   if (LoopCost < 20) {
03572     DEBUG(dbgs() << "LV: Unrolling to reduce branch cost. \n");
03573     unsigned NewUF = 20/LoopCost + 1;
03574     return std::min(NewUF, UF);
03575   }
03576 
03577   DEBUG(dbgs() << "LV: Not Unrolling. \n");
03578   return 1;
03579 }
03580 
03581 LoopVectorizationCostModel::RegisterUsage
03582 LoopVectorizationCostModel::calculateRegisterUsage() {
03583   // This function calculates the register usage by measuring the highest number
03584   // of values that are alive at a single location. Obviously, this is a very
03585   // rough estimation. We scan the loop in a topological order in order and
03586   // assign a number to each instruction. We use RPO to ensure that defs are
03587   // met before their users. We assume that each instruction that has in-loop
03588   // users starts an interval. We record every time that an in-loop value is
03589   // used, so we have a list of the first and last occurrences of each
03590   // instruction. Next, we transpose this data structure into a multi map that
03591   // holds the list of intervals that *end* at a specific location. This multi
03592   // map allows us to perform a linear search. We scan the instructions linearly
03593   // and record each time that a new interval starts, by placing it in a set.
03594   // If we find this value in the multi-map then we remove it from the set.
03595   // The max register usage is the maximum size of the set.
03596   // We also search for instructions that are defined outside the loop, but are
03597   // used inside the loop. We need this number separately from the max-interval
03598   // usage number because when we unroll, loop-invariant values do not take
03599   // more register.
03600   LoopBlocksDFS DFS(TheLoop);
03601   DFS.perform(LI);
03602 
03603   RegisterUsage R;
03604   R.NumInstructions = 0;
03605 
03606   // Each 'key' in the map opens a new interval. The values
03607   // of the map are the index of the 'last seen' usage of the
03608   // instruction that is the key.
03609   typedef DenseMap<Instruction*, unsigned> IntervalMap;
03610   // Maps instruction to its index.
03611   DenseMap<unsigned, Instruction*> IdxToInstr;
03612   // Marks the end of each interval.
03613   IntervalMap EndPoint;
03614   // Saves the list of instruction indices that are used in the loop.
03615   SmallSet<Instruction*, 8> Ends;
03616   // Saves the list of values that are used in the loop but are
03617   // defined outside the loop, such as arguments and constants.
03618   SmallPtrSet<Value*, 8> LoopInvariants;
03619 
03620   unsigned Index = 0;
03621   for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
03622        be = DFS.endRPO(); bb != be; ++bb) {
03623     R.NumInstructions += (*bb)->size();
03624     for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
03625          ++it) {
03626       Instruction *I = it;
03627       IdxToInstr[Index++] = I;
03628 
03629       // Save the end location of each USE.
03630       for (unsigned i = 0; i < I->getNumOperands(); ++i) {
03631         Value *U = I->getOperand(i);
03632         Instruction *Instr = dyn_cast<Instruction>(U);
03633 
03634         // Ignore non-instruction values such as arguments, constants, etc.
03635         if (!Instr) continue;
03636 
03637         // If this instruction is outside the loop then record it and continue.
03638         if (!TheLoop->contains(Instr)) {
03639           LoopInvariants.insert(Instr);
03640           continue;
03641         }
03642 
03643         // Overwrite previous end points.
03644         EndPoint[Instr] = Index;
03645         Ends.insert(Instr);
03646       }
03647     }
03648   }
03649 
03650   // Saves the list of intervals that end with the index in 'key'.
03651   typedef SmallVector<Instruction*, 2> InstrList;
03652   DenseMap<unsigned, InstrList> TransposeEnds;
03653 
03654   // Transpose the EndPoints to a list of values that end at each index.
03655   for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
03656        it != e; ++it)
03657     TransposeEnds[it->second].push_back(it->first);
03658 
03659   SmallSet<Instruction*, 8> OpenIntervals;
03660   unsigned MaxUsage = 0;
03661 
03662 
03663   DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
03664   for (unsigned int i = 0; i < Index; ++i) {
03665     Instruction *I = IdxToInstr[i];
03666     // Ignore instructions that are never used within the loop.
03667     if (!Ends.count(I)) continue;
03668 
03669     // Remove all of the instructions that end at this location.
03670     InstrList &List = TransposeEnds[i];
03671     for (unsigned int j=0, e = List.size(); j < e; ++j)
03672       OpenIntervals.erase(List[j]);
03673 
03674     // Count the number of live interals.
03675     MaxUsage = std::max(MaxUsage, OpenIntervals.size());
03676 
03677     DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
03678           OpenIntervals.size() <<"\n");
03679 
03680     // Add the current instruction to the list of open intervals.
03681     OpenIntervals.insert(I);
03682   }
03683 
03684   unsigned Invariant = LoopInvariants.size();
03685   DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n");
03686   DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n");
03687   DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n");
03688 
03689   R.LoopInvariantRegs = Invariant;
03690   R.MaxLocalUsers = MaxUsage;
03691   return R;
03692 }
03693 
03694 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
03695   unsigned Cost = 0;
03696 
03697   // For each block.
03698   for (Loop::block_iterator bb = TheLoop->block_begin(),
03699        be = TheLoop->block_end(); bb != be; ++bb) {
03700     unsigned BlockCost = 0;
03701     BasicBlock *BB = *bb;
03702 
03703     // For each instruction in the old loop.
03704     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
03705       // Skip dbg intrinsics.
03706       if (isa<DbgInfoIntrinsic>(it))
03707         continue;
03708 
03709       unsigned C = getInstructionCost(it, VF);
03710       Cost += C;
03711       DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " <<
03712             VF << " For instruction: "<< *it << "\n");
03713     }
03714 
03715     // We assume that if-converted blocks have a 50% chance of being executed.
03716     // When the code is scalar then some of the blocks are avoided due to CF.
03717     // When the code is vectorized we execute all code paths.
03718     if (Legal->blockNeedsPredication(*bb) && VF == 1)
03719       BlockCost /= 2;
03720 
03721     Cost += BlockCost;
03722   }
03723 
03724   return Cost;
03725 }
03726 
03727 unsigned
03728 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
03729   // If we know that this instruction will remain uniform, check the cost of
03730   // the scalar version.
03731   if (Legal->isUniformAfterVectorization(I))
03732     VF = 1;
03733 
03734   Type *RetTy = I->getType();
03735   Type *VectorTy = ToVectorTy(RetTy, VF);
03736 
03737   // TODO: We need to estimate the cost of intrinsic calls.
03738   switch (I->getOpcode()) {
03739   case Instruction::GetElementPtr:
03740     // We mark this instruction as zero-cost because the cost of GEPs in
03741     // vectorized code depends on whether the corresponding memory instruction
03742     // is scalarized or not. Therefore, we handle GEPs with the memory
03743     // instruction cost.
03744     return 0;
03745   case Instruction::Br: {
03746     return TTI.getCFInstrCost(I->getOpcode());
03747   }
03748   case Instruction::PHI:
03749     //TODO: IF-converted IFs become selects.
03750     return 0;
03751   case Instruction::Add:
03752   case Instruction::FAdd:
03753   case Instruction::Sub:
03754   case Instruction::FSub:
03755   case Instruction::Mul:
03756   case Instruction::FMul:
03757   case Instruction::UDiv:
03758   case Instruction::SDiv:
03759   case Instruction::FDiv:
03760   case Instruction::URem:
03761   case Instruction::SRem:
03762   case Instruction::FRem:
03763   case Instruction::Shl:
03764   case Instruction::LShr:
03765   case Instruction::AShr:
03766   case Instruction::And:
03767   case Instruction::Or:
03768   case Instruction::Xor: {
03769     // Certain instructions can be cheaper to vectorize if they have a constant
03770     // second vector operand. One example of this are shifts on x86.
03771     TargetTransformInfo::OperandValueKind Op1VK =
03772       TargetTransformInfo::OK_AnyValue;
03773     TargetTransformInfo::OperandValueKind Op2VK =
03774       TargetTransformInfo::OK_AnyValue;
03775 
03776     if (isa<ConstantInt>(I->getOperand(1)))
03777       Op2VK = TargetTransformInfo::OK_UniformConstantValue;
03778 
03779     return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK);
03780   }
03781   case Instruction::Select: {
03782     SelectInst *SI = cast<SelectInst>(I);
03783     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
03784     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
03785     Type *CondTy = SI->getCondition()->getType();
03786     if (!ScalarCond)
03787       CondTy = VectorType::get(CondTy, VF);
03788 
03789     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
03790   }
03791   case Instruction::ICmp:
03792   case Instruction::FCmp: {
03793     Type *ValTy = I->getOperand(0)->getType();
03794     VectorTy = ToVectorTy(ValTy, VF);
03795     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
03796   }
03797   case Instruction::Store:
03798   case Instruction::Load: {
03799     StoreInst *SI = dyn_cast<StoreInst>(I);
03800     LoadInst *LI = dyn_cast<LoadInst>(I);
03801     Type *ValTy = (SI ? SI->getValueOperand()->getType() :
03802                    LI->getType());
03803     VectorTy = ToVectorTy(ValTy, VF);
03804 
03805     unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
03806     unsigned AS = SI ? SI->getPointerAddressSpace() :
03807       LI->getPointerAddressSpace();
03808     Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
03809     // We add the cost of address computation here instead of with the gep
03810     // instruction because only here we know whether the operation is
03811     // scalarized.
03812     if (VF == 1)
03813       return TTI.getAddressComputationCost(VectorTy) +
03814         TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
03815 
03816     // Scalarized loads/stores.
03817     int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
03818     bool Reverse = ConsecutiveStride < 0;
03819     unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
03820     unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF;
03821     if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
03822       unsigned Cost = 0;
03823       // The cost of extracting from the value vector and pointer vector.
03824       Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
03825       for (unsigned i = 0; i < VF; ++i) {
03826         //  The cost of extracting the pointer operand.
03827         Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
03828         // In case of STORE, the cost of ExtractElement from the vector.
03829         // In case of LOAD, the cost of InsertElement into the returned
03830         // vector.
03831         Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :
03832                                             Instruction::InsertElement,
03833                                             VectorTy, i);
03834       }
03835 
03836       // The cost of the scalar loads/stores.
03837       Cost += VF * TTI.getAddressComputationCost(ValTy->getScalarType());
03838       Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
03839                                        Alignment, AS);
03840       return Cost;
03841     }
03842 
03843     // Wide load/stores.
03844     unsigned Cost = TTI.getAddressComputationCost(VectorTy);
03845     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
03846 
03847     if (Reverse)
03848       Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
03849                                   VectorTy, 0);
03850     return Cost;
03851   }
03852   case Instruction::ZExt:
03853   case Instruction::SExt:
03854   case Instruction::FPToUI:
03855   case Instruction::FPToSI:
03856   case Instruction::FPExt:
03857   case Instruction::PtrToInt:
03858   case Instruction::IntToPtr:
03859   case Instruction::SIToFP:
03860   case Instruction::UIToFP:
03861   case Instruction::Trunc:
03862   case Instruction::FPTrunc:
03863   case Instruction::BitCast: {
03864     // We optimize the truncation of induction variable.
03865     // The cost of these is the same as the scalar operation.
03866     if (I->getOpcode() == Instruction::Trunc &&
03867         Legal->isInductionVariable(I->getOperand(0)))
03868       return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
03869                                   I->getOperand(0)->getType());
03870 
03871     Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
03872     return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
03873   }
03874   case Instruction::Call: {
03875     CallInst *CI = cast<CallInst>(I);
03876     Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
03877     assert(ID && "Not an intrinsic call!");
03878     Type *RetTy = ToVectorTy(CI->getType(), VF);
03879     SmallVector<Type*, 4> Tys;
03880     for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
03881       Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
03882     return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
03883   }
03884   default: {
03885     // We are scalarizing the instruction. Return the cost of the scalar
03886     // instruction, plus the cost of insert and extract into vector
03887     // elements, times the vector width.
03888     unsigned Cost = 0;
03889 
03890     if (!RetTy->isVoidTy() && VF != 1) {
03891       unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
03892                                                 VectorTy);
03893       unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
03894                                                 VectorTy);
03895 
03896       // The cost of inserting the results plus extracting each one of the
03897       // operands.
03898       Cost += VF * (InsCost + ExtCost * I->getNumOperands());
03899     }
03900 
03901     // The cost of executing VF copies of the scalar instruction. This opcode
03902     // is unknown. Assume that it is the same as 'mul'.
03903     Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
03904     return Cost;
03905   }
03906   }// end of switch.
03907 }
03908 
03909 Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
03910   if (Scalar->isVoidTy() || VF == 1)
03911     return Scalar;
03912   return VectorType::get(Scalar, VF);
03913 }
03914 
03915 char LoopVectorize::ID = 0;
03916 static const char lv_name[] = "Loop Vectorization";
03917 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
03918 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
03919 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
03920 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
03921 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
03922 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
03923 
03924 namespace llvm {
03925   Pass *createLoopVectorizePass() {
03926     return new LoopVectorize();
03927   }
03928 }
03929 
03930 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
03931   // Check for a store.
03932   if (StoreInst *ST = dyn_cast<StoreInst>(Inst))
03933     return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;
03934 
03935   // Check for a load.
03936   if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
03937     return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;
03938 
03939   return false;
03940 }