LoopVectorize.cpp
00001 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
00011 // and generates target-independent LLVM-IR.
00012 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
00013 // of instructions in order to estimate the profitability of vectorization.
00014 //
00015 // The loop vectorizer combines consecutive loop iterations into a single
00016 // 'wide' iteration. After this transformation the index is incremented
00017 // by the SIMD vector width, and not by one.
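//
// As a rough illustrative sketch (not tied to any particular target), a scalar
// loop such as
//   for (i = 0; i < n; ++i)
//     A[i] = B[i] + 1;
// is conceptually rewritten for a vectorization factor of 4 so that each wide
// iteration computes
//   A[i:i+3] = B[i:i+3] + 1;
// and the induction variable 'i' advances by 4 instead of 1.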
00018 //
00019 // This pass has four parts:
00020 // 1. The main loop pass that drives the different parts.
00021 // 2. LoopVectorizationLegality - A unit that checks for the legality
00022 //    of the vectorization.
00023 // 3. InnerLoopVectorizer - A unit that performs the actual
00024 //    widening of instructions.
00025 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
00026 //    of vectorization. It decides on the optimal vector width, which
00027 //    can be one, if vectorization is not profitable.
00028 //
00029 //===----------------------------------------------------------------------===//
00030 //
00031 // The reduction-variable vectorization is based on the paper:
00032 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
00033 //
00034 // Variable uniformity checks are inspired by:
00035 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
00036 //
00037 // The interleaved access vectorization is based on the paper:
00038 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
00039 //  Data for SIMD
00040 //
00041 // Other ideas/concepts are from:
00042 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
00043 //
00044 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
00045 //  Vectorizing Compilers.
00046 //
00047 //===----------------------------------------------------------------------===//
00048 
00049 #include "llvm/Transforms/Vectorize.h"
00050 #include "llvm/ADT/DenseMap.h"
00051 #include "llvm/ADT/EquivalenceClasses.h"
00052 #include "llvm/ADT/Hashing.h"
00053 #include "llvm/ADT/MapVector.h"
00054 #include "llvm/ADT/SetVector.h"
00055 #include "llvm/ADT/SmallPtrSet.h"
00056 #include "llvm/ADT/SmallSet.h"
00057 #include "llvm/ADT/SmallVector.h"
00058 #include "llvm/ADT/Statistic.h"
00059 #include "llvm/ADT/StringExtras.h"
00060 #include "llvm/Analysis/AliasAnalysis.h"
00061 #include "llvm/Analysis/AliasSetTracker.h"
00062 #include "llvm/Analysis/AssumptionCache.h"
00063 #include "llvm/Analysis/BlockFrequencyInfo.h"
00064 #include "llvm/Analysis/CodeMetrics.h"
00065 #include "llvm/Analysis/LoopAccessAnalysis.h"
00066 #include "llvm/Analysis/LoopInfo.h"
00067 #include "llvm/Analysis/LoopIterator.h"
00068 #include "llvm/Analysis/LoopPass.h"
00069 #include "llvm/Analysis/ScalarEvolution.h"
00070 #include "llvm/Analysis/ScalarEvolutionExpander.h"
00071 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
00072 #include "llvm/Analysis/TargetTransformInfo.h"
00073 #include "llvm/Analysis/ValueTracking.h"
00074 #include "llvm/IR/Constants.h"
00075 #include "llvm/IR/DataLayout.h"
00076 #include "llvm/IR/DebugInfo.h"
00077 #include "llvm/IR/DerivedTypes.h"
00078 #include "llvm/IR/DiagnosticInfo.h"
00079 #include "llvm/IR/Dominators.h"
00080 #include "llvm/IR/Function.h"
00081 #include "llvm/IR/IRBuilder.h"
00082 #include "llvm/IR/Instructions.h"
00083 #include "llvm/IR/IntrinsicInst.h"
00084 #include "llvm/IR/LLVMContext.h"
00085 #include "llvm/IR/Module.h"
00086 #include "llvm/IR/PatternMatch.h"
00087 #include "llvm/IR/Type.h"
00088 #include "llvm/IR/Value.h"
00089 #include "llvm/IR/ValueHandle.h"
00090 #include "llvm/IR/Verifier.h"
00091 #include "llvm/Pass.h"
00092 #include "llvm/Support/BranchProbability.h"
00093 #include "llvm/Support/CommandLine.h"
00094 #include "llvm/Support/Debug.h"
00095 #include "llvm/Support/raw_ostream.h"
00096 #include "llvm/Transforms/Scalar.h"
00097 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
00098 #include "llvm/Transforms/Utils/Local.h"
00099 #include "llvm/Transforms/Utils/VectorUtils.h"
00100 #include "llvm/Transforms/Utils/LoopUtils.h"
00101 #include <algorithm>
00102 #include <map>
00103 #include <tuple>
00104 
00105 using namespace llvm;
00106 using namespace llvm::PatternMatch;
00107 
00108 #define LV_NAME "loop-vectorize"
00109 #define DEBUG_TYPE LV_NAME
00110 
00111 STATISTIC(LoopsVectorized, "Number of loops vectorized");
00112 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
00113 
00114 static cl::opt<bool>
00115 EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
00116                    cl::desc("Enable if-conversion during vectorization."));
00117 
00118 /// We don't vectorize loops with a known constant trip count below this number.
00119 static cl::opt<unsigned>
00120 TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
00121                              cl::Hidden,
00122                              cl::desc("Don't vectorize loops with a constant "
00123                                       "trip count that is smaller than this "
00124                                       "value."));
00125 
00126 /// This enables versioning on the strides of symbolically striding memory
00127 /// accesses in code like the following.
00128 ///   for (i = 0; i < N; ++i)
00129 ///     A[i * Stride1] += B[i * Stride2] ...
00130 ///
00131 /// Will be roughly translated to
00132 ///    if (Stride1 == 1 && Stride2 == 1) {
00133 ///      for (i = 0; i < N; i+=4)
00134 ///       A[i:i+3] += ...
00135 ///    } else
00136 ///      ...
00137 static cl::opt<bool> EnableMemAccessVersioning(
00138     "enable-mem-access-versioning", cl::init(true), cl::Hidden,
00139     cl::desc("Enable symbolic stride memory access versioning"));
00140 
00141 static cl::opt<bool> EnableInterleavedMemAccesses(
00142     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
00143     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
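// As an illustration (assuming the pass is exercised directly through 'opt';
// the exact invocation may differ between builds), this hidden flag can be
// toggled on the command line, e.g.:
//   opt -loop-vectorize -enable-interleaved-mem-accesses -S input.ll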
00144 
00145 /// Maximum factor for an interleaved memory access.
00146 static cl::opt<unsigned> MaxInterleaveGroupFactor(
00147     "max-interleave-group-factor", cl::Hidden,
00148     cl::desc("Maximum factor for an interleaved access group (default = 8)"),
00149     cl::init(8));
00150 
00151 /// We don't unroll loops with a known constant trip count below this number.
00152 static const unsigned TinyTripCountUnrollThreshold = 128;
00153 
00154 static cl::opt<unsigned> ForceTargetNumScalarRegs(
00155     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
00156     cl::desc("A flag that overrides the target's number of scalar registers."));
00157 
00158 static cl::opt<unsigned> ForceTargetNumVectorRegs(
00159     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
00160     cl::desc("A flag that overrides the target's number of vector registers."));
00161 
00162 /// Maximum vectorization interleave count.
00163 static const unsigned MaxInterleaveFactor = 16;
00164 
00165 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
00166     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
00167     cl::desc("A flag that overrides the target's max interleave factor for "
00168              "scalar loops."));
00169 
00170 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
00171     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
00172     cl::desc("A flag that overrides the target's max interleave factor for "
00173              "vectorized loops."));
00174 
00175 static cl::opt<unsigned> ForceTargetInstructionCost(
00176     "force-target-instruction-cost", cl::init(0), cl::Hidden,
00177     cl::desc("A flag that overrides the target's expected cost for "
00178              "an instruction to a single constant value. Mostly "
00179              "useful for getting consistent testing."));
00180 
00181 static cl::opt<unsigned> SmallLoopCost(
00182     "small-loop-cost", cl::init(20), cl::Hidden,
00183     cl::desc("The cost of a loop that is considered 'small' by the unroller."));
00184 
00185 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
00186     "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
00187     cl::desc("Enable the use of the block frequency analysis to access PGO "
00188              "heuristics minimizing code growth in cold regions and being more "
00189              "aggressive in hot regions."));
00190 
00191 // Runtime unroll loops for load/store throughput.
00192 static cl::opt<bool> EnableLoadStoreRuntimeUnroll(
00193     "enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden,
00194     cl::desc("Enable runtime unrolling until load/store ports are saturated"));
00195 
00196 /// The number of stores in a loop that are allowed to need predication.
00197 static cl::opt<unsigned> NumberOfStoresToPredicate(
00198     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
00199     cl::desc("Max number of stores to be predicated behind an if."));
00200 
00201 static cl::opt<bool> EnableIndVarRegisterHeur(
00202     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
00203     cl::desc("Count the induction variable only once when unrolling"));
00204 
00205 static cl::opt<bool> EnableCondStoresVectorization(
00206     "enable-cond-stores-vec", cl::init(false), cl::Hidden,
00207     cl::desc("Enable if predication of stores during vectorization."));
00208 
00209 static cl::opt<unsigned> MaxNestedScalarReductionUF(
00210     "max-nested-scalar-reduction-unroll", cl::init(2), cl::Hidden,
00211     cl::desc("The maximum unroll factor to use when unrolling a scalar "
00212              "reduction in a nested loop."));
00213 
00214 namespace {
00215 
00216 // Forward declarations.
00217 class LoopVectorizationLegality;
00218 class LoopVectorizationCostModel;
00219 class LoopVectorizeHints;
00220 
00221 /// \brief This modifies LoopAccessReport to initialize message with
00222 /// loop-vectorizer-specific part.
00223 class VectorizationReport : public LoopAccessReport {
00224 public:
00225   VectorizationReport(Instruction *I = nullptr)
00226       : LoopAccessReport("loop not vectorized: ", I) {}
00227 
00228   /// \brief This allows promotion of the loop-access analysis report into the
00229   /// loop-vectorizer report.  It modifies the message to add the
00230   /// loop-vectorizer-specific part of the message.
00231   explicit VectorizationReport(const LoopAccessReport &R)
00232       : LoopAccessReport(Twine("loop not vectorized: ") + R.str(),
00233                          R.getInstr()) {}
00234 };
00235 
00236 /// A helper function for converting Scalar types to vector types.
00237 /// If the incoming type is void, we return void. If the VF is 1, we return
00238 /// the scalar type.
00239 static Type* ToVectorTy(Type *Scalar, unsigned VF) {
00240   if (Scalar->isVoidTy() || VF == 1)
00241     return Scalar;
00242   return VectorType::get(Scalar, VF);
00243 }
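// For example, ToVectorTy(i32, 4) returns the vector type <4 x i32>, while
// ToVectorTy(i32, 1) and ToVectorTy(void, 8) return the scalar i32 and void
// types unchanged.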
00244 
00245 /// InnerLoopVectorizer vectorizes loops which contain only one basic
00246 /// block to a specified vectorization factor (VF).
00247 /// This class performs the widening of scalars into vectors, or multiple
00248 /// scalars. This class also implements the following features:
00249 /// * It inserts an epilogue loop for handling loops that don't have iteration
00250 ///   counts that are known to be a multiple of the vectorization factor.
00251 /// * It handles the code generation for reduction variables.
00252 /// * Scalarization (implementation using scalars) of un-vectorizable
00253 ///   instructions.
00254 /// InnerLoopVectorizer does not perform any vectorization-legality
00255 /// checks, and relies on the caller to check for the different legality
00256 /// aspects. The InnerLoopVectorizer relies on the
00257 /// LoopVectorizationLegality class to provide information about the induction
00258 /// and reduction variables that were found in the loop.
00259 class InnerLoopVectorizer {
00260 public:
00261   InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
00262                       DominatorTree *DT, const TargetLibraryInfo *TLI,
00263                       const TargetTransformInfo *TTI, unsigned VecWidth,
00264                       unsigned UnrollFactor)
00265       : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
00266         VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
00267         Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
00268         Legal(nullptr), AddedSafetyChecks(false) {}
00269 
00270   // Perform the actual loop widening (vectorization).
00271   void vectorize(LoopVectorizationLegality *L) {
00272     Legal = L;
00273     // Create a new empty loop. Unlink the old loop and connect the new one.
00274     createEmptyLoop();
00275     // Widen each instruction in the old loop to a new one in the new loop.
00276     // Use the Legality module to find the induction and reduction variables.
00277     vectorizeLoop();
00278     // Register the new loop and update the analysis passes.
00279     updateAnalysis();
00280   }
00281 
00282   // Return true if any runtime check is added.
00283   bool IsSafetyChecksAdded() {
00284     return AddedSafetyChecks;
00285   }
00286 
00287   virtual ~InnerLoopVectorizer() {}
00288 
00289 protected:
00290   /// A small list of PHINodes.
00291   typedef SmallVector<PHINode*, 4> PhiVector;
00292   /// When we unroll loops we have multiple vector values for each scalar.
00293   /// This data structure holds the unrolled and vectorized values that
00294   /// originated from one scalar instruction.
00295   typedef SmallVector<Value*, 2> VectorParts;
00296 
00297   // When we if-convert we need to create edge masks. We have to cache values
00298   // so that we don't end up with exponential recursion/IR.
00299   typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>,
00300                    VectorParts> EdgeMaskCache;
00301 
00302   /// \brief Add checks for strides that were assumed to be 1.
00303   ///
00304   /// Returns the first and the last check instructions as the pair
00305   /// (first, last).
00306   std::pair<Instruction *, Instruction *> addStrideCheck(Instruction *Loc);
00307 
00308   /// Create an empty loop, based on the loop ranges of the old loop.
00309   void createEmptyLoop();
00310   /// Copy and widen the instructions from the old loop.
00311   virtual void vectorizeLoop();
00312 
00313   /// \brief The Loop exit block may have single value PHI nodes where the
00314   /// incoming value is 'Undef'. While vectorizing we only handled real values
00315   /// that were defined inside the loop. Here we fix the 'undef case'.
00316   /// See PR14725.
00317   void fixLCSSAPHIs();
00318 
00319   /// A helper function that computes the predicate of the block BB, assuming
00320   /// that the header block of the loop is set to True. It returns the *entry*
00321   /// mask for the block BB.
00322   VectorParts createBlockInMask(BasicBlock *BB);
00323   /// A helper function that computes the predicate of the edge between SRC
00324   /// and DST.
00325   VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
00326 
00327   /// A helper function to vectorize a single BB within the innermost loop.
00328   void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);
00329 
00330   /// Vectorize a single PHINode in a block. This method handles the induction
00331   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
00332   /// arbitrary length vectors.
00333   void widenPHIInstruction(Instruction *PN, VectorParts &Entry,
00334                            unsigned UF, unsigned VF, PhiVector *PV);
00335 
00336   /// Insert the new loop to the loop hierarchy and pass manager
00337   /// and update the analysis passes.
00338   void updateAnalysis();
00339 
00340   /// This instruction is un-vectorizable. Implement it as a sequence
00341   /// of scalars. If \p IfPredicateStore is true we need to 'hide' each
00342   /// scalarized instruction behind an if block predicated on the control
00343   /// dependence of the instruction.
00344   virtual void scalarizeInstruction(Instruction *Instr,
00345                                     bool IfPredicateStore=false);
00346 
00347   /// Vectorize Load and Store instructions.
00348   virtual void vectorizeMemoryInstruction(Instruction *Instr);
00349 
00350   /// Create a broadcast instruction. This method generates a broadcast
00351   /// instruction (shuffle) for loop invariant values and for the induction
00352   /// value. If this is the induction variable then we extend it to N, N+1, ...
00353   /// this is needed because each iteration in the loop corresponds to a SIMD
00354   /// element.
00355   virtual Value *getBroadcastInstrs(Value *V);
00356 
00357   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
00358   /// to each vector element of Val. The sequence starts at StartIdx.
00359   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step);
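  // As a small sketch of the intent: with VF = 4, StartIdx = 0 and Step = 1,
  // applying getStepVector to a broadcast of the induction value 'i' yields
  // the vector <i, i+1, i+2, i+3>, one lane per scalar iteration.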
00360 
00361   /// When we go over instructions in the basic block we rely on previous
00362   /// values within the current basic block or on loop invariant values.
00363   /// When we widen (vectorize) values we place them in the map. If the values
00364   /// are not within the map, they have to be loop invariant, so we simply
00365   /// broadcast them into a vector.
00366   VectorParts &getVectorValue(Value *V);
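  // Hypothetical usage sketch: the first call to getVectorValue(V) for a
  // loop-invariant value V broadcasts it and caches the resulting UF parts in
  // WidenMap (see the ValueMap helper below); later calls return the cached
  // parts instead of re-broadcasting.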
00367 
00368   /// Try to vectorize the interleaved access group that \p Instr belongs to.
00369   void vectorizeInterleaveGroup(Instruction *Instr);
00370 
00371   /// Generate a shuffle sequence that will reverse the vector Vec.
00372   virtual Value *reverseVector(Value *Vec);
00373 
00374   /// This is a helper class that holds the vectorizer state. It maps scalar
00375   /// instructions to vector instructions. When the code is 'unrolled' then
00376   /// a single scalar value is mapped to multiple vector parts. The parts
00377   /// are stored in the VectorParts type.
00378   struct ValueMap {
00379     /// C'tor.  UnrollFactor controls the number of vectors ('parts') that
00380     /// are mapped.
00381     ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
00382 
00383     /// \return True if 'Key' is saved in the Value Map.
00384     bool has(Value *Key) const { return MapStorage.count(Key); }
00385 
00386     /// Initializes a new entry in the map. Sets all of the vector parts to the
00387     /// same value in 'Val'.
00388     /// \return A reference to a vector with splat values.
00389     VectorParts &splat(Value *Key, Value *Val) {
00390       VectorParts &Entry = MapStorage[Key];
00391       Entry.assign(UF, Val);
00392       return Entry;
00393     }
00394 
00395     ///\return A reference to the value that is stored at 'Key'.
00396     VectorParts &get(Value *Key) {
00397       VectorParts &Entry = MapStorage[Key];
00398       if (Entry.empty())
00399         Entry.resize(UF);
00400       assert(Entry.size() == UF);
00401       return Entry;
00402     }
00403 
00404   private:
00405     /// The unroll factor. Each entry in the map stores this number of vector
00406     /// elements.
00407     unsigned UF;
00408 
00409     /// Map storage. We use std::map and not DenseMap because insertions to a
00410     /// dense map invalidate its iterators.
00411     std::map<Value *, VectorParts> MapStorage;
00412   };
00413 
00414   /// The original loop.
00415   Loop *OrigLoop;
00416   /// Scev analysis to use.
00417   ScalarEvolution *SE;
00418   /// Loop Info.
00419   LoopInfo *LI;
00420   /// Dominator Tree.
00421   DominatorTree *DT;
00422   /// Alias Analysis.
00423   AliasAnalysis *AA;
00424   /// Target Library Info.
00425   const TargetLibraryInfo *TLI;
00426   /// Target Transform Info.
00427   const TargetTransformInfo *TTI;
00428 
00429   /// The vectorization SIMD factor to use. Each vector will have this many
00430   /// vector elements.
00431   unsigned VF;
00432 
00433 protected:
00434   /// The vectorization unroll factor to use. Each scalar is vectorized to this
00435   /// many different vector instructions.
00436   unsigned UF;
00437 
00438   /// The builder that we use
00439   IRBuilder<> Builder;
00440 
00441   // --- Vectorization state ---
00442 
00443   /// The vector-loop preheader.
00444   BasicBlock *LoopVectorPreHeader;
00445   /// The scalar-loop preheader.
00446   BasicBlock *LoopScalarPreHeader;
00447   /// Middle Block between the vector and the scalar.
00448   BasicBlock *LoopMiddleBlock;
00449   /// The ExitBlock of the scalar loop.
00450   BasicBlock *LoopExitBlock;
00451   /// The vector loop body.
00452   SmallVector<BasicBlock *, 4> LoopVectorBody;
00453   /// The scalar loop body.
00454   BasicBlock *LoopScalarBody;
00455   /// A list of all bypass blocks. The first block is the entry of the loop.
00456   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
00457 
00458   /// The new Induction variable which was added to the new block.
00459   PHINode *Induction;
00460   /// The induction variable of the old basic block.
00461   PHINode *OldInduction;
00462   /// Holds the extended (to the widest induction type) start index.
00463   Value *ExtendedIdx;
00464   /// Maps scalars to widened vectors.
00465   ValueMap WidenMap;
00466   EdgeMaskCache MaskCache;
00467 
00468   LoopVectorizationLegality *Legal;
00469 
00470   // Record whether runtime check is added.
00471   bool AddedSafetyChecks;
00472 };
00473 
00474 class InnerLoopUnroller : public InnerLoopVectorizer {
00475 public:
00476   InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
00477                     DominatorTree *DT, const TargetLibraryInfo *TLI,
00478                     const TargetTransformInfo *TTI, unsigned UnrollFactor)
00479       : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
00480 
00481 private:
00482   void scalarizeInstruction(Instruction *Instr,
00483                             bool IfPredicateStore = false) override;
00484   void vectorizeMemoryInstruction(Instruction *Instr) override;
00485   Value *getBroadcastInstrs(Value *V) override;
00486   Value *getStepVector(Value *Val, int StartIdx, Value *Step) override;
00487   Value *reverseVector(Value *Vec) override;
00488 };
00489 
00490 /// \brief Look for a meaningful debug location on the instruction or its
00491 /// operands.
00492 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
00493   if (!I)
00494     return I;
00495 
00496   DebugLoc Empty;
00497   if (I->getDebugLoc() != Empty)
00498     return I;
00499 
00500   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
00501     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
00502       if (OpInst->getDebugLoc() != Empty)
00503         return OpInst;
00504   }
00505 
00506   return I;
00507 }
00508 
00509 /// \brief Set the debug location in the builder using the debug location in the
00510 /// instruction.
00511 static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
00512   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr))
00513     B.SetCurrentDebugLocation(Inst->getDebugLoc());
00514   else
00515     B.SetCurrentDebugLocation(DebugLoc());
00516 }
00517 
00518 #ifndef NDEBUG
00519 /// \return string containing a file name and a line # for the given loop.
00520 static std::string getDebugLocString(const Loop *L) {
00521   std::string Result;
00522   if (L) {
00523     raw_string_ostream OS(Result);
00524     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
00525       LoopDbgLoc.print(OS);
00526     else
00527       // Just print the module name.
00528       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
00529     OS.flush();
00530   }
00531   return Result;
00532 }
00533 #endif
00534 
00535 /// \brief Propagate known metadata from one instruction to another.
00536 static void propagateMetadata(Instruction *To, const Instruction *From) {
00537   SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
00538   From->getAllMetadataOtherThanDebugLoc(Metadata);
00539 
00540   for (auto M : Metadata) {
00541     unsigned Kind = M.first;
00542 
00543     // These are safe to transfer (this is safe for TBAA, even when we
00544     // if-convert, because should that metadata have had a control dependency
00545     // on the condition, and thus actually aliased with some other
00546     // non-speculated memory access when the condition was false, this would be
00547     // caught by the runtime overlap checks).
00548     if (Kind != LLVMContext::MD_tbaa &&
00549         Kind != LLVMContext::MD_alias_scope &&
00550         Kind != LLVMContext::MD_noalias &&
00551         Kind != LLVMContext::MD_fpmath)
00552       continue;
00553 
00554     To->setMetadata(Kind, M.second);
00555   }
00556 }
00557 
00558 /// \brief Propagate known metadata from one instruction to a vector of others.
00559 static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *From) {
00560   for (Value *V : To)
00561     if (Instruction *I = dyn_cast<Instruction>(V))
00562       propagateMetadata(I, From);
00563 }
00564 
00565 /// \brief The group of interleaved loads/stores sharing the same stride and
00566 /// close to each other.
00567 ///
00568 /// Each member in this group has an index starting from 0, and the largest
00569 /// index should be less than the interleave factor, which is equal to the
00570 /// absolute value of the access's stride.
00571 ///
00572 /// E.g. An interleaved load group of factor 4:
00573 ///        for (unsigned i = 0; i < 1024; i+=4) {
00574 ///          a = A[i];                           // Member of index 0
00575 ///          b = A[i+1];                         // Member of index 1
00576 ///          d = A[i+3];                         // Member of index 3
00577 ///          ...
00578 ///        }
00579 ///
00580 ///      An interleaved store group of factor 4:
00581 ///        for (unsigned i = 0; i < 1024; i+=4) {
00582 ///          ...
00583 ///          A[i]   = a;                         // Member of index 0
00584 ///          A[i+1] = b;                         // Member of index 1
00585 ///          A[i+2] = c;                         // Member of index 2
00586 ///          A[i+3] = d;                         // Member of index 3
00587 ///        }
00588 ///
00589 /// Note: the interleaved load group could have gaps (missing members), but
00590 /// the interleaved store group doesn't allow gaps.
00591 class InterleaveGroup {
00592 public:
00593   InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
00594       : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) {
00595     assert(Align && "The alignment should be non-zero");
00596 
00597     Factor = std::abs(Stride);
00598     assert(Factor > 1 && "Invalid interleave factor");
00599 
00600     Reverse = Stride < 0;
00601     Members[0] = Instr;
00602   }
00603 
00604   bool isReverse() const { return Reverse; }
00605   unsigned getFactor() const { return Factor; }
00606   unsigned getAlignment() const { return Align; }
00607   unsigned getNumMembers() const { return Members.size(); }
00608 
00609   /// \brief Try to insert a new member \p Instr with index \p Index and
00610   /// alignment \p NewAlign. The index is relative to the leader and it could be
00611   /// negative if it is the new leader.
00612   ///
00613   /// \returns false if the instruction doesn't belong to the group.
00614   bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
00615     assert(NewAlign && "The new member's alignment should be non-zero");
00616 
00617     int Key = Index + SmallestKey;
00618 
00619     // Skip if there is already a member with the same index.
00620     if (Members.count(Key))
00621       return false;
00622 
00623     if (Key > LargestKey) {
00624       // The largest index is always less than the interleave factor.
00625       if (Index >= static_cast<int>(Factor))
00626         return false;
00627 
00628       LargestKey = Key;
00629     } else if (Key < SmallestKey) {
00630       // The index range (LargestKey - Key) must stay below the interleave factor.
00631       if (LargestKey - Key >= static_cast<int>(Factor))
00632         return false;
00633 
00634       SmallestKey = Key;
00635     }
00636 
00637     // It's always safe to select the minimum alignment.
00638     Align = std::min(Align, NewAlign);
00639     Members[Key] = Instr;
00640     return true;
00641   }
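  // A small worked example of the key bookkeeping above: starting from a
  // leader at index 0, inserting a member with Index = -1 records Key = -1 as
  // the new SmallestKey, so getIndex() later reports that member as index 0
  // and the original leader as index 1.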
00642 
00643   /// \brief Get the member with the given index \p Index
00644   ///
00645   /// \returns nullptr if the group contains no such member.
00646   Instruction *getMember(unsigned Index) const {
00647     int Key = SmallestKey + Index;
00648     if (!Members.count(Key))
00649       return nullptr;
00650 
00651     return Members.find(Key)->second;
00652   }
00653 
00654   /// \brief Get the index for the given member. Unlike the key in the member
00655   /// map, the index starts from 0.
00656   unsigned getIndex(Instruction *Instr) const {
00657     for (auto I : Members)
00658       if (I.second == Instr)
00659         return I.first - SmallestKey;
00660 
00661     llvm_unreachable("InterleaveGroup contains no such member");
00662   }
00663 
00664   Instruction *getInsertPos() const { return InsertPos; }
00665   void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
00666 
00667 private:
00668   unsigned Factor; // Interleave Factor.
00669   bool Reverse;
00670   unsigned Align;
00671   DenseMap<int, Instruction *> Members;
00672   int SmallestKey;
00673   int LargestKey;
00674 
00675   // To avoid breaking dependences, vectorized instructions of an interleave
00676   // group should be inserted at either the first load or the last store in
00677   // program order.
00678   //
00679   // E.g. %even = load i32             // Insert Position
00680   //      %add = add i32 %even         // Use of %even
00681   //      %odd = load i32
00682   //
00683   //      store i32 %even
00684   //      %odd = add i32               // Def of %odd
00685   //      store i32 %odd               // Insert Position
00686   Instruction *InsertPos;
00687 };
00688 
00689 /// \brief Drive the analysis of interleaved memory accesses in the loop.
00690 ///
00691 /// Use this class to analyze interleaved accesses only when we can vectorize
00692 /// a loop. Otherwise it's meaningless to do analysis as the vectorization
00693 /// on interleaved accesses is unsafe.
00694 ///
00695 /// The analysis collects interleave groups and records the relationships
00696 /// between the member and the group in a map.
00697 class InterleavedAccessInfo {
00698 public:
00699   InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT)
00700       : SE(SE), TheLoop(L), DT(DT) {}
00701 
00702   ~InterleavedAccessInfo() {
00703     SmallSet<InterleaveGroup *, 4> DelSet;
00704     // Avoid releasing a pointer twice.
00705     for (auto &I : InterleaveGroupMap)
00706       DelSet.insert(I.second);
00707     for (auto *Ptr : DelSet)
00708       delete Ptr;
00709   }
00710 
00711   /// \brief Analyze the interleaved accesses and collect them in interleave
00712   /// groups. Substitute symbolic strides using \p Strides.
00713   void analyzeInterleaving(const ValueToValueMap &Strides);
00714 
00715   /// \brief Check if \p Instr belongs to any interleave group.
00716   bool isInterleaved(Instruction *Instr) const {
00717     return InterleaveGroupMap.count(Instr);
00718   }
00719 
00720   /// \brief Get the interleave group that \p Instr belongs to.
00721   ///
00722   /// \returns nullptr if it doesn't belong to any group.
00723   InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
00724     if (InterleaveGroupMap.count(Instr))
00725       return InterleaveGroupMap.find(Instr)->second;
00726     return nullptr;
00727   }
00728 
00729 private:
00730   ScalarEvolution *SE;
00731   Loop *TheLoop;
00732   DominatorTree *DT;
00733 
00734   /// Holds the relationships between the members and the interleave group.
00735   DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
00736 
00737   /// \brief The descriptor for a strided memory access.
00738   struct StrideDescriptor {
00739     StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size,
00740                      unsigned Align)
00741         : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
00742 
00743     StrideDescriptor() : Stride(0), Scev(nullptr), Size(0), Align(0) {}
00744 
00745     int Stride; // The access's stride. It is negative for a reverse access.
00746     const SCEV *Scev; // The scalar expression of this access
00747     unsigned Size;    // The size of the memory object.
00748     unsigned Align;   // The alignment of this access.
00749   };
00750 
00751   /// \brief Create a new interleave group with the given instruction \p Instr,
00752   /// stride \p Stride and alignment \p Align.
00753   ///
00754   /// \returns the newly created interleave group.
00755   InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,
00756                                          unsigned Align) {
00757     assert(!InterleaveGroupMap.count(Instr) &&
00758            "Already in an interleaved access group");
00759     InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align);
00760     return InterleaveGroupMap[Instr];
00761   }
00762 
00763   /// \brief Release the group and remove all the relationships.
00764   void releaseGroup(InterleaveGroup *Group) {
00765     for (unsigned i = 0; i < Group->getFactor(); i++)
00766       if (Instruction *Member = Group->getMember(i))
00767         InterleaveGroupMap.erase(Member);
00768 
00769     delete Group;
00770   }
00771 
00772   /// \brief Collect all the accesses with a constant stride in program order.
00773   void collectConstStridedAccesses(
00774       MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
00775       const ValueToValueMap &Strides);
00776 };
00777 
00778 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
00779 /// to what vectorization factor.
00780 /// This class does not look at the profitability of vectorization, only the
00781 /// legality. This class has two main kinds of checks:
00782 /// * Memory checks - The code in canVectorizeMemory checks if vectorization
00783 ///   will change the order of memory accesses in a way that will change the
00784 ///   correctness of the program.
00785 /// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
00786 ///   checks for a number of different conditions, such as the availability of a
00787 ///   single induction variable, and that all types are supported and
00788 ///   vectorizable, etc. This code reflects the capabilities of
00789 ///   InnerLoopVectorizer. This class is also used by InnerLoopVectorizer for
00790 ///   identifying the induction variable and the different reduction variables.
00791 class LoopVectorizationLegality {
00792 public:
00793   LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DominatorTree *DT,
00794                             TargetLibraryInfo *TLI, AliasAnalysis *AA,
00795                             Function *F, const TargetTransformInfo *TTI,
00796                             LoopAccessAnalysis *LAA)
00797       : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F),
00798         TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(SE, L, DT),
00799         Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {}
00800 
00801   /// This enum represents the kinds of inductions that we support.
00802   enum InductionKind {
00803     IK_NoInduction,  ///< Not an induction variable.
00804     IK_IntInduction, ///< Integer induction variable. Step = C.
00805     IK_PtrInduction  ///< Pointer induction var. Step = C / sizeof(elem).
00806   };
00807 
00808   /// A struct for saving information about induction variables.
00809   struct InductionInfo {
00810     InductionInfo(Value *Start, InductionKind K, ConstantInt *Step)
00811         : StartValue(Start), IK(K), StepValue(Step) {
00812       assert(IK != IK_NoInduction && "Not an induction");
00813       assert(StartValue && "StartValue is null");
00814       assert(StepValue && !StepValue->isZero() && "StepValue is zero");
00815       assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) &&
00816              "StartValue is not a pointer for pointer induction");
00817       assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) &&
00818              "StartValue is not an integer for integer induction");
00819       assert(StepValue->getType()->isIntegerTy() &&
00820              "StepValue is not an integer");
00821     }
00822     InductionInfo()
00823         : StartValue(nullptr), IK(IK_NoInduction), StepValue(nullptr) {}
00824 
00825     /// Get the consecutive direction. Returns:
00826     ///   0 - unknown or non-consecutive.
00827     ///   1 - consecutive and increasing.
00828     ///  -1 - consecutive and decreasing.
00829     int getConsecutiveDirection() const {
00830       if (StepValue && (StepValue->isOne() || StepValue->isMinusOne()))
00831         return StepValue->getSExtValue();
00832       return 0;
00833     }
00834 
00835     /// Compute the transformed value of Index at offset StartValue using step
00836     /// StepValue.
00837     /// For integer induction, returns StartValue + Index * StepValue.
00838     /// For pointer induction, returns StartValue[Index * StepValue].
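    /// For example, an integer induction with StartValue = 100 and
    /// StepValue = 2 transforms Index = 3 into 100 + 3 * 2 = 106.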
00839     /// FIXME: The newly created binary instructions should contain nsw/nuw
00840     /// flags, which can be found from the original scalar operations.
00841     Value *transform(IRBuilder<> &B, Value *Index) const {
00842       switch (IK) {
00843       case IK_IntInduction:
00844         assert(Index->getType() == StartValue->getType() &&
00845                "Index type does not match StartValue type");
00846         if (StepValue->isMinusOne())
00847           return B.CreateSub(StartValue, Index);
00848         if (!StepValue->isOne())
00849           Index = B.CreateMul(Index, StepValue);
00850         return B.CreateAdd(StartValue, Index);
00851 
00852       case IK_PtrInduction:
00853         if (StepValue->isMinusOne())
00854           Index = B.CreateNeg(Index);
00855         else if (!StepValue->isOne())
00856           Index = B.CreateMul(Index, StepValue);
00857         return B.CreateGEP(nullptr, StartValue, Index);
00858 
00859       case IK_NoInduction:
00860         return nullptr;
00861       }
00862       llvm_unreachable("invalid enum");
00863     }
00864 
00865     /// Start value.
00866     TrackingVH<Value> StartValue;
00867     /// Induction kind.
00868     InductionKind IK;
00869     /// Step value.
00870     ConstantInt *StepValue;
00871   };
00872 
00873   /// ReductionList contains the reduction descriptors for all
00874   /// of the reductions that were found in the loop.
00875   typedef DenseMap<PHINode *, RecurrenceDescriptor> ReductionList;
00876 
00877   /// InductionList saves induction variables and maps them to the
00878   /// induction descriptor.
00879   typedef MapVector<PHINode*, InductionInfo> InductionList;
00880 
00881   /// Returns true if it is legal to vectorize this loop.
00882   /// This does not mean that it is profitable to vectorize this
00883   /// loop, only that it is legal to do so.
00884   bool canVectorize();
00885 
00886   /// Returns the Induction variable.
00887   PHINode *getInduction() { return Induction; }
00888 
00889   /// Returns the reduction variables found in the loop.
00890   ReductionList *getReductionVars() { return &Reductions; }
00891 
00892   /// Returns the induction variables found in the loop.
00893   InductionList *getInductionVars() { return &Inductions; }
00894 
00895   /// Returns the widest induction type.
00896   Type *getWidestInductionType() { return WidestIndTy; }
00897 
00898   /// Returns True if V is an induction variable in this loop.
00899   bool isInductionVariable(const Value *V);
00900 
00901   /// Return true if the block BB needs to be predicated in order for the loop
00902   /// to be vectorized.
00903   bool blockNeedsPredication(BasicBlock *BB);
00904 
00905   /// Check if this pointer is consecutive when vectorizing. This happens
00906   /// when the last index of the GEP is the induction variable, or when the
00907   /// pointer itself is an induction variable.
00908   /// This check allows us to vectorize A[idx] into a wide load/store.
00909   /// Returns:
00910   /// 0 - Stride is unknown or non-consecutive.
00911   /// 1 - Address is consecutive.
00912   /// -1 - Address is consecutive, and decreasing.
00913   int isConsecutivePtr(Value *Ptr);
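  // Informally: A[i] with the canonical induction variable i is reported as 1,
  // a reversed access such as A[N - i] as -1, and a non-unit stride such as
  // A[3 * i] as 0 (unknown / non-consecutive).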
00914 
00915   /// Returns true if the value V is uniform within the loop.
00916   bool isUniform(Value *V);
00917 
00918   /// Returns true if this instruction will remain scalar after vectorization.
00919   bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }
00920 
00921   /// Returns the information that we collected about runtime memory check.
00922   const LoopAccessInfo::RuntimePointerCheck *getRuntimePointerCheck() const {
00923     return LAI->getRuntimePointerCheck();
00924   }
00925 
00926   const LoopAccessInfo *getLAI() const {
00927     return LAI;
00928   }
00929 
00930   /// \brief Check if \p Instr belongs to any interleaved access group.
00931   bool isAccessInterleaved(Instruction *Instr) {
00932     return InterleaveInfo.isInterleaved(Instr);
00933   }
00934 
00935   /// \brief Get the interleaved access group that \p Instr belongs to.
00936   const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
00937     return InterleaveInfo.getInterleaveGroup(Instr);
00938   }
00939 
00940   unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
00941 
00942   bool hasStride(Value *V) { return StrideSet.count(V); }
00943   bool mustCheckStrides() { return !StrideSet.empty(); }
00944   SmallPtrSet<Value *, 8>::iterator strides_begin() {
00945     return StrideSet.begin();
00946   }
00947   SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); }
00948 
00949   /// Returns true if the target machine supports masked store operation
00950   /// for the given \p DataType and kind of access to \p Ptr.
00951   bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
00952     return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr));
00953   }
00954   /// Returns true if the target machine supports masked load operation
00955   /// for the given \p DataType and kind of access to \p Ptr.
00956   bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
00957     return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr));
00958   }
00959   /// Returns true if vector representation of the instruction \p I
00960   /// requires mask.
00961   bool isMaskRequired(const Instruction* I) {
00962     return (MaskedOp.count(I) != 0);
00963   }
00964   unsigned getNumStores() const {
00965     return LAI->getNumStores();
00966   }
00967   unsigned getNumLoads() const {
00968     return LAI->getNumLoads();
00969   }
00970   unsigned getNumPredStores() const {
00971     return NumPredStores;
00972   }
00973 private:
00974   /// Check if a single basic block loop is vectorizable.
00975   /// At this point we know that this is a loop with a constant trip count
00976   /// and we only need to check individual instructions.
00977   bool canVectorizeInstrs();
00978 
00979   /// When we vectorize loops we may change the order in which
00980   /// we read and write from memory. This method checks if it is
00981   /// legal to vectorize the code, considering only memory constraints.
00982   /// Returns true if the loop is vectorizable.
00983   bool canVectorizeMemory();
00984 
00985   /// Return true if we can vectorize this loop using the IF-conversion
00986   /// transformation.
00987   bool canVectorizeWithIfConvert();
00988 
00989   /// Collect the variables that need to stay uniform after vectorization.
00990   void collectLoopUniforms();
00991 
00992   /// Return true if all of the instructions in the block can be speculatively
00993   /// executed. \p SafePtrs is a list of addresses that are known to be legal
00994   /// and we know that we can read from them without segfault.
00995   bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
00996 
00997   /// Returns the induction kind of Phi and records the step. This function may
00998   /// return NoInduction if the PHI is not an induction variable.
00999   InductionKind isInductionVariable(PHINode *Phi, ConstantInt *&StepValue);
01000 
01001   /// \brief Collect memory accesses with loop invariant strides.
01002   ///
01003   /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop
01004   /// invariant.
01005   void collectStridedAccess(Value *LoadOrStoreInst);
01006 
01007   /// Report an analysis message to assist the user in diagnosing loops that are
01008   /// not vectorized.  These are handled as LoopAccessReport rather than
01009   /// VectorizationReport because the << operator of VectorizationReport returns
01010   /// LoopAccessReport.
01011   void emitAnalysis(const LoopAccessReport &Message) {
01012     LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME);
01013   }
01014 
01015   unsigned NumPredStores;
01016 
01017   /// The loop that we evaluate.
01018   Loop *TheLoop;
01019   /// Scev analysis.
01020   ScalarEvolution *SE;
01021   /// Target Library Info.
01022   TargetLibraryInfo *TLI;
01023   /// Parent function
01024   Function *TheFunction;
01025   /// Target Transform Info
01026   const TargetTransformInfo *TTI;
01027   /// Dominator Tree.
01028   DominatorTree *DT;
01029   // LoopAccess analysis.
01030   LoopAccessAnalysis *LAA;
01031   // And the loop-accesses info corresponding to this loop.  This pointer is
01032   // null until canVectorizeMemory sets it up.
01033   const LoopAccessInfo *LAI;
01034 
01035   /// The interleave access information contains groups of interleaved accesses
01036   /// with the same stride and close to each other.
01037   InterleavedAccessInfo InterleaveInfo;
01038 
01039   //  ---  vectorization state --- //
01040 
01041   /// Holds the integer induction variable. This is the counter of the
01042   /// loop.
01043   PHINode *Induction;
01044   /// Holds the reduction variables.
01045   ReductionList Reductions;
01046   /// Holds all of the induction variables that we found in the loop.
01047   /// Notice that inductions don't need to start at zero and that induction
01048   /// variables can be pointers.
01049   InductionList Inductions;
01050   /// Holds the widest induction type encountered.
01051   Type *WidestIndTy;
01052 
01053   /// Allowed outside users. This holds the reduction
01054   /// vars which can be accessed from outside the loop.
01055   SmallPtrSet<Value*, 4> AllowedExit;
01056   /// This set holds the variables which are known to be uniform after
01057   /// vectorization.
01058   SmallPtrSet<Instruction*, 4> Uniforms;
01059 
01060   /// Can we assume the absence of NaNs.
01061   bool HasFunNoNaNAttr;
01062 
01063   ValueToValueMap Strides;
01064   SmallPtrSet<Value *, 8> StrideSet;
01065 
01066   /// While vectorizing these instructions we have to generate a
01067   /// call to the appropriate masked intrinsic
01068   SmallPtrSet<const Instruction*, 8> MaskedOp;
01069 };
01070 
01071 /// LoopVectorizationCostModel - estimates the expected speedups due to
01072 /// vectorization.
01073 /// In many cases vectorization is not profitable. This can happen because of
01074 /// a number of reasons. In this class we mainly attempt to predict the
01075 /// expected speedup/slowdowns due to the supported instruction set. We use the
01076 /// TargetTransformInfo to query the different backends for the cost of
01077 /// different operations.
01078 class LoopVectorizationCostModel {
01079 public:
01080   LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
01081                              LoopVectorizationLegality *Legal,
01082                              const TargetTransformInfo &TTI,
01083                              const TargetLibraryInfo *TLI, AssumptionCache *AC,
01084                              const Function *F, const LoopVectorizeHints *Hints)
01085       : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI),
01086         TheFunction(F), Hints(Hints) {
01087     CodeMetrics::collectEphemeralValues(L, AC, EphValues);
01088   }
01089 
01090   /// Information about vectorization costs
01091   struct VectorizationFactor {
01092     unsigned Width; // Vector width with best cost
01093     unsigned Cost; // Cost of the loop with that width
01094   };
01095   /// \return The most profitable vectorization factor and the cost of that VF.
01096   /// This method checks every power of two up to VF. If UserVF is not ZERO
01097   /// then this vectorization factor will be selected if vectorization is
01098   /// possible.
01099   VectorizationFactor selectVectorizationFactor(bool OptForSize);
01100 
01101   /// \return The size (in bits) of the widest type in the code that
01102   /// needs to be vectorized. We ignore values that remain scalar such as
01103   /// 64 bit loop indices.
01104   unsigned getWidestType();
01105 
01106   /// \return The most profitable unroll factor.
01107   /// If UserUF is non-zero then this method finds the best unroll-factor
01108   /// based on register pressure and other parameters.
01109   /// VF and LoopCost are the selected vectorization factor and the cost of the
01110   /// selected VF.
01111   unsigned selectUnrollFactor(bool OptForSize, unsigned VF, unsigned LoopCost);
01112 
01113   /// \brief A struct that represents some properties of the register usage
01114   /// of a loop.
01115   struct RegisterUsage {
01116     /// Holds the number of loop invariant values that are used in the loop.
01117     unsigned LoopInvariantRegs;
01118     /// Holds the maximum number of concurrent live intervals in the loop.
01119     unsigned MaxLocalUsers;
01120     /// Holds the number of instructions in the loop.
01121     unsigned NumInstructions;
01122   };
01123 
01124   /// \return  information about the register usage of the loop.
01125   RegisterUsage calculateRegisterUsage();
01126 
01127 private:
01128   /// Returns the expected execution cost. The unit of the cost does
01129   /// not matter because we use the 'cost' units to compare different
01130   /// vector widths. The cost that is returned is *not* normalized by
01131   /// the factor width.
01132   unsigned expectedCost(unsigned VF);
01133 
01134   /// Returns the execution time cost of an instruction for a given vector
01135   /// width. Vector width of one means scalar.
01136   unsigned getInstructionCost(Instruction *I, unsigned VF);
01137 
01138   /// Returns whether the instruction is a load or store and will be emitted
01139   /// as a vector operation.
01140   bool isConsecutiveLoadOrStore(Instruction *I);
01141 
01142   /// Report an analysis message to assist the user in diagnosing loops that are
01143   /// not vectorized.  These are handled as LoopAccessReport rather than
01144   /// VectorizationReport because the << operator of VectorizationReport returns
01145   /// LoopAccessReport.
01146   void emitAnalysis(const LoopAccessReport &Message) {
01147     LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME);
01148   }
01149 
01150   /// Values used only by @llvm.assume calls.
01151   SmallPtrSet<const Value *, 32> EphValues;
01152 
01153   /// The loop that we evaluate.
01154   Loop *TheLoop;
01155   /// Scev analysis.
01156   ScalarEvolution *SE;
01157   /// Loop Info analysis.
01158   LoopInfo *LI;
01159   /// Vectorization legality.
01160   LoopVectorizationLegality *Legal;
01161   /// Vector target information.
01162   const TargetTransformInfo &TTI;
01163   /// Target Library Info.
01164   const TargetLibraryInfo *TLI;
01165   const Function *TheFunction;
01166   // Loop Vectorize Hint.
01167   const LoopVectorizeHints *Hints;
01168 };
01169 
01170 /// Utility class for getting and setting loop vectorizer hints in the form
01171 /// of loop metadata.
01172 /// This class keeps a number of loop annotations locally (as member variables)
01173 /// and can, upon request, write them back as metadata on the loop. It will
01174 /// initially scan the loop for existing metadata, and will update the local
01175 /// values based on information in the loop.
01176 /// We cannot write all values to metadata, as the mere presence of some info,
01177 /// for example 'force', means a decision has been made. So, we need to be
01178 /// careful NOT to add them if the user hasn't specifically asked so.
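///
/// As a hedged sketch of the metadata this class reads and writes (the hint
/// names come from the members below), a loop requesting a vector width of 4
/// might carry:
///   br ... !llvm.loop !0
///   !0 = !{!0, !1}
///   !1 = !{!"llvm.loop.vectorize.width", i32 4}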
01179 class LoopVectorizeHints {
01180   enum HintKind {
01181     HK_WIDTH,
01182     HK_UNROLL,
01183     HK_FORCE
01184   };
01185 
01186   /// Hint - associates name and validation with the hint value.
01187   struct Hint {
01188     const char * Name;
01189     unsigned Value; // This may have to change for non-numeric values.
01190     HintKind Kind;
01191 
01192     Hint(const char * Name, unsigned Value, HintKind Kind)
01193       : Name(Name), Value(Value), Kind(Kind) { }
01194 
01195     bool validate(unsigned Val) {
01196       switch (Kind) {
01197       case HK_WIDTH:
01198         return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
01199       case HK_UNROLL:
01200         return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
01201       case HK_FORCE:
01202         return (Val <= 1);
01203       }
01204       return false;
01205     }
01206   };
01207 
01208   /// Vectorization width.
01209   Hint Width;
01210   /// Vectorization interleave factor.
01211   Hint Interleave;
01212   /// Vectorization forced
01213   Hint Force;
01214 
01215   /// Return the loop metadata prefix.
01216   static StringRef Prefix() { return "llvm.loop."; }
01217 
01218 public:
01219   enum ForceKind {
01220     FK_Undefined = -1, ///< Not selected.
01221     FK_Disabled = 0,   ///< Forcing disabled.
01222     FK_Enabled = 1,    ///< Forcing enabled.
01223   };
01224 
01225   LoopVectorizeHints(const Loop *L, bool DisableInterleaving)
01226       : Width("vectorize.width", VectorizerParams::VectorizationFactor,
01227               HK_WIDTH),
01228         Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
01229         Force("vectorize.enable", FK_Undefined, HK_FORCE),
01230         TheLoop(L) {
01231     // Populate values with existing loop metadata.
01232     getHintsFromMetadata();
01233 
01234     // force-vector-interleave overrides DisableInterleaving.
01235     if (VectorizerParams::isInterleaveForced())
01236       Interleave.Value = VectorizerParams::VectorizationInterleave;
01237 
01238     DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
01239           << "LV: Interleaving disabled by the pass manager\n");
01240   }
01241 
01242   /// Mark the loop L as already vectorized by setting the width to 1.
01243   void setAlreadyVectorized() {
01244     Width.Value = Interleave.Value = 1;
01245     Hint Hints[] = {Width, Interleave};
01246     writeHintsToMetadata(Hints);
01247   }
01248 
01249   /// Dumps all the hint information.
01250   std::string emitRemark() const {
01251     VectorizationReport R;
01252     if (Force.Value == LoopVectorizeHints::FK_Disabled)
01253       R << "vectorization is explicitly disabled";
01254     else {
01255       R << "use -Rpass-analysis=loop-vectorize for more info";
01256       if (Force.Value == LoopVectorizeHints::FK_Enabled) {
01257         R << " (Force=true";
01258         if (Width.Value != 0)
01259           R << ", Vector Width=" << Width.Value;
01260         if (Interleave.Value != 0)
01261           R << ", Interleave Count=" << Interleave.Value;
01262         R << ")";
01263       }
01264     }
01265 
01266     return R.str();
01267   }
01268 
01269   unsigned getWidth() const { return Width.Value; }
01270   unsigned getInterleave() const { return Interleave.Value; }
01271   enum ForceKind getForce() const { return (ForceKind)Force.Value; }
01272 
01273 private:
01274   /// Find hints specified in the loop metadata and update local values.
01275   void getHintsFromMetadata() {
01276     MDNode *LoopID = TheLoop->getLoopID();
01277     if (!LoopID)
01278       return;
01279 
01280     // First operand should refer to the loop id itself.
01281     assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
01282     assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
01283 
01284     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
01285       const MDString *S = nullptr;
01286       SmallVector<Metadata *, 4> Args;
01287 
01288       // The expected hint is either an MDString or an MDNode whose first
01289       // operand is an MDString.
01290       if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
01291         if (!MD || MD->getNumOperands() == 0)
01292           continue;
01293         S = dyn_cast<MDString>(MD->getOperand(0));
01294         for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
01295           Args.push_back(MD->getOperand(i));
01296       } else {
01297         S = dyn_cast<MDString>(LoopID->getOperand(i));
01298         assert(Args.size() == 0 && "too many arguments for MDString");
01299       }
01300 
01301       if (!S)
01302         continue;
01303 
01304       // Check if the hint starts with the loop metadata prefix.
01305       StringRef Name = S->getString();
01306       if (Args.size() == 1)
01307         setHint(Name, Args[0]);
01308     }
01309   }
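  // Illustrative example (not taken from this file): a loop annotated with
  //   #pragma clang loop vectorize_width(4) interleave_count(2)
  // typically carries loop metadata of roughly this shape:
  //   !0 = distinct !{!0, !1, !2}
  //   !1 = !{!"llvm.loop.vectorize.width", i32 4}
  //   !2 = !{!"llvm.loop.interleave.count", i32 2}
  // getHintsFromMetadata() walks the operands of !0 and hands each name/value
  // pair to setHint() below.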
01310 
01311   /// Checks a string hint with one operand and sets the value if valid.
01312   void setHint(StringRef Name, Metadata *Arg) {
01313     if (!Name.startswith(Prefix()))
01314       return;
01315     Name = Name.substr(Prefix().size(), StringRef::npos);
01316 
01317     const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
01318     if (!C) return;
01319     unsigned Val = C->getZExtValue();
01320 
01321     Hint *Hints[] = {&Width, &Interleave, &Force};
01322     for (auto H : Hints) {
01323       if (Name == H->Name) {
01324         if (H->validate(Val))
01325           H->Value = Val;
01326         else
01327           DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
01328         break;
01329       }
01330     }
01331   }
01332 
01333   /// Create a new hint from name / value pair.
01334   MDNode *createHintMetadata(StringRef Name, unsigned V) const {
01335     LLVMContext &Context = TheLoop->getHeader()->getContext();
01336     Metadata *MDs[] = {MDString::get(Context, Name),
01337                        ConstantAsMetadata::get(
01338                            ConstantInt::get(Type::getInt32Ty(Context), V))};
01339     return MDNode::get(Context, MDs);
01340   }
01341 
01342   /// Matches metadata with hint name.
01343   bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
01344     MDString* Name = dyn_cast<MDString>(Node->getOperand(0));
01345     if (!Name)
01346       return false;
01347 
01348     for (auto H : HintTypes)
01349       if (Name->getString().endswith(H.Name))
01350         return true;
01351     return false;
01352   }
01353 
01354   /// Sets current hints into loop metadata, keeping other values intact.
01355   void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
01356     if (HintTypes.size() == 0)
01357       return;
01358 
01359     // Reserve the first element for the LoopID self-reference (see below).
01360     SmallVector<Metadata *, 4> MDs(1);
01361     // If the loop already has metadata, then ignore the existing operands.
01362     MDNode *LoopID = TheLoop->getLoopID();
01363     if (LoopID) {
01364       for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
01365         MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
01366         // If node in update list, ignore old value.
01367         if (!matchesHintMetadataName(Node, HintTypes))
01368           MDs.push_back(Node);
01369       }
01370     }
01371 
01372     // Now, add the missing hints.
01373     for (auto H : HintTypes)
01374       MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
01375 
01376     // Replace current metadata node with new one.
01377     LLVMContext &Context = TheLoop->getHeader()->getContext();
01378     MDNode *NewLoopID = MDNode::get(Context, MDs);
01379     // Set operand 0 to refer to the loop id itself.
01380     NewLoopID->replaceOperandWith(0, NewLoopID);
01381 
01382     TheLoop->setLoopID(NewLoopID);
01383   }
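  // Illustrative sketch: after setAlreadyVectorized() the loop ID ends up
  // looking roughly like
  //   !0 = distinct !{!0, !{!"llvm.loop.vectorize.width", i32 1},
  //                       !{!"llvm.loop.interleave.count", i32 1}}
  // so a later run of the pass treats the loop as already vectorized and
  // leaves it alone.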
01384 
01385   /// The loop these hints belong to.
01386   const Loop *TheLoop;
01387 };
01388 
01389 static void emitMissedWarning(Function *F, Loop *L,
01390                               const LoopVectorizeHints &LH) {
01391   emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F,
01392                                L->getStartLoc(), LH.emitRemark());
01393 
01394   if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
01395     if (LH.getWidth() != 1)
01396       emitLoopVectorizeWarning(
01397           F->getContext(), *F, L->getStartLoc(),
01398           "failed explicitly specified loop vectorization");
01399     else if (LH.getInterleave() != 1)
01400       emitLoopInterleaveWarning(
01401           F->getContext(), *F, L->getStartLoc(),
01402           "failed explicitly specified loop interleaving");
01403   }
01404 }
01405 
01406 static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
01407   if (L.empty())
01408     return V.push_back(&L);
01409 
01410   for (Loop *InnerL : L)
01411     addInnerLoop(*InnerL, V);
01412 }
01413 
01414 /// The LoopVectorize Pass.
01415 struct LoopVectorize : public FunctionPass {
01416   /// Pass identification, replacement for typeid
01417   static char ID;
01418 
01419   explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
01420     : FunctionPass(ID),
01421       DisableUnrolling(NoUnrolling),
01422       AlwaysVectorize(AlwaysVectorize) {
01423     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
01424   }
01425 
01426   ScalarEvolution *SE;
01427   LoopInfo *LI;
01428   TargetTransformInfo *TTI;
01429   DominatorTree *DT;
01430   BlockFrequencyInfo *BFI;
01431   TargetLibraryInfo *TLI;
01432   AliasAnalysis *AA;
01433   AssumptionCache *AC;
01434   LoopAccessAnalysis *LAA;
01435   bool DisableUnrolling;
01436   bool AlwaysVectorize;
01437 
01438   BlockFrequency ColdEntryFreq;
01439 
01440   bool runOnFunction(Function &F) override {
01441     SE = &getAnalysis<ScalarEvolution>();
01442     LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
01443     TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
01444     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
01445     BFI = &getAnalysis<BlockFrequencyInfo>();
01446     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
01447     TLI = TLIP ? &TLIP->getTLI() : nullptr;
01448     AA = &getAnalysis<AliasAnalysis>();
01449     AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
01450     LAA = &getAnalysis<LoopAccessAnalysis>();
01451 
01452     // Compute some weights outside of the loop over the loops. Compute this
01453     // using a BranchProbability to re-use its scaling math.
01454     const BranchProbability ColdProb(1, 5); // 20%
01455     ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
01456 
01457     // If the target claims to have no vector registers don't attempt
01458     // vectorization.
01459     if (!TTI->getNumberOfRegisters(true))
01460       return false;
01461 
01462     // Build up a worklist of inner-loops to vectorize. This is necessary as
01463     // the act of vectorizing or partially unrolling a loop creates new loops
01464     // and can invalidate iterators across the loops.
01465     SmallVector<Loop *, 8> Worklist;
01466 
01467     for (Loop *L : *LI)
01468       addInnerLoop(*L, Worklist);
01469 
01470     LoopsAnalyzed += Worklist.size();
01471 
01472     // Now walk the identified inner loops.
01473     bool Changed = false;
01474     while (!Worklist.empty())
01475       Changed |= processLoop(Worklist.pop_back_val());
01476 
01477     // Return true if any of the loops were modified.
01478     return Changed;
01479   }
01480 
01481   static void AddRuntimeUnrollDisableMetaData(Loop *L) {
01482     SmallVector<Metadata *, 4> MDs;
01483     // Reserve first location for self reference to the LoopID metadata node.
01484     MDs.push_back(nullptr);
01485     bool IsUnrollMetadata = false;
01486     MDNode *LoopID = L->getLoopID();
01487     if (LoopID) {
01488       // First find existing loop unrolling disable metadata.
01489       for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
01490         MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
01491         if (MD) {
01492           const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
01493           IsUnrollMetadata =
01494               S && S->getString().startswith("llvm.loop.unroll.disable");
01495         }
01496         MDs.push_back(LoopID->getOperand(i));
01497       }
01498     }
01499 
01500     if (!IsUnrollMetadata) {
01501       // Add runtime unroll disable metadata.
01502       LLVMContext &Context = L->getHeader()->getContext();
01503       SmallVector<Metadata *, 1> DisableOperands;
01504       DisableOperands.push_back(
01505           MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
01506       MDNode *DisableNode = MDNode::get(Context, DisableOperands);
01507       MDs.push_back(DisableNode);
01508       MDNode *NewLoopID = MDNode::get(Context, MDs);
01509       // Set operand 0 to refer to the loop id itself.
01510       NewLoopID->replaceOperandWith(0, NewLoopID);
01511       L->setLoopID(NewLoopID);
01512     }
01513   }
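  // Illustrative result: if no unroll metadata was present, the loop ID gains
  // an operand of the form
  //   !{!"llvm.loop.unroll.runtime.disable"}
  // so the scalar remainder loop is not runtime-unrolled by a later pass.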
01514 
01515   bool processLoop(Loop *L) {
01516     assert(L->empty() && "Only process inner loops.");
01517 
01518 #ifndef NDEBUG
01519     const std::string DebugLocStr = getDebugLocString(L);
01520 #endif /* NDEBUG */
01521 
01522     DEBUG(dbgs() << "\nLV: Checking a loop in \""
01523                  << L->getHeader()->getParent()->getName() << "\" from "
01524                  << DebugLocStr << "\n");
01525 
01526     LoopVectorizeHints Hints(L, DisableUnrolling);
01527 
01528     DEBUG(dbgs() << "LV: Loop hints:"
01529                  << " force="
01530                  << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
01531                          ? "disabled"
01532                          : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
01533                                 ? "enabled"
01534                                 : "?")) << " width=" << Hints.getWidth()
01535                  << " unroll=" << Hints.getInterleave() << "\n");
01536 
01537     // The function containing the loop.
01538     Function *F = L->getHeader()->getParent();
01539 
01540     // Looking at the diagnostic output is the only way to determine if a loop
01541     // was vectorized (other than looking at the IR or machine code), so it
01542     // is important to generate an optimization remark for each loop. Most of
01543     // these messages are generated by emitOptimizationRemarkAnalysis. The less
01544     // verbose emitOptimizationRemark and emitOptimizationRemarkMissed report,
01545     // respectively, loops that were vectorized and unvectorized loops that may
01546     // benefit from vectorization.
01547 
01548     if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) {
01549       DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
01550       emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
01551                                      L->getStartLoc(), Hints.emitRemark());
01552       return false;
01553     }
01554 
01555     if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) {
01556       DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
01557       emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
01558                                      L->getStartLoc(), Hints.emitRemark());
01559       return false;
01560     }
01561 
01562     if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) {
01563       DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
01564       emitOptimizationRemarkAnalysis(
01565           F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
01566           "loop not vectorized: vector width and interleave count are "
01567           "explicitly set to 1");
01568       return false;
01569     }
01570 
01571     // Check the loop for a trip count threshold:
01572     // do not vectorize loops with a tiny trip count.
01573     const unsigned TC = SE->getSmallConstantTripCount(L);
01574     if (TC > 0u && TC < TinyTripCountVectorThreshold) {
01575       DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
01576                    << "This loop is not worth vectorizing.");
01577       if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
01578         DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
01579       else {
01580         DEBUG(dbgs() << "\n");
01581         emitOptimizationRemarkAnalysis(
01582             F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
01583             "vectorization is not beneficial and is not explicitly forced");
01584         return false;
01585       }
01586     }
01587 
01588     // Check if it is legal to vectorize the loop.
01589     LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA);
01590     if (!LVL.canVectorize()) {
01591       DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
01592       emitMissedWarning(F, L, Hints);
01593       return false;
01594     }
01595 
01596     // Use the cost model.
01597     LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, AC, F, &Hints);
01598 
01599     // Check the function attributes to find out if this function should be
01600     // optimized for size.
01601     bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
01602                       F->hasFnAttribute(Attribute::OptimizeForSize);
01603 
01604     // Compute the weighted frequency of this loop being executed and see if it
01605     // is less than 20% of the function entry baseline frequency. Note that we
01606     // always have a canonical loop here because we think we *can* vectorize.
01607     // FIXME: This is hidden behind a flag due to pervasive problems with
01608     // exactly what block frequency models.
01609     if (LoopVectorizeWithBlockFrequency) {
01610       BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
01611       if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
01612           LoopEntryFreq < ColdEntryFreq)
01613         OptForSize = true;
01614     }
01615 
01616     // Check the function attributes to see if implicit floats are allowed.
01617     // FIXME: This check doesn't seem possibly correct -- what if the loop is
01618     // an integer loop and the vector instructions selected are purely integer
01619     // vector instructions?
01620     if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
01621       DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
01622             " attribute is used.\n");
01623       emitOptimizationRemarkAnalysis(
01624           F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
01625           "loop not vectorized due to NoImplicitFloat attribute");
01626       emitMissedWarning(F, L, Hints);
01627       return false;
01628     }
01629 
01630     // Select the optimal vectorization factor.
01631     const LoopVectorizationCostModel::VectorizationFactor VF =
01632         CM.selectVectorizationFactor(OptForSize);
01633 
01634     // Select the unroll factor.
01635     const unsigned UF =
01636         CM.selectUnrollFactor(OptForSize, VF.Width, VF.Cost);
01637 
01638     DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
01639                  << DebugLocStr << '\n');
01640     DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');
01641 
01642     if (VF.Width == 1) {
01643       DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n");
01644 
01645       if (UF == 1) {
01646         emitOptimizationRemarkAnalysis(
01647             F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
01648             "not beneficial to vectorize and user disabled interleaving");
01649         return false;
01650       }
01651       DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n");
01652 
01653       // Report the unrolling decision.
01654       emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
01655                              Twine("unrolled with interleaving factor " +
01656                                    Twine(UF) +
01657                                    " (vectorization not beneficial)"));
01658 
01659       // We decided not to vectorize, but we may want to unroll.
01660 
01661       InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, UF);
01662       Unroller.vectorize(&LVL);
01663     } else {
01664       // If we decided that it is *legal* to vectorize the loop then do it.
01665       InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, UF);
01666       LB.vectorize(&LVL);
01667       ++LoopsVectorized;
01668 
01669       // Add metadata to disable runtime unrolling of the scalar loop when
01670       // there is no runtime check for strides and memory. In that situation
01671       // the scalar loop is rarely executed and is not worth unrolling.
01672       if (!LB.IsSafetyChecksAdded())
01673         AddRuntimeUnrollDisableMetaData(L);
01674 
01675       // Report the vectorization decision.
01676       emitOptimizationRemark(
01677           F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
01678           Twine("vectorized loop (vectorization factor: ") + Twine(VF.Width) +
01679               ", unrolling interleave factor: " + Twine(UF) + ")");
01680     }
01681 
01682     // Mark the loop as already vectorized to avoid vectorizing again.
01683     Hints.setAlreadyVectorized();
01684 
01685     DEBUG(verifyFunction(*L->getHeader()->getParent()));
01686     return true;
01687   }
01688 
01689   void getAnalysisUsage(AnalysisUsage &AU) const override {
01690     AU.addRequired<AssumptionCacheTracker>();
01691     AU.addRequiredID(LoopSimplifyID);
01692     AU.addRequiredID(LCSSAID);
01693     AU.addRequired<BlockFrequencyInfo>();
01694     AU.addRequired<DominatorTreeWrapperPass>();
01695     AU.addRequired<LoopInfoWrapperPass>();
01696     AU.addRequired<ScalarEvolution>();
01697     AU.addRequired<TargetTransformInfoWrapperPass>();
01698     AU.addRequired<AliasAnalysis>();
01699     AU.addRequired<LoopAccessAnalysis>();
01700     AU.addPreserved<LoopInfoWrapperPass>();
01701     AU.addPreserved<DominatorTreeWrapperPass>();
01702     AU.addPreserved<AliasAnalysis>();
01703   }
01704 
01705 };
01706 
01707 } // end anonymous namespace
01708 
01709 //===----------------------------------------------------------------------===//
01710 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
01711 // LoopVectorizationCostModel.
01712 //===----------------------------------------------------------------------===//
01713 
01714 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
01715   // We need to place the broadcast of invariant variables outside the loop.
01716   Instruction *Instr = dyn_cast<Instruction>(V);
01717   bool NewInstr =
01718       (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(),
01719                           Instr->getParent()) != LoopVectorBody.end());
01720   bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
01721 
01722   // Place the code for broadcasting invariant variables in the new preheader.
01723   IRBuilder<>::InsertPointGuard Guard(Builder);
01724   if (Invariant)
01725     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
01726 
01727   // Broadcast the scalar into all locations in the vector.
01728   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
01729 
01730   return Shuf;
01731 }
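// Illustrative IR (assuming VF = 4 and an i32 scalar %x): the CreateVectorSplat
// call above expands to an insertelement/shufflevector pair of the form
//   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> undef, <4 x i32> zeroinitializer
// emitted in the vector preheader when %x is loop invariant.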
01732 
01733 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
01734                                           Value *Step) {
01735   assert(Val->getType()->isVectorTy() && "Must be a vector");
01736   assert(Val->getType()->getScalarType()->isIntegerTy() &&
01737          "Elem must be an integer");
01738   assert(Step->getType() == Val->getType()->getScalarType() &&
01739          "Step has wrong type");
01740   // Create the types.
01741   Type *ITy = Val->getType()->getScalarType();
01742   VectorType *Ty = cast<VectorType>(Val->getType());
01743   int VLen = Ty->getNumElements();
01744   SmallVector<Constant*, 8> Indices;
01745 
01746   // Create a vector of consecutive numbers from zero to VF.
01747   for (int i = 0; i < VLen; ++i)
01748     Indices.push_back(ConstantInt::get(ITy, StartIdx + i));
01749 
01750   // Add the consecutive indices to the vector value.
01751   Constant *Cv = ConstantVector::get(Indices);
01752   assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
01753   Step = Builder.CreateVectorSplat(VLen, Step);
01754   assert(Step->getType() == Val->getType() && "Invalid step vec");
01755   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
01756   // which can be found from the original scalar operations.
01757   Step = Builder.CreateMul(Cv, Step);
01758   return Builder.CreateAdd(Val, Step, "induction");
01759 }
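// Worked example (VF = 4, StartIdx = 0, Step = 1): Cv is <0, 1, 2, 3>, so for a
// splatted input <v, v, v, v> the function returns the per-lane induction
// values <v, v+1, v+2, v+3>.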
01760 
01761 /// \brief Find the operand of the GEP that should be checked for consecutive
01762 /// stores. This ignores trailing indices that have no effect on the final
01763 /// pointer.
01764 static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) {
01765   const DataLayout &DL = Gep->getModule()->getDataLayout();
01766   unsigned LastOperand = Gep->getNumOperands() - 1;
01767   unsigned GEPAllocSize = DL.getTypeAllocSize(
01768       cast<PointerType>(Gep->getType()->getScalarType())->getElementType());
01769 
01770   // Walk backwards and try to peel off zeros.
01771   while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
01772     // Find the type we're currently indexing into.
01773     gep_type_iterator GEPTI = gep_type_begin(Gep);
01774     std::advance(GEPTI, LastOperand - 1);
01775 
01776     // If it's a type with the same allocation size as the result of the GEP we
01777     // can peel off the zero index.
01778     if (DL.getTypeAllocSize(*GEPTI) != GEPAllocSize)
01779       break;
01780     --LastOperand;
01781   }
01782 
01783   return LastOperand;
01784 }
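// Illustrative example (hypothetical IR): in
//   %gep = getelementptr inbounds [1 x i32], [1 x i32]* %A, i64 %i, i64 0
// the trailing zero indexes a type whose allocation size matches that of the
// GEP result element, so it is peeled off and operand 1 (%i) is returned as
// the induction operand.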
01785 
01786 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
01787   assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
01788   // Make sure that the pointer does not point to structs.
01789   if (Ptr->getType()->getPointerElementType()->isAggregateType())
01790     return 0;
01791 
01792   // If this value is a pointer induction variable we know it is consecutive.
01793   PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
01794   if (Phi && Inductions.count(Phi)) {
01795     InductionInfo II = Inductions[Phi];
01796     return II.getConsecutiveDirection();
01797   }
01798 
01799   GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
01800   if (!Gep)
01801     return 0;
01802 
01803   unsigned NumOperands = Gep->getNumOperands();
01804   Value *GpPtr = Gep->getPointerOperand();
01805   // If this GEP value is a consecutive pointer induction variable and all of
01806   // the indices are loop invariant, then we know it is consecutive.
01807   Phi = dyn_cast<PHINode>(GpPtr);
01808   if (Phi && Inductions.count(Phi)) {
01809 
01810     // Make sure that the pointer does not point to structs.
01811     PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
01812     if (GepPtrType->getElementType()->isAggregateType())
01813       return 0;
01814 
01815     // Make sure that all of the index operands are loop invariant.
01816     for (unsigned i = 1; i < NumOperands; ++i)
01817       if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
01818         return 0;
01819 
01820     InductionInfo II = Inductions[Phi];
01821     return II.getConsecutiveDirection();
01822   }
01823 
01824   unsigned InductionOperand = getGEPInductionOperand(Gep);
01825 
01826   // Check that all of the gep indices are uniform except for our induction
01827   // operand.
01828   for (unsigned i = 0; i != NumOperands; ++i)
01829     if (i != InductionOperand &&
01830         !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
01831       return 0;
01832 
01833   // We can emit wide load/stores only if the last non-zero index is the
01834   // induction variable.
01835   const SCEV *Last = nullptr;
01836   if (!Strides.count(Gep))
01837     Last = SE->getSCEV(Gep->getOperand(InductionOperand));
01838   else {
01839     // Because of the multiplication by a stride we can have a s/zext cast.
01840     // We are going to replace this stride by 1 so the cast is safe to ignore.
01841     //
01842     //  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
01843     //  %0 = trunc i64 %indvars.iv to i32
01844     //  %mul = mul i32 %0, %Stride1
01845     //  %idxprom = zext i32 %mul to i64  << Safe cast.
01846     //  %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom
01847     //
01848     Last = replaceSymbolicStrideSCEV(SE, Strides,
01849                                      Gep->getOperand(InductionOperand), Gep);
01850     if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last))
01851       Last =
01852           (C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend)
01853               ? C->getOperand()
01854               : Last;
01855   }
01856   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
01857     const SCEV *Step = AR->getStepRecurrence(*SE);
01858 
01859     // The memory is consecutive because the last index is consecutive
01860     // and all other indices are loop invariant.
01861     if (Step->isOne())
01862       return 1;
01863     if (Step->isAllOnesValue())
01864       return -1;
01865   }
01866 
01867   return 0;
01868 }
01869 
01870 bool LoopVectorizationLegality::isUniform(Value *V) {
01871   return LAI->isUniform(V);
01872 }
01873 
01874 InnerLoopVectorizer::VectorParts&
01875 InnerLoopVectorizer::getVectorValue(Value *V) {
01876   assert(V != Induction && "The new induction variable should not be used.");
01877   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
01878 
01879   // If we have a stride that is replaced by one, do it here.
01880   if (Legal->hasStride(V))
01881     V = ConstantInt::get(V->getType(), 1);
01882 
01883   // If we have this scalar in the map, return it.
01884   if (WidenMap.has(V))
01885     return WidenMap.get(V);
01886 
01887   // If this scalar is unknown, assume that it is a constant or that it is
01888   // loop invariant. Broadcast V and save the value for future uses.
01889   Value *B = getBroadcastInstrs(V);
01890   return WidenMap.splat(V, B);
01891 }
01892 
01893 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
01894   assert(Vec->getType()->isVectorTy() && "Invalid type");
01895   SmallVector<Constant*, 8> ShuffleMask;
01896   for (unsigned i = 0; i < VF; ++i)
01897     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
01898 
01899   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
01900                                      ConstantVector::get(ShuffleMask),
01901                                      "reverse");
01902 }
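// For example, with VF = 4 the shuffle mask built above is <3, 2, 1, 0>, so the
// returned vector holds the input lanes in reverse order.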
01903 
01904 // Get a mask to interleave \p NumVec vectors into a wide vector.
01905 // I.e.  <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...>
01906 // E.g. For 2 interleaved vectors, if VF is 4, the mask is:
01907 //      <0, 4, 1, 5, 2, 6, 3, 7>
01908 static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF,
01909                                     unsigned NumVec) {
01910   SmallVector<Constant *, 16> Mask;
01911   for (unsigned i = 0; i < VF; i++)
01912     for (unsigned j = 0; j < NumVec; j++)
01913       Mask.push_back(Builder.getInt32(j * VF + i));
01914 
01915   return ConstantVector::get(Mask);
01916 }
01917 
01918 // Get the strided mask starting from index \p Start.
01919 // I.e.  <Start, Start + Stride, ..., Start + Stride*(VF-1)>
01920 static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start,
01921                                 unsigned Stride, unsigned VF) {
01922   SmallVector<Constant *, 16> Mask;
01923   for (unsigned i = 0; i < VF; i++)
01924     Mask.push_back(Builder.getInt32(Start + i * Stride));
01925 
01926   return ConstantVector::get(Mask);
01927 }
01928 
01929 // Get a mask of two parts: the first part consists of sequential integers
01930 // starting from 0; the second part consists of UNDEFs.
01931 // I.e. <0, 1, 2, ..., NumInt - 1, undef, ..., undef>
01932 static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt,
01933                                    unsigned NumUndef) {
01934   SmallVector<Constant *, 16> Mask;
01935   for (unsigned i = 0; i < NumInt; i++)
01936     Mask.push_back(Builder.getInt32(i));
01937 
01938   Constant *Undef = UndefValue::get(Builder.getInt32Ty());
01939   for (unsigned i = 0; i < NumUndef; i++)
01940     Mask.push_back(Undef);
01941 
01942   return ConstantVector::get(Mask);
01943 }
01944 
01945 // Concatenate two vectors with the same element type. The second vector must
01946 // not have more elements than the first vector. If the second vector has
01947 // fewer elements, it is extended with UNDEFs.
01948 static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1,
01949                                     Value *V2) {
01950   VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType());
01951   VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType());
01952   assert(VecTy1 && VecTy2 &&
01953          VecTy1->getScalarType() == VecTy2->getScalarType() &&
01954          "Expect two vectors with the same element type");
01955 
01956   unsigned NumElts1 = VecTy1->getNumElements();
01957   unsigned NumElts2 = VecTy2->getNumElements();
01958   assert(NumElts1 >= NumElts2 && "First vector must not have fewer elements");
01959 
01960   if (NumElts1 > NumElts2) {
01961     // Extend with UNDEFs.
01962     Constant *ExtMask =
01963         getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2);
01964     V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask);
01965   }
01966 
01967   Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0);
01968   return Builder.CreateShuffleVector(V1, V2, Mask);
01969 }
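// Illustrative example: concatenating V1 = <4 x i32> and V2 = <2 x i32> first
// widens V2 with the mask <0, 1, undef, undef>, then shuffles V1 and the
// widened V2 with the mask <0, 1, 2, 3, 4, 5>, yielding a <6 x i32> result.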
01970 
01971 // Concatenate vectors in the given list. All vectors have the same type.
01972 static Value *ConcatenateVectors(IRBuilder<> &Builder,
01973                                  ArrayRef<Value *> InputList) {
01974   unsigned NumVec = InputList.size();
01975   assert(NumVec > 1 && "Should be at least two vectors");
01976 
01977   SmallVector<Value *, 8> ResList;
01978   ResList.append(InputList.begin(), InputList.end());
01979   do {
01980     SmallVector<Value *, 8> TmpList;
01981     for (unsigned i = 0; i < NumVec - 1; i += 2) {
01982       Value *V0 = ResList[i], *V1 = ResList[i + 1];
01983       assert((V0->getType() == V1->getType() || i == NumVec - 2) &&
01984              "Only the last vector may have a different type");
01985 
01986       TmpList.push_back(ConcatenateTwoVectors(Builder, V0, V1));
01987     }
01988 
01989     // Push the last vector if the total number of vectors is odd.
01990     if (NumVec % 2 != 0)
01991       TmpList.push_back(ResList[NumVec - 1]);
01992 
01993     ResList = TmpList;
01994     NumVec = ResList.size();
01995   } while (NumVec > 1);
01996 
01997   return ResList[0];
01998 }
01999 
02000 // Try to vectorize the interleave group that \p Instr belongs to.
02001 //
02002 // E.g. Translate following interleaved load group (factor = 3):
02003 //   for (i = 0; i < N; i+=3) {
02004 //     R = Pic[i];             // Member of index 0
02005 //     G = Pic[i+1];           // Member of index 1
02006 //     B = Pic[i+2];           // Member of index 2
02007 //     ... // do something to R, G, B
02008 //   }
02009 // To:
02010 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
02011 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
02012 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
02013 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
02014 //
02015 // Or translate following interleaved store group (factor = 3):
02016 //   for (i = 0; i < N; i+=3) {
02017 //     ... do something to R, G, B
02018 //     Pic[i]   = R;           // Member of index 0
02019 //     Pic[i+1] = G;           // Member of index 1
02020 //     Pic[i+2] = B;           // Member of index 2
02021 //   }
02022 // To:
02023 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
02024 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
02025 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
02026 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
02027 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
02028 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
02029   const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
02030   assert(Group && "Fail to get an interleaved access group.");
02031 
02032   // Skip if current instruction is not the insert position.
02033   if (Instr != Group->getInsertPos())
02034     return;
02035 
02036   LoadInst *LI = dyn_cast<LoadInst>(Instr);
02037   StoreInst *SI = dyn_cast<StoreInst>(Instr);
02038   Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
02039 
02040   // Prepare for the vector type of the interleaved load/store.
02041   Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
02042   unsigned InterleaveFactor = Group->getFactor();
02043   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
02044   Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());
02045 
02046   // Prepare for the new pointers.
02047   setDebugLocFromInst(Builder, Ptr);
02048   VectorParts &PtrParts = getVectorValue(Ptr);
02049   SmallVector<Value *, 2> NewPtrs;
02050   unsigned Index = Group->getIndex(Instr);
02051   for (unsigned Part = 0; Part < UF; Part++) {
02052     // Extract the pointer for current instruction from the pointer vector. A
02053     // reverse access uses the pointer in the last lane.
02054     Value *NewPtr = Builder.CreateExtractElement(
02055         PtrParts[Part],
02056         Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0));
02057 
02058     // Note that the current instruction could be a member of any index; the
02059     // address must be adjusted to point at the member of index 0.
02060     //
02061     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
02062     //       b = A[i];       // Member of index 0
02063     // Current pointer is pointed to A[i+1], adjust it to A[i].
02064     //
02065     // E.g.  A[i+1] = a;     // Member of index 1
02066     //       A[i]   = b;     // Member of index 0
02067     //       A[i+2] = c;     // Member of index 2 (Current instruction)
02068     // Current pointer is pointed to A[i+2], adjust it to A[i].
02069     NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
02070 
02071     // Cast to the vector pointer type.
02072     NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
02073   }
02074 
02075   setDebugLocFromInst(Builder, Instr);
02076   Value *UndefVec = UndefValue::get(VecTy);
02077 
02078   // Vectorize the interleaved load group.
02079   if (LI) {
02080     for (unsigned Part = 0; Part < UF; Part++) {
02081       Instruction *NewLoadInstr = Builder.CreateAlignedLoad(
02082           NewPtrs[Part], Group->getAlignment(), "wide.vec");
02083 
02084       for (unsigned i = 0; i < InterleaveFactor; i++) {
02085         Instruction *Member = Group->getMember(i);
02086 
02087         // Skip the gaps in the group.
02088         if (!Member)
02089           continue;
02090 
02091         Constant *StrideMask = getStridedMask(Builder, i, InterleaveFactor, VF);
02092         Value *StridedVec = Builder.CreateShuffleVector(
02093             NewLoadInstr, UndefVec, StrideMask, "strided.vec");
02094 
02095         // If this member has a different type, cast the result to that type.
02096         if (Member->getType() != ScalarTy) {
02097           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
02098           StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);
02099         }
02100 
02101         VectorParts &Entry = WidenMap.get(Member);
02102         Entry[Part] =
02103             Group->isReverse() ? reverseVector(StridedVec) : StridedVec;
02104       }
02105 
02106       propagateMetadata(NewLoadInstr, Instr);
02107     }
02108     return;
02109   }
02110 
02111   // The sub-vector type for the current instruction.
02112   VectorType *SubVT = VectorType::get(ScalarTy, VF);
02113 
02114   // Vectorize the interleaved store group.
02115   for (unsigned Part = 0; Part < UF; Part++) {
02116     // Collect the stored vector from each member.
02117     SmallVector<Value *, 4> StoredVecs;
02118     for (unsigned i = 0; i < InterleaveFactor; i++) {
02119       // An interleaved store group doesn't allow gaps, so each index has a member.
02120       Instruction *Member = Group->getMember(i);
02121       assert(Member && "Fail to get a member from an interleaved store group");
02122 
02123       Value *StoredVec =
02124           getVectorValue(dyn_cast<StoreInst>(Member)->getValueOperand())[Part];
02125       if (Group->isReverse())
02126         StoredVec = reverseVector(StoredVec);
02127 
02128       // If this member has a different type, cast it to a unified type.
02129       if (StoredVec->getType() != SubVT)
02130         StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT);
02131 
02132       StoredVecs.push_back(StoredVec);
02133     }
02134 
02135     // Concatenate all vectors into a wide vector.
02136     Value *WideVec = ConcatenateVectors(Builder, StoredVecs);
02137 
02138     // Interleave the elements in the wide vector.
02139     Constant *IMask = getInterleavedMask(Builder, VF, InterleaveFactor);
02140     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
02141                                               "interleaved.vec");
02142 
02143     Instruction *NewStoreInstr =
02144         Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
02145     propagateMetadata(NewStoreInstr, Instr);
02146   }
02147 }
02148 
02149 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
02150   // Attempt to issue a wide load.
02151   LoadInst *LI = dyn_cast<LoadInst>(Instr);
02152   StoreInst *SI = dyn_cast<StoreInst>(Instr);
02153 
02154   assert((LI || SI) && "Invalid Load/Store instruction");
02155 
02156   // Try to vectorize the interleave group if this access is interleaved.
02157   if (Legal->isAccessInterleaved(Instr))
02158     return vectorizeInterleaveGroup(Instr);
02159 
02160   Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
02161   Type *DataTy = VectorType::get(ScalarDataTy, VF);
02162   Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
02163   unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
02164   // An alignment of 0 means target ABI alignment. We need to use the scalar's
02165   // target ABI alignment in such a case.
02166   const DataLayout &DL = Instr->getModule()->getDataLayout();
02167   if (!Alignment)
02168     Alignment = DL.getABITypeAlignment(ScalarDataTy);
02169   unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
02170   unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy);
02171   unsigned VectorElementSize = DL.getTypeStoreSize(DataTy) / VF;
02172 
02173   if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
02174       !Legal->isMaskRequired(SI))
02175     return scalarizeInstruction(Instr, true);
02176 
02177   if (ScalarAllocatedSize != VectorElementSize)
02178     return scalarizeInstruction(Instr);
02179 
02180   // If the pointer is loop invariant or if it is non-consecutive,
02181   // scalarize the load.
02182   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
02183   bool Reverse = ConsecutiveStride < 0;
02184   bool UniformLoad = LI && Legal->isUniform(Ptr);
02185   if (!ConsecutiveStride || UniformLoad)
02186     return scalarizeInstruction(Instr);
02187 
02188   Constant *Zero = Builder.getInt32(0);
02189   VectorParts &Entry = WidenMap.get(Instr);
02190 
02191   // Handle consecutive loads/stores.
02192   GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
02193   if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
02194     setDebugLocFromInst(Builder, Gep);
02195     Value *PtrOperand = Gep->getPointerOperand();
02196     Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
02197     FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
02198 
02199     // Create the new GEP with the new induction variable.
02200     GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
02201     Gep2->setOperand(0, FirstBasePtr);
02202     Gep2->setName("gep.indvar.base");
02203     Ptr = Builder.Insert(Gep2);
02204   } else if (Gep) {
02205     setDebugLocFromInst(Builder, Gep);
02206     assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
02207                                OrigLoop) && "Base ptr must be invariant");
02208 
02209     // The last index does not have to be the induction. It can be
02210     // consecutive and be a function of the index. For example A[I+1];
02211     unsigned NumOperands = Gep->getNumOperands();
02212     unsigned InductionOperand = getGEPInductionOperand(Gep);
02213     // Create the new GEP with the new induction variable.
02214     GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
02215 
02216     for (unsigned i = 0; i < NumOperands; ++i) {
02217       Value *GepOperand = Gep->getOperand(i);
02218       Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);
02219 
02220       // Update last index or loop invariant instruction anchored in loop.
02221       if (i == InductionOperand ||
02222           (GepOperandInst && OrigLoop->contains(GepOperandInst))) {
02223         assert((i == InductionOperand ||
02224                SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&
02225                "Must be last index or loop invariant");
02226 
02227         VectorParts &GEPParts = getVectorValue(GepOperand);
02228         Value *Index = GEPParts[0];
02229         Index = Builder.CreateExtractElement(Index, Zero);
02230         Gep2->setOperand(i, Index);
02231         Gep2->setName("gep.indvar.idx");
02232       }
02233     }
02234     Ptr = Builder.Insert(Gep2);
02235   } else {
02236     // Use the induction element ptr.
02237     assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
02238     setDebugLocFromInst(Builder, Ptr);
02239     VectorParts &PtrVal = getVectorValue(Ptr);
02240     Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
02241   }
02242 
02243   VectorParts Mask = createBlockInMask(Instr->getParent());
02244   // Handle Stores:
02245   if (SI) {
02246     assert(!Legal->isUniform(SI->getPointerOperand()) &&
02247            "We do not allow storing to uniform addresses");
02248     setDebugLocFromInst(Builder, SI);
02249     // We don't want to update the value in the map as it might be used in
02250     // another expression. So don't use a reference type for "StoredVal".
02251     VectorParts StoredVal = getVectorValue(SI->getValueOperand());
02252     
02253     for (unsigned Part = 0; Part < UF; ++Part) {
02254       // Calculate the pointer for the specific unroll-part.
02255       Value *PartPtr =
02256           Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
02257 
02258       if (Reverse) {
02259         // If we store to reverse consecutive memory locations then we need
02260         // to reverse the order of elements in the stored value.
02261         StoredVal[Part] = reverseVector(StoredVal[Part]);
02262         // If the address is consecutive but reversed, then the
02263         // wide store needs to start at the last vector element.
02264         PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
02265         PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
02266         Mask[Part] = reverseVector(Mask[Part]);
02267       }
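      // Illustrative arithmetic (assuming VF = 4 and Part = 1): the two GEPs
      // above give PartPtr = Ptr + (-1 * 4) + (1 - 4) = Ptr - 7, so the wide
      // store covers the reversed elements Ptr[-7] .. Ptr[-4].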
02268 
02269       Value *VecPtr = Builder.CreateBitCast(PartPtr,
02270                                             DataTy->getPointerTo(AddressSpace));
02271 
02272       Instruction *NewSI;
02273       if (Legal->isMaskRequired(SI))
02274         NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
02275                                           Mask[Part]);
02276       else 
02277         NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
02278       propagateMetadata(NewSI, SI);
02279     }
02280     return;
02281   }
02282 
02283   // Handle loads.
02284   assert(LI && "Must have a load instruction");
02285   setDebugLocFromInst(Builder, LI);
02286   for (unsigned Part = 0; Part < UF; ++Part) {
02287     // Calculate the pointer for the specific unroll-part.
02288     Value *PartPtr =
02289         Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
02290 
02291     if (Reverse) {
02292       // If the address is consecutive but reversed, then the
02293       // wide load needs to start at the last vector element.
02294       PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
02295       PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
02296       Mask[Part] = reverseVector(Mask[Part]);
02297     }
02298 
02299     Instruction* NewLI;
02300     Value *VecPtr = Builder.CreateBitCast(PartPtr,
02301                                           DataTy->getPointerTo(AddressSpace));
02302     if (Legal->isMaskRequired(LI))
02303       NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
02304                                        UndefValue::get(DataTy),
02305                                        "wide.masked.load");
02306     else
02307       NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
02308     propagateMetadata(NewLI, LI);
02309     Entry[Part] = Reverse ? reverseVector(NewLI) :  NewLI;
02310   }
02311 }
02312 
02313 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) {
02314   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
02315   // Holds vector parameters or scalars, in case of uniform vals.
02316   SmallVector<VectorParts, 4> Params;
02317 
02318   setDebugLocFromInst(Builder, Instr);
02319 
02320   // Find all of the vectorized parameters.
02321   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
02322     Value *SrcOp = Instr->getOperand(op);
02323 
02324     // If we are accessing the old induction variable, use the new one.
02325     if (SrcOp == OldInduction) {
02326       Params.push_back(getVectorValue(SrcOp));
02327       continue;
02328     }
02329 
02330     // Try using previously calculated values.
02331     Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
02332 
02333     // If the src is an instruction that appeared earlier in the basic block
02334     // then it should already be vectorized.
02335     if (SrcInst && OrigLoop->contains(SrcInst)) {
02336       assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
02337       // The parameter is a vector value from earlier.
02338       Params.push_back(WidenMap.get(SrcInst));
02339     } else {
02340       // The parameter is a scalar from outside the loop. Maybe even a constant.
02341       VectorParts Scalars;
02342       Scalars.append(UF, SrcOp);
02343       Params.push_back(Scalars);
02344     }
02345   }
02346 
02347   assert(Params.size() == Instr->getNumOperands() &&
02348          "Invalid number of operands");
02349 
02350   // Does this instruction return a value ?
02351   bool IsVoidRetTy = Instr->getType()->isVoidTy();
02352 
02353   Value *UndefVec = IsVoidRetTy ? nullptr :
02354     UndefValue::get(VectorType::get(Instr->getType(), VF));
02355   // Create a new entry in the WidenMap and initialize it to Undef or Null.
02356   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
02357 
02358   Instruction *InsertPt = Builder.GetInsertPoint();
02359   BasicBlock *IfBlock = Builder.GetInsertBlock();
02360   BasicBlock *CondBlock = nullptr;
02361 
02362   VectorParts Cond;
02363   Loop *VectorLp = nullptr;
02364   if (IfPredicateStore) {
02365     assert(Instr->getParent()->getSinglePredecessor() &&
02366            "Only support single predecessor blocks");
02367     Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
02368                           Instr->getParent());
02369     VectorLp = LI->getLoopFor(IfBlock);
02370     assert(VectorLp && "Must have a loop for this block");
02371   }
02372 
02373   // For each vector unroll 'part':
02374   for (unsigned Part = 0; Part < UF; ++Part) {
02375     // For each scalar that we create:
02376     for (unsigned Width = 0; Width < VF; ++Width) {
02377 
02378       // Start if-block.
02379       Value *Cmp = nullptr;
02380       if (IfPredicateStore) {
02381         Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
02382         Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1));
02383         CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
02384         LoopVectorBody.push_back(CondBlock);
02385         VectorLp->addBasicBlockToLoop(CondBlock, *LI);
02386         // Update Builder with newly created basic block.
02387         Builder.SetInsertPoint(InsertPt);
02388       }
02389 
02390       Instruction *Cloned = Instr->clone();
02391       if (!IsVoidRetTy)
02392         Cloned->setName(Instr->getName() + ".cloned");
02393       // Replace the operands of the cloned instructions with extracted scalars.
02394       for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
02395         Value *Op = Params[op][Part];
02396         // Param is a vector. Need to extract the right lane.
02397         if (Op->getType()->isVectorTy())
02398           Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
02399         Cloned->setOperand(op, Op);
02400       }
02401 
02402       // Place the cloned scalar in the new loop.
02403       Builder.Insert(Cloned);
02404 
02405       // If the original scalar returns a value we need to place it in a vector
02406       // so that future users will be able to use it.
02407       if (!IsVoidRetTy)
02408         VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
02409                                                        Builder.getInt32(Width));
02410       // End if-block.
02411       if (IfPredicateStore) {
02412          BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
02413          LoopVectorBody.push_back(NewIfBlock);
02414          VectorLp->addBasicBlockToLoop(NewIfBlock, *LI);
02415          Builder.SetInsertPoint(InsertPt);
02416          Instruction *OldBr = IfBlock->getTerminator();
02417          BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
02418          OldBr->eraseFromParent();
02419          IfBlock = NewIfBlock;
02420       }
02421     }
02422   }
02423 }
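// Illustrative shape of the code emitted for one predicated lane of a store
// (block names come from the splitBasicBlock() calls above; %mask and Lane are
// placeholders):
//   %c = extractelement <VF x i1> %mask, i32 Lane
//   %p = icmp eq i1 %c, true
//   br i1 %p, label %cond.store, label %else
// cond.store:                         ; the cloned scalar store lands here
//   ...
//   br label %else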
02424 
02425 static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
02426                                  Instruction *Loc) {
02427   if (FirstInst)
02428     return FirstInst;
02429   if (Instruction *I = dyn_cast<Instruction>(V))
02430     return I->getParent() == Loc->getParent() ? I : nullptr;
02431   return nullptr;
02432 }
02433 
02434 std::pair<Instruction *, Instruction *>
02435 InnerLoopVectorizer::addStrideCheck(Instruction *Loc) {
02436   Instruction *tnullptr = nullptr;
02437   if (!Legal->mustCheckStrides())
02438     return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);
02439 
02440   IRBuilder<> ChkBuilder(Loc);
02441 
02442   // Emit checks.
02443   Value *Check = nullptr;
02444   Instruction *FirstInst = nullptr;
02445   for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(),
02446                                          SE = Legal->strides_end();
02447        SI != SE; ++SI) {
02448     Value *Ptr = stripIntegerCast(*SI);
02449     Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1),
02450                                        "stride.chk");
02451     // Store the first instruction we create.
02452     FirstInst = getFirstInst(FirstInst, C, Loc);
02453     if (Check)
02454       Check = ChkBuilder.CreateOr(Check, C);
02455     else
02456       Check = C;
02457   }
02458 
02459   // We have to do this trickery because the IRBuilder might fold the check to
02460   // a constant expression, in which case there is no Instruction anchored in
02461   // the block.
02462   LLVMContext &Ctx = Loc->getContext();
02463   Instruction *TheCheck =
02464       BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx));
02465   ChkBuilder.Insert(TheCheck, "stride.not.one");
02466   FirstInst = getFirstInst(FirstInst, TheCheck, Loc);
02467 
02468   return std::make_pair(FirstInst, TheCheck);
02469 }
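// Illustrative IR for a single symbolic stride %Stride1 (the value names come
// from the builder calls above):
//   %stride.chk = icmp ne i64 %Stride1, 1
//   %stride.not.one = and i1 %stride.chk, true
// Checks for multiple strides are OR'ed together before the final 'and true',
// which exists only to guarantee that an Instruction is anchored in the block.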
02470 
02471 void InnerLoopVectorizer::createEmptyLoop() {
02472   /*
02473    In this function we generate a new loop. The new loop will contain
02474    the vectorized instructions while the old loop will continue to run the
02475    scalar remainder.
02476 
02477        [ ] <-- Back-edge taken count overflow check.
02478     /   |
02479    /    v
02480   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
02481   |  /  |
02482   | /   v
02483   ||   [ ]     <-- vector pre header.
02484   ||    |
02485   ||    v
02486   ||   [  ] \
02487   ||   [  ]_|   <-- vector loop.
02488   ||    |
02489   | \   v
02490   |   >[ ]   <--- middle-block.
02491   |  /  |
02492   | /   v
02493   -|- >[ ]     <--- new preheader.
02494    |    |
02495    |    v
02496    |   [ ] \
02497    |   [ ]_|   <-- old scalar loop to handle remainder.
02498     \   |
02499      \  v
02500       >[ ]     <-- exit block.
02501    ...
02502    */
02503 
02504   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
02505   BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
02506   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
02507   assert(BypassBlock && "Invalid loop structure");
02508   assert(ExitBlock && "Must have an exit block");
02509 
02510   // Some loops have a single integer induction variable, while other loops
02511   // don't. One example is c++ iterators that often have multiple pointer
02512   // induction variables. In the code below we also support a case where we
02513   // don't have a single induction variable.
02514   OldInduction = Legal->getInduction();
02515   Type *IdxTy = Legal->getWidestInductionType();
02516 
02517   // Find the loop boundaries.
02518   const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);
02519   assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
02520 
02521   // The exit count might have the type of i64 while the phi is i32. This can
02522   // happen if we have an induction variable that is sign extended before the
02523   // compare. The only way that we get a backedge taken count is that the
02524   // induction variable was signed and as such will not overflow. In such a case
02525   // truncation is legal.
02526   if (ExitCount->getType()->getPrimitiveSizeInBits() >
02527       IdxTy->getPrimitiveSizeInBits())
02528     ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);
02529 
02530   const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
02531   // Get the total trip count from the count by adding 1.
02532   ExitCount = SE->getAddExpr(BackedgeTakeCount,
02533                              SE->getConstant(BackedgeTakeCount->getType(), 1));
02534 
02535   const DataLayout &DL = OldBasicBlock->getModule()->getDataLayout();
02536 
02537   // Expand the trip count and place the new instructions in the preheader.
02538   // Notice that the pre-header does not change, only the loop body.
02539   SCEVExpander Exp(*SE, DL, "induction");
02540 
02541   // We need to test whether the backedge-taken count is the maximum unsigned
02542   // value of its type (uint##_max): adding one to it would overflow and yield
02543   // an incorrect trip count for the vector body. In case of overflow we want
02544   // to jump directly to the scalar remainder loop.
02545   Value *BackedgeCount =
02546       Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(),
02547                         BypassBlock->getTerminator());
02548   if (BackedgeCount->getType()->isPointerTy())
02549     BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy,
02550                                                 "backedge.ptrcnt.to.int",
02551                                                 BypassBlock->getTerminator());
02552   Instruction *CheckBCOverflow =
02553       CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount,
02554                       Constant::getAllOnesValue(BackedgeCount->getType()),
02555                       "backedge.overflow", BypassBlock->getTerminator());
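  // For illustration only (assuming an i64 backedge-taken count), the check
  // built above is roughly:
  //   %backedge.overflow = icmp eq i64 %backedge.count, -1
  // where -1 is the all-ones value, i.e. the count at which adding one would
  // wrap to zero.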
02556 
02557   // The loop index does not have to start at zero. Find the original start
02558   // value from the induction PHI node. If we don't have an induction variable
02559   // then we know that it starts at zero.
02560   Builder.SetInsertPoint(BypassBlock->getTerminator());
02561   Value *StartIdx = ExtendedIdx = OldInduction ?
02562     Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock),
02563                        IdxTy):
02564     ConstantInt::get(IdxTy, 0);
02565 
02566   // We need an instruction to anchor the overflow check on. StartIdx needs to
02567   // be defined before the overflow check branch because the scalar preheader
02568   // is going to merge the start index, so the block containing the overflow
02569   // branch needs to contain a definition of the start index.
02570   Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd(
02571       StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor",
02572       BypassBlock->getTerminator());
02573 
02574   // Count holds the overall loop count (N).
02575   Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
02576                                    BypassBlock->getTerminator());
02577 
02578   LoopBypassBlocks.push_back(BypassBlock);
02579 
02580   // Split the single-block loop into the two-loop structure described above.
02581   BasicBlock *VectorPH =
02582   BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
02583   BasicBlock *VecBody =
02584   VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
02585   BasicBlock *MiddleBlock =
02586   VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
02587   BasicBlock *ScalarPH =
02588   MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
02589 
02590   // Create and register the new vector loop.
02591   Loop* Lp = new Loop();
02592   Loop *ParentLoop = OrigLoop->getParentLoop();
02593 
02594   // Insert the new loop into the loop nest and register the new basic blocks
02595   // before calling any utilities such as SCEV that require valid LoopInfo.
02596   if (ParentLoop) {
02597     ParentLoop->addChildLoop(Lp);
02598     ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
02599     ParentLoop->addBasicBlockToLoop(VectorPH, *LI);
02600     ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
02601   } else {
02602     LI->addTopLevelLoop(Lp);
02603   }
02604   Lp->addBasicBlockToLoop(VecBody, *LI);
02605 
02606   // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
02607   // inside the loop.
02608   Builder.SetInsertPoint(VecBody->getFirstNonPHI());
02609 
02610   // Generate the induction variable.
02611   setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
02612   Induction = Builder.CreatePHI(IdxTy, 2, "index");
02613   // The loop step is equal to the vectorization factor (num of SIMD elements)
02614   // times the unroll factor (num of SIMD instructions).
02615   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
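  // For illustration only: with VF = 4 and UF = 2 the step is 8, i.e. every
  // iteration of the vector body covers eight iterations of the scalar loop.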
02616 
02617   // This is the IR builder that we use to add all of the logic for bypassing
02618   // the new vector loop.
02619   IRBuilder<> BypassBuilder(BypassBlock->getTerminator());
02620   setDebugLocFromInst(BypassBuilder,
02621                       getDebugLocFromInstOrOperands(OldInduction));
02622 
02623   // We may need to extend the index in case there is a type mismatch.
02624   // We know that the count starts at zero and does not overflow.
02625   if (Count->getType() != IdxTy) {
02626     // The exit count can be of pointer type. Convert it to the correct
02627     // integer type.
02628     if (ExitCount->getType()->isPointerTy())
02629       Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
02630     else
02631       Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
02632   }
02633 
02634   // Add the start index to the loop count to get the new end index.
02635   Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");
02636 
02637   // Now we need to generate the expression for N - (N % VF), which is
02638   // the part that the vectorized body will execute.
02639   Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
02640   Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
02641   Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
02642                                                      "end.idx.rnd.down");
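  // Worked example, for illustration only: with Count = 10 and Step = 4,
  // n.mod.vf = 2 and n.vec = 8, so the vector body executes 8 iterations and
  // the scalar remainder loop handles the last 2.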
02643 
02644   // Now, compare the new count to zero. If it is zero skip the vector loop and
02645   // jump to the scalar loop.
02646   Value *Cmp =
02647       BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero");
02648 
02649   BasicBlock *LastBypassBlock = BypassBlock;
02650 
02651   // Generate code to check that the loop's trip count, which we computed by
02652   // adding one to the backedge-taken count, does not overflow.
02653   {
02654     auto PastOverflowCheck =
02655         std::next(BasicBlock::iterator(OverflowCheckAnchor));
02656     BasicBlock *CheckBlock =
02657       LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked");
02658     if (ParentLoop)
02659       ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
02660     LoopBypassBlocks.push_back(CheckBlock);
02661     Instruction *OldTerm = LastBypassBlock->getTerminator();
02662     BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm);
02663     OldTerm->eraseFromParent();
02664     LastBypassBlock = CheckBlock;
02665   }
02666 
02667   // Generate the code to check that the strides we assumed to be one really
02668   // are one. We want the new basic block to start at the first instruction of
02669   // the sequence of instructions that forms the check.
02670   Instruction *StrideCheck;
02671   Instruction *FirstCheckInst;
02672   std::tie(FirstCheckInst, StrideCheck) =
02673       addStrideCheck(LastBypassBlock->getTerminator());
02674   if (StrideCheck) {
02675     AddedSafetyChecks = true;
02676     // Create a new block containing the stride check.
02677     BasicBlock *CheckBlock =
02678         LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
02679     if (ParentLoop)
02680       ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
02681     LoopBypassBlocks.push_back(CheckBlock);
02682 
02683     // Replace the branch into the stride check block with a conditional branch
02684     // for the "few elements" case.
02685     Instruction *OldTerm = LastBypassBlock->getTerminator();
02686     BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
02687     OldTerm->eraseFromParent();
02688 
02689     Cmp = StrideCheck;
02690     LastBypassBlock = CheckBlock;
02691   }
02692 
02693   // Generate the code that checks at runtime whether arrays overlap. We put
02694   // the checks into a separate block to make the more common case of few
02695   // elements faster.
02696   Instruction *MemRuntimeCheck;
02697   std::tie(FirstCheckInst, MemRuntimeCheck) =
02698     Legal->getLAI()->addRuntimeCheck(LastBypassBlock->getTerminator());
02699   if (MemRuntimeCheck) {
02700     AddedSafetyChecks = true;
02701     // Create a new block containing the memory check.
02702     BasicBlock *CheckBlock =
02703         LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.memcheck");
02704     if (ParentLoop)
02705       ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
02706     LoopBypassBlocks.push_back(CheckBlock);
02707 
02708     // Replace the branch into the memory check block with a conditional branch
02709     // for the "few elements" case.
02710     Instruction *OldTerm = LastBypassBlock->getTerminator();
02711     BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
02712     OldTerm->eraseFromParent();
02713 
02714     Cmp = MemRuntimeCheck;
02715     LastBypassBlock = CheckBlock;
02716   }
02717 
02718   LastBypassBlock->getTerminator()->eraseFromParent();
02719   BranchInst::Create(MiddleBlock, VectorPH, Cmp,
02720                      LastBypassBlock);
02721 
02722   // We are going to resume the execution of the scalar loop.
02723   // Go over all of the induction variables that we found and fix the
02724   // PHIs that are left in the scalar version of the loop.
02725   // The starting values of PHI nodes depend on the counter of the last
02726   // iteration in the vectorized loop.
02727   // If we come from a bypass edge then we need to start from the original
02728   // start value.
02729 
02730   // This variable saves the new starting index for the scalar loop.
02731   PHINode *ResumeIndex = nullptr;
02732   LoopVectorizationLegality::InductionList::iterator I, E;
02733   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
02734   // Set builder to point to last bypass block.
02735   BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
02736   for (I = List->begin(), E = List->end(); I != E; ++I) {
02737     PHINode *OrigPhi = I->first;
02738     LoopVectorizationLegality::InductionInfo II = I->second;
02739 
02740     Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType();
02741     PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val",
02742                                          MiddleBlock->getTerminator());
02743     // We might have extended the type of the induction variable but we need a
02744     // truncated version for the scalar loop.
02745     PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?
02746       PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
02747                       MiddleBlock->getTerminator()) : nullptr;
02748 
02749     // Create phi nodes to merge from the backedge-taken check block.
02750     PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val",
02751                                            ScalarPH->getTerminator());
02752     BCResumeVal->addIncoming(ResumeVal, MiddleBlock);
02753 
02754     PHINode *BCTruncResumeVal = nullptr;
02755     if (OrigPhi == OldInduction) {
02756       BCTruncResumeVal =
02757           PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val",
02758                           ScalarPH->getTerminator());
02759       BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock);
02760     }
02761 
02762     Value *EndValue = nullptr;
02763     switch (II.IK) {
02764     case LoopVectorizationLegality::IK_NoInduction:
02765       llvm_unreachable("Unknown induction");
02766     case LoopVectorizationLegality::IK_IntInduction: {
02767       // Handle the integer induction counter.
02768       assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
02769 
02770       // We have the canonical induction variable.
02771       if (OrigPhi == OldInduction) {
02772         // Create a truncated version of the resume value for the scalar loop;
02773         // we might have promoted the type to a larger width.
02774         EndValue =
02775           BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
02776         // The new PHI merges the original incoming value, in case of a bypass,
02777         // or the value at the end of the vectorized loop.
02778         for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
02779           TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
02780         TruncResumeVal->addIncoming(EndValue, VecBody);
02781 
02782         BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
02783 
02784         // We know what the end value is.
02785         EndValue = IdxEndRoundDown;
02786         // We also know which PHI node holds it.
02787         ResumeIndex = ResumeVal;
02788         break;
02789       }
02790 
02791       // Not the canonical induction variable - add the vector loop count to the
02792       // start value.
02793       Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
02794                                                    II.StartValue->getType(),
02795                                                    "cast.crd");
02796       EndValue = II.transform(BypassBuilder, CRD);
02797       EndValue->setName("ind.end");
02798       break;
02799     }
02800     case LoopVectorizationLegality::IK_PtrInduction: {
02801       EndValue = II.transform(BypassBuilder, CountRoundDown);
02802       EndValue->setName("ptr.ind.end");
02803       break;
02804     }
02805     } // end of switch.
02806 
02807     // The new PHI merges the original incoming value, in case of a bypass,
02808     // or the value at the end of the vectorized loop.
02809     for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) {
02810       if (OrigPhi == OldInduction)
02811         ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
02812       else
02813         ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
02814     }
02815     ResumeVal->addIncoming(EndValue, VecBody);
02816 
02817     // Fix the scalar body counter (PHI node).
02818     unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
02819 
02820     // The old induction's phi node in the scalar body needs the truncated
02821     // value.
02822     if (OrigPhi == OldInduction) {
02823       BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]);
02824       OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal);
02825     } else {
02826       BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
02827       OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
02828     }
02829   }
02830 
02831   // If we are generating a new induction variable then we also need to
02832   // generate the code that calculates the exit value. This value is not
02833   // simply the end of the counter because we may skip the vectorized body
02834   // in case of a runtime check.
02835   if (!OldInduction){
02836     assert(!ResumeIndex && "Unexpected resume value found");
02837     ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
02838                                   MiddleBlock->getTerminator());
02839     for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
02840       ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
02841     ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
02842   }
02843 
02844   // Make sure that we found the index where the scalar loop needs to continue.
02845   assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
02846          "Invalid resume Index");
02847 
02848   // Add a check in the middle block to see if we have completed
02849   // all of the iterations in the vector loop.
02850   // If (N - N%VF) == N, then we *don't* need to run the remainder.
02851   Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
02852                                 ResumeIndex, "cmp.n",
02853                                 MiddleBlock->getTerminator());
02854 
02855   BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
02856   // Remove the old terminator.
02857   MiddleBlock->getTerminator()->eraseFromParent();
02858 
02859   // Create the next index value (i + Step) and fill the induction PHINode.
02860   Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
02861   Induction->addIncoming(StartIdx, VectorPH);
02862   Induction->addIncoming(NextIdx, VecBody);
02863   // Create the compare.
02864   Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
02865   Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
02866 
02867   // Now we have two terminators. Remove the old one from the block.
02868   VecBody->getTerminator()->eraseFromParent();
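  // For illustration only (assuming IdxTy is i64 and VF * UF = 4), the latch
  // built above looks roughly like:
  //   %index.next = add i64 %index, 4
  //   %c = icmp eq i64 %index.next, %end.idx.rnd.down
  //   br i1 %c, label %middle.block, label %vector.body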
02869 
02870   // Get ready to start creating new instructions into the vectorized body.
02871   Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
02872 
02873   // Save the state.
02874   LoopVectorPreHeader = VectorPH;
02875   LoopScalarPreHeader = ScalarPH;
02876   LoopMiddleBlock = MiddleBlock;
02877   LoopExitBlock = ExitBlock;
02878   LoopVectorBody.push_back(VecBody);
02879   LoopScalarBody = OldBasicBlock;
02880 
02881   LoopVectorizeHints Hints(Lp, true);
02882   Hints.setAlreadyVectorized();
02883 }
02884 
02885 namespace {
02886 struct CSEDenseMapInfo {
02887   static bool canHandle(Instruction *I) {
02888     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
02889            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
02890   }
02891   static inline Instruction *getEmptyKey() {
02892     return DenseMapInfo<Instruction *>::getEmptyKey();
02893   }
02894   static inline Instruction *getTombstoneKey() {
02895     return DenseMapInfo<Instruction *>::getTombstoneKey();
02896   }
02897   static unsigned getHashValue(Instruction *I) {
02898     assert(canHandle(I) && "Unknown instruction!");
02899     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
02900                                                            I->value_op_end()));
02901   }
02902   static bool isEqual(Instruction *LHS, Instruction *RHS) {
02903     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
02904         LHS == getTombstoneKey() || RHS == getTombstoneKey())
02905       return LHS == RHS;
02906     return LHS->isIdenticalTo(RHS);
02907   }
02908 };
02909 }
02910 
02911 /// \brief Check whether this block is a predicated block.
02912 /// Due to if predication of stores we might create a sequence of "if(pred) a[i]
02913 /// = ...;  " blocks. We start with one vectorized basic block. For every
02914 /// conditional block we split this vectorized block. Therefore, every second
02915 /// block will be a predicated one.
02916 static bool isPredicatedBlock(unsigned BlockNum) {
02917   return BlockNum % 2;
02918 }
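// For illustration only: if LoopVectorBody is {B0, B1, B2, B3}, then B1 and
// B3 (the odd-numbered blocks) are the predicated ones.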
02919 
02920 /// \brief Perform CSE of induction variable instructions.
02921 static void cse(SmallVector<BasicBlock *, 4> &BBs) {
02922   // Perform simple CSE.
02923   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
02924   for (unsigned i = 0, e = BBs.size(); i != e; ++i) {
02925     BasicBlock *BB = BBs[i];
02926     for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
02927       Instruction *In = I++;
02928 
02929       if (!CSEDenseMapInfo::canHandle(In))
02930         continue;
02931 
02932       // Check if we can replace this instruction with any of the
02933       // visited instructions.
02934       if (Instruction *V = CSEMap.lookup(In)) {
02935         In->replaceAllUsesWith(V);
02936         In->eraseFromParent();
02937         continue;
02938       }
02939       // Ignore instructions in conditional blocks. We create "if (pred) a[i] =
02940       // ...;" blocks for predicated stores. Every second block is a predicated
02941       // block.
02942       if (isPredicatedBlock(i))
02943         continue;
02944 
02945       CSEMap[In] = In;
02946     }
02947   }
02948 }
02949 
02950 /// \brief Adds a 'fast' flag to floating point operations.
02951 static Value *addFastMathFlag(Value *V) {
02952   if (isa<FPMathOperator>(V)){
02953     FastMathFlags Flags;
02954     Flags.setUnsafeAlgebra();
02955     cast<Instruction>(V)->setFastMathFlags(Flags);
02956   }
02957   return V;
02958 }
02959 
02960 /// Estimate the overhead of scalarizing a value. Insert and Extract are set if
02961 /// the result needs to be inserted and/or extracted from vectors.
02962 static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
02963                                          const TargetTransformInfo &TTI) {
02964   if (Ty->isVoidTy())
02965     return 0;
02966 
02967   assert(Ty->isVectorTy() && "Can only scalarize vectors");
02968   unsigned Cost = 0;
02969 
02970   for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
02971     if (Insert)
02972       Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, i);
02973     if (Extract)
02974       Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, i);
02975   }
02976 
02977   return Cost;
02978 }
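// For illustration only: for Ty = <4 x float> with both Insert and Extract
// set, the estimate is the sum, over lanes 0..3, of the target's
// insertelement cost plus its extractelement cost for that lane.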
02979 
02980 // Estimate cost of a call instruction CI if it were vectorized with factor VF.
02981 // Return the cost of the instruction, including scalarization overhead if it's
02982 // needed. The flag NeedToScalarize indicates whether the call needs to be
02983 // scalarized - i.e. either a vector version isn't available or is too expensive.
02984 static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
02985                                   const TargetTransformInfo &TTI,
02986                                   const TargetLibraryInfo *TLI,
02987                                   bool &NeedToScalarize) {
02988   Function *F = CI->getCalledFunction();
02989   StringRef FnName = CI->getCalledFunction()->getName();
02990   Type *ScalarRetTy = CI->getType();
02991   SmallVector<Type *, 4> Tys, ScalarTys;
02992   for (auto &ArgOp : CI->arg_operands())
02993     ScalarTys.push_back(ArgOp->getType());
02994 
02995   // Estimate cost of scalarized vector call. The source operands are assumed
02996   // to be vectors, so we need to extract individual elements from there,
02997   // execute VF scalar calls, and then gather the result into the vector return
02998   // value.
02999   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
03000   if (VF == 1)
03001     return ScalarCallCost;
03002 
03003   // Compute corresponding vector type for return value and arguments.
03004   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
03005   for (unsigned i = 0, ie = ScalarTys.size(); i != ie; ++i)
03006     Tys.push_back(ToVectorTy(ScalarTys[i], VF));
03007 
03008   // Compute costs of unpacking argument values for the scalar calls and
03009   // packing the return values to a vector.
03010   unsigned ScalarizationCost =
03011       getScalarizationOverhead(RetTy, true, false, TTI);
03012   for (unsigned i = 0, ie = Tys.size(); i != ie; ++i)
03013     ScalarizationCost += getScalarizationOverhead(Tys[i], false, true, TTI);
03014 
03015   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
03016 
03017   // If we can't emit a vector call for this function, then the currently found
03018   // cost is the cost we need to return.
03019   NeedToScalarize = true;
03020   if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
03021     return Cost;
03022 
03023   // If the corresponding vector cost is cheaper, return its cost.
03024   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
03025   if (VectorCallCost < Cost) {
03026     NeedToScalarize = false;
03027     return VectorCallCost;
03028   }
03029   return Cost;
03030 }
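// For illustration only (costs are target-dependent and the numbers below are
// made up): with VF = 4, a scalar call cost of 10 and a packing/unpacking
// overhead of 8, the scalarized estimate is 4 * 10 + 8 = 48; a vectorized
// library call is preferred only if TLI reports one and its cost is below 48.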
03031 
03032 // Estimate cost of an intrinsic call instruction CI if it were vectorized with
03033 // factor VF.  Return the cost of the instruction, including scalarization
03034 // overhead if it's needed.
03035 static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
03036                                        const TargetTransformInfo &TTI,
03037                                        const TargetLibraryInfo *TLI) {
03038   Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
03039   assert(ID && "Expected intrinsic call!");
03040 
03041   Type *RetTy = ToVectorTy(CI->getType(), VF);
03042   SmallVector<Type *, 4> Tys;
03043   for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
03044     Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
03045 
03046   return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
03047 }
03048 
03049 void InnerLoopVectorizer::vectorizeLoop() {
03050   //===------------------------------------------------===//
03051   //
03052   // Notice: any optimization or new instruction that goes
03053   // into the code below should also be implemented in
03054   // the cost model.
03055   //
03056   //===------------------------------------------------===//
03057   Constant *Zero = Builder.getInt32(0);
03058 
03059   // In order to support reduction variables we need to be able to vectorize
03060   // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
03061   // stages. First, we create a new vector PHI node with no incoming edges.
03062   // We use this value when we vectorize all of the instructions that use the
03063   // PHI. Next, after all of the instructions in the block are complete we
03064   // add the new incoming edges to the PHI. At this point all of the
03065   // instructions in the basic block are vectorized, so we can use them to
03066   // construct the PHI.
03067   PhiVector RdxPHIsToFix;
03068 
03069   // Scan the loop in a topological order to ensure that defs are vectorized
03070   // before users.
03071   LoopBlocksDFS DFS(OrigLoop);
03072   DFS.perform(LI);
03073 
03074   // Vectorize all of the blocks in the original loop.
03075   for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
03076        be = DFS.endRPO(); bb != be; ++bb)
03077     vectorizeBlockInLoop(*bb, &RdxPHIsToFix);
03078 
03079   // At this point every instruction in the original loop is widened to
03080   // a vector form. We are almost done. Now, we need to fix the PHI nodes
03081   // that we vectorized. The PHI nodes are currently empty because we did
03082   // not want to introduce cycles. Notice that the remaining PHI nodes
03083   // that we need to fix are reduction variables.
03084 
03085   // Create the 'reduced' values for each of the reduction vars.
03086   // The reduced values are the vector values that we scalarize and combine
03087   // after the loop is finished.
03088   for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
03089        it != e; ++it) {
03090     PHINode *RdxPhi = *it;
03091     assert(RdxPhi && "Unable to recover vectorized PHI");
03092 
03093     // Find the reduction variable descriptor.
03094     assert(Legal->getReductionVars()->count(RdxPhi) &&
03095            "Unable to find the reduction variable");
03096     RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi];
03097 
03098     RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
03099     TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
03100     Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
03101     RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
03102         RdxDesc.getMinMaxRecurrenceKind();
03103     setDebugLocFromInst(Builder, ReductionStartValue);
03104 
03105     // We need to generate a reduction vector from the incoming scalar.
03106     // To do so, we need to generate the 'identity' vector and override
03107     // one of the elements with the incoming scalar reduction. We need
03108     // to do it in the vector-loop preheader.
03109     Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
03110 
03111     // This is the vector-clone of the value that leaves the loop.
03112     VectorParts &VectorExit = getVectorValue(LoopExitInst);
03113     Type *VecTy = VectorExit[0]->getType();
03114 
03115     // Find the reduction identity value: zero for addition, or and xor;
03116     // one for multiplication; all-ones (-1) for and.
03117     Value *Identity;
03118     Value *VectorStart;
03119     if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
03120         RK == RecurrenceDescriptor::RK_FloatMinMax) {
03121       // MinMax reductions have the start value as their identity.
03122       if (VF == 1) {
03123         VectorStart = Identity = ReductionStartValue;
03124       } else {
03125         VectorStart = Identity =
03126             Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
03127       }
03128     } else {
03129       // Handle other reduction kinds:
03130       Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
03131           RK, VecTy->getScalarType());
03132       if (VF == 1) {
03133         Identity = Iden;
03134         // This vector is the Identity vector where the first element is the
03135         // incoming scalar reduction.
03136         VectorStart = ReductionStartValue;
03137       } else {
03138         Identity = ConstantVector::getSplat(VF, Iden);
03139 
03140         // This vector is the Identity vector where the first element is the
03141         // incoming scalar reduction.
03142         VectorStart =
03143             Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
03144       }
03145     }
03146 
03147     // Fix the vector-loop phi.
03148 
03149     // Reductions do not have to start at zero. They can start with
03150     // any loop-invariant value.
03151     VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
03152     BasicBlock *Latch = OrigLoop->getLoopLatch();
03153     Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
03154     VectorParts &Val = getVectorValue(LoopVal);
03155     for (unsigned part = 0; part < UF; ++part) {
03156       // Make sure to add the reduction start value only to the
03157       // first unroll part.
03158       Value *StartVal = (part == 0) ? VectorStart : Identity;
03159       cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal,
03160                                                   LoopVectorPreHeader);
03161       cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
03162                                                   LoopVectorBody.back());
03163     }
03164 
03165     // Before each round, move the insertion point right between
03166     // the PHIs and the values we are going to write.
03167     // This allows us to write both PHINodes and the extractelement
03168     // instructions.
03169     Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
03170 
03171     VectorParts RdxParts;
03172     setDebugLocFromInst(Builder, LoopExitInst);
03173     for (unsigned part = 0; part < UF; ++part) {
03174       // This PHINode contains the vectorized reduction variable, or
03175       // the initial value vector, if we bypass the vector loop.
03176       VectorParts &RdxExitVal = getVectorValue(LoopExitInst);
03177       PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
03178       Value *StartVal = (part == 0) ? VectorStart : Identity;
03179       for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
03180         NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
03181       NewPhi->addIncoming(RdxExitVal[part],
03182                           LoopVectorBody.back());
03183       RdxParts.push_back(NewPhi);
03184     }
03185 
03186     // Reduce all of the unrolled parts into a single vector.
03187     Value *ReducedPartRdx = RdxParts[0];
03188     unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
03189     setDebugLocFromInst(Builder, ReducedPartRdx);
03190     for (unsigned part = 1; part < UF; ++part) {
03191       if (Op != Instruction::ICmp && Op != Instruction::FCmp)
03192         // Floating point operations had to be 'fast' to enable the reduction.
03193         ReducedPartRdx = addFastMathFlag(
03194             Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
03195                                 ReducedPartRdx, "bin.rdx"));
03196       else
03197         ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
03198             Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]);
03199     }
03200 
03201     if (VF > 1) {
03202       // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
03203       // and vector ops, reducing the set of values being computed by half each
03204       // round.
03205       assert(isPowerOf2_32(VF) &&
03206              "Reduction emission only supported for pow2 vectors!");
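      // For illustration only: with VF = 4 the loop below runs two rounds.
      // Round one shuffles with mask <2, 3, undef, undef> and combines lane 0
      // with lane 2 and lane 1 with lane 3; round two shuffles with mask
      // <1, undef, undef, undef> and combines the two remaining partial
      // results, leaving the final value in lane 0.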
03207       Value *TmpVec = ReducedPartRdx;
03208       SmallVector<Constant*, 32> ShuffleMask(VF, nullptr);
03209       for (unsigned i = VF; i != 1; i >>= 1) {
03210         // Move the upper half of the vector to the lower half.
03211         for (unsigned j = 0; j != i/2; ++j)
03212           ShuffleMask[j] = Builder.getInt32(i/2 + j);
03213 
03214         // Fill the rest of the mask with undef.
03215         std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
03216                   UndefValue::get(Builder.getInt32Ty()));
03217 
03218         Value *Shuf =
03219         Builder.CreateShuffleVector(TmpVec,
03220                                     UndefValue::get(TmpVec->getType()),
03221                                     ConstantVector::get(ShuffleMask),
03222                                     "rdx.shuf");
03223 
03224         if (Op != Instruction::ICmp && Op != Instruction::FCmp)
03225           // Floating point operations had to be 'fast' to enable the reduction.
03226           TmpVec = addFastMathFlag(Builder.CreateBinOp(
03227               (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
03228         else
03229           TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
03230                                                         TmpVec, Shuf);
03231       }
03232 
03233       // The result is in the first element of the vector.
03234       ReducedPartRdx = Builder.CreateExtractElement(TmpVec,
03235                                                     Builder.getInt32(0));
03236     }
03237 
03238     // Create a phi node that merges control-flow from the backedge-taken check
03239     // block and the middle block.
03240     PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
03241                                           LoopScalarPreHeader->getTerminator());
03242     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[0]);
03243     BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
03244 
03245     // Now, we need to fix the users of the reduction variable
03246     // inside and outside of the scalar remainder loop.
03247     // We know that the loop is in LCSSA form. We need to update the
03248     // PHI nodes in the exit blocks.
03249     for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
03250          LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
03251       PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
03252       if (!LCSSAPhi) break;
03253 
03254       // All PHINodes need to have a single entry edge, or two if
03255       // we already fixed them.
03256       assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
03257 
03258       // We found our reduction value exit-PHI. Update it with the
03259       // incoming bypass edge.
03260       if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) {
03261         // Add an edge coming from the bypass.
03262         LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
03263         break;
03264       }
03265     }// end of the LCSSA phi scan.
03266 
03267     // Fix the scalar loop reduction variable with the incoming reduction sum
03268     // from the vector body and from the backedge value.
03269     int IncomingEdgeBlockIdx =
03270     (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
03271     assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
03272     // Pick the other block.
03273     int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
03274     (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
03275     (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
03276   }// end of for each redux variable.
03277 
03278   fixLCSSAPHIs();
03279 
03280   // Remove redundant induction instructions.
03281   cse(LoopVectorBody);
03282 }
03283 
03284 void InnerLoopVectorizer::fixLCSSAPHIs() {
03285   for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
03286        LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
03287     PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
03288     if (!LCSSAPhi) break;
03289     if (LCSSAPhi->getNumIncomingValues() == 1)
03290       LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
03291                             LoopMiddleBlock);
03292   }
03293 }
03294 
03295 InnerLoopVectorizer::VectorParts
03296 InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
03297   assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
03298          "Invalid edge");
03299 
03300   // Look for cached value.
03301   std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst);
03302   EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge);
03303   if (ECEntryIt != MaskCache.end())
03304     return ECEntryIt->second;
03305 
03306   VectorParts SrcMask = createBlockInMask(Src);
03307 
03308   // The terminator has to be a branch inst!
03309   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
03310   assert(BI && "Unexpected terminator found");
03311 
03312   if (BI->isConditional()) {
03313     VectorParts EdgeMask = getVectorValue(BI->getCondition());
03314 
03315     if (BI->getSuccessor(0) != Dst)
03316       for (unsigned part = 0; part < UF; ++part)
03317         EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
03318 
03319     for (unsigned part = 0; part < UF; ++part)
03320       EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
03321 
03322     MaskCache[Edge] = EdgeMask;
03323     return EdgeMask;
03324   }
03325 
03326   MaskCache[Edge] = SrcMask;
03327   return SrcMask;
03328 }
03329 
03330 InnerLoopVectorizer::VectorParts
03331 InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
03332   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
03333 
03334   // Loop incoming mask is all-one.
03335   if (OrigLoop->getHeader() == BB) {
03336     Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
03337     return getVectorValue(C);
03338   }
03339 
03340   // This is the block mask. We OR together all incoming edge masks, starting from zero.
03341   Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
03342   VectorParts BlockMask = getVectorValue(Zero);
03343 
03344   // For each pred:
03345   for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
03346     VectorParts EM = createEdgeMask(*it, BB);
03347     for (unsigned part = 0; part < UF; ++part)
03348       BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
03349   }
03350 
03351   return BlockMask;
03352 }
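// For illustration only: in an if-converted diamond where the header branches
// on %c to blocks T and F, which both fall through to block B, the masks of T
// and F are roughly (%c & mask(header)) and (~%c & mask(header)), and the
// block mask of B is the OR of the two.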
03353 
03354 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
03355                                               InnerLoopVectorizer::VectorParts &Entry,
03356                                               unsigned UF, unsigned VF, PhiVector *PV) {
03357   PHINode* P = cast<PHINode>(PN);
03358   // Handle reduction variables:
03359   if (Legal->getReductionVars()->count(P)) {
03360     for (unsigned part = 0; part < UF; ++part) {
03361       // This is phase one of vectorizing PHIs.
03362       Type *VecTy = (VF == 1) ? PN->getType() :
03363       VectorType::get(PN->getType(), VF);
03364       Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
03365                                     LoopVectorBody.back()-> getFirstInsertionPt());
03366     }
03367     PV->push_back(P);
03368     return;
03369   }
03370 
03371   setDebugLocFromInst(Builder, P);
03372   // Check for PHI nodes that are lowered to vector selects.
03373   if (P->getParent() != OrigLoop->getHeader()) {
03374     // We know that all PHIs in non-header blocks are converted into
03375     // selects, so we don't have to worry about the insertion order and we
03376     // can just use the builder.
03377     // At this point we generate the predication tree. There may be
03378     // duplications since this is a simple recursive scan, but future
03379     // optimizations will clean it up.
03380 
03381     unsigned NumIncoming = P->getNumIncomingValues();
03382 
03383     // Generate a sequence of selects of the form:
03384     // SELECT(Mask3, In3,
03385     //      SELECT(Mask2, In2,
03386     //                   ( ...)))
03387     for (unsigned In = 0; In < NumIncoming; In++) {
03388       VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
03389                                         P->getParent());
03390       VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
03391 
03392       for (unsigned part = 0; part < UF; ++part) {
03393         // We might have single edge PHIs (blocks) - use an identity
03394         // 'select' for the first PHI operand.
03395         if (In == 0)
03396           Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
03397                                              In0[part]);
03398         else
03399           // Select between the current value and the previous incoming edge
03400           // based on the incoming mask.
03401           Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
03402                                              Entry[part], "predphi");
03403       }
03404     }
03405     return;
03406   }
03407 
03408   // This PHINode must be an induction variable.
03409   // Make sure that we know about it.
03410   assert(Legal->getInductionVars()->count(P) &&
03411          "Not an induction variable");
03412 
03413   LoopVectorizationLegality::InductionInfo II =
03414   Legal->getInductionVars()->lookup(P);
03415 
03416   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
03417   // which can be found from the original scalar operations.
03418   switch (II.IK) {
03419     case LoopVectorizationLegality::IK_NoInduction:
03420       llvm_unreachable("Unknown induction");
03421     case LoopVectorizationLegality::IK_IntInduction: {
03422       assert(P->getType() == II.StartValue->getType() && "Types must match");
03423       Type *PhiTy = P->getType();
03424       Value *Broadcasted;
03425       if (P == OldInduction) {
03426         // Handle the canonical induction variable. We might have had to
03427         // extend the type.
03428         Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
03429       } else {
03430         // Handle other induction variables that are now based on the
03431         // canonical one.
03432         Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
03433                                                  "normalized.idx");
03434         NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
03435         Broadcasted = II.transform(Builder, NormalizedIdx);
03436         Broadcasted->setName("offset.idx");
03437       }
03438       Broadcasted = getBroadcastInstrs(Broadcasted);
03439       // After broadcasting the induction variable we need to make the vector
03440       // consecutive by adding 0, 1, 2, etc.
03441       for (unsigned part = 0; part < UF; ++part)
03442         Entry[part] = getStepVector(Broadcasted, VF * part, II.StepValue);
03443       return;
03444     }
03445     case LoopVectorizationLegality::IK_PtrInduction:
03446       // Handle the pointer induction variable case.
03447       assert(P->getType()->isPointerTy() && "Unexpected type.");
03448       // This is the normalized GEP that starts counting at zero.
03449       Value *NormalizedIdx =
03450           Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx");
03451       // This is the vector of results. Notice that we don't generate
03452       // vector geps because scalar geps result in better code.
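      // For illustration only: with VF = 4, lane i of part 0 receives a scalar
      // GEP (named "next.gep") computed from the pointer induction at offset
      // normalized.idx + i; part 1 starts at offset normalized.idx + 4, and so
      // on.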
03453       for (unsigned part = 0; part < UF; ++part) {
03454         if (VF == 1) {
03455           int EltIndex = part;
03456           Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
03457           Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx);
03458           Value *SclrGep = II.transform(Builder, GlobalIdx);
03459           SclrGep->setName("next.gep");
03460           Entry[part] = SclrGep;
03461           continue;
03462         }
03463 
03464         Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
03465         for (unsigned int i = 0; i < VF; ++i) {
03466           int EltIndex = i + part * VF;
03467           Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
03468           Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx);
03469           Value *SclrGep = II.transform(Builder, GlobalIdx);
03470           SclrGep->setName("next.gep");
03471           VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
03472                                                Builder.getInt32(i),
03473                                                "insert.gep");
03474         }
03475         Entry[part] = VecVal;
03476       }
03477       return;
03478   }
03479 }
03480 
03481 void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
03482   // For each instruction in the old loop.
03483   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
03484     VectorParts &Entry = WidenMap.get(it);
03485     switch (it->getOpcode()) {
03486     case Instruction::Br:
03487       // Nothing to do for PHIs and BR, since we already took care of the
03488       // loop control flow instructions.
03489       continue;
03490     case Instruction::PHI: {
03491       // Vectorize PHINodes.
03492       widenPHIInstruction(it, Entry, UF, VF, PV);
03493       continue;
03494     }// End of PHI.
03495 
03496     case Instruction::Add:
03497     case Instruction::FAdd:
03498     case Instruction::Sub:
03499     case Instruction::FSub:
03500     case Instruction::Mul:
03501     case Instruction::FMul:
03502     case Instruction::UDiv:
03503     case Instruction::SDiv:
03504     case Instruction::FDiv:
03505     case Instruction::URem:
03506     case Instruction::SRem:
03507     case Instruction::FRem:
03508     case Instruction::Shl:
03509     case Instruction::LShr:
03510     case Instruction::AShr:
03511     case Instruction::And:
03512     case Instruction::Or:
03513     case Instruction::Xor: {
03514       // Just widen binops.
03515       BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
03516       setDebugLocFromInst(Builder, BinOp);
03517       VectorParts &A = getVectorValue(it->getOperand(0));
03518       VectorParts &B = getVectorValue(it->getOperand(1));
03519 
03520       // Use this vector value for all users of the original instruction.
03521       for (unsigned Part = 0; Part < UF; ++Part) {
03522         Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
03523 
03524         if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
03525           VecOp->copyIRFlags(BinOp);
03526 
03527         Entry[Part] = V;
03528       }
03529 
03530       propagateMetadata(Entry, it);
03531       break;
03532     }
03533     case Instruction::Select: {
03534       // Widen selects.
03535       // If the selector is loop invariant we can create a select
03536       // instruction with a scalar condition. Otherwise, use vector-select.
03537       bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
03538                                                OrigLoop);
03539       setDebugLocFromInst(Builder, it);
03540 
03541       // The condition can be loop invariant but still defined inside the
03542       // loop. This means that we can't just use the original 'cond' value.
03543       // We have to take the 'vectorized' value and pick the first lane.
03544       // Instcombine will make this a no-op.
03545       VectorParts &Cond = getVectorValue(it->getOperand(0));
03546       VectorParts &Op0  = getVectorValue(it->getOperand(1));
03547       VectorParts &Op1  = getVectorValue(it->getOperand(2));
03548 
03549       Value *ScalarCond = (VF == 1) ? Cond[0] :
03550         Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
03551 
03552       for (unsigned Part = 0; Part < UF; ++Part) {
03553         Entry[Part] = Builder.CreateSelect(
03554           InvariantCond ? ScalarCond : Cond[Part],
03555           Op0[Part],
03556           Op1[Part]);
03557       }
03558 
03559       propagateMetadata(Entry, it);
03560       break;
03561     }
03562 
03563     case Instruction::ICmp:
03564     case Instruction::FCmp: {
03565       // Widen compares. Generate vector compares.
03566       bool FCmp = (it->getOpcode() == Instruction::FCmp);
03567       CmpInst *Cmp = dyn_cast<CmpInst>(it);
03568       setDebugLocFromInst(Builder, it);
03569       VectorParts &A = getVectorValue(it->getOperand(0));
03570       VectorParts &B = getVectorValue(it->getOperand(1));
03571       for (unsigned Part = 0; Part < UF; ++Part) {
03572         Value *C = nullptr;
03573         if (FCmp)
03574           C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
03575         else
03576           C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
03577         Entry[Part] = C;
03578       }
03579 
03580       propagateMetadata(Entry, it);
03581       break;
03582     }
03583 
03584     case Instruction::Store:
03585     case Instruction::Load:
03586       vectorizeMemoryInstruction(it);
03587         break;
03588     case Instruction::ZExt:
03589     case Instruction::SExt:
03590     case Instruction::FPToUI:
03591     case Instruction::FPToSI:
03592     case Instruction::FPExt:
03593     case Instruction::PtrToInt:
03594     case Instruction::IntToPtr:
03595     case Instruction::SIToFP:
03596     case Instruction::UIToFP:
03597     case Instruction::Trunc:
03598     case Instruction::FPTrunc:
03599     case Instruction::BitCast: {
03600       CastInst *CI = dyn_cast<CastInst>(it);
03601       setDebugLocFromInst(Builder, it);
03602       /// Optimize the special case where the source is the induction
03603       /// variable. Notice that we can only optimize the 'trunc' case
03604       /// because: a. FP conversions lose precision, b. sext/zext may wrap,
03605       /// c. other casts depend on pointer size.
03606       if (CI->getOperand(0) == OldInduction &&
03607           it->getOpcode() == Instruction::Trunc) {
03608         Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
03609                                                CI->getType());
03610         Value *Broadcasted = getBroadcastInstrs(ScalarCast);
03611         LoopVectorizationLegality::InductionInfo II =
03612             Legal->getInductionVars()->lookup(OldInduction);
03613         Constant *Step =
03614             ConstantInt::getSigned(CI->getType(), II.StepValue->getSExtValue());
03615         for (unsigned Part = 0; Part < UF; ++Part)
03616           Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
03617         propagateMetadata(Entry, it);
03618         break;
03619       }
03620       /// Vectorize casts.
03621       Type *DestTy = (VF == 1) ? CI->getType() :
03622                                  VectorType::get(CI->getType(), VF);
03623 
03624       VectorParts &A = getVectorValue(it->getOperand(0));
03625       for (unsigned Part = 0; Part < UF; ++Part)
03626         Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
03627       propagateMetadata(Entry, it);
03628       break;
03629     }
03630 
03631     case Instruction::Call: {
03632       // Ignore dbg intrinsics.
03633       if (isa<DbgInfoIntrinsic>(it))
03634         break;
03635       setDebugLocFromInst(Builder, it);
03636 
03637       Module *M = BB->getParent()->getParent();
03638       CallInst *CI = cast<CallInst>(it);
03639 
03640       StringRef FnName = CI->getCalledFunction()->getName();
03641       Function *F = CI->getCalledFunction();
03642       Type *RetTy = ToVectorTy(CI->getType(), VF);
03643       SmallVector<Type *, 4> Tys;
03644       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
03645         Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
03646 
03647       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
03648       if (ID &&
03649           (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
03650            ID == Intrinsic::lifetime_start)) {
03651         scalarizeInstruction(it);
03652         break;
03653       }
03654       // The flag shows whether we use an intrinsic or an ordinary call for the
03655       // vectorized version of the instruction, i.e. whether it is beneficial to
03656       // emit an intrinsic call rather than a library call.
03657       bool NeedToScalarize;
03658       unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
03659       bool UseVectorIntrinsic =
03660           ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
03661       if (!UseVectorIntrinsic && NeedToScalarize) {
03662         scalarizeInstruction(it);
03663         break;
03664       }
03665 
03666       for (unsigned Part = 0; Part < UF; ++Part) {
03667         SmallVector<Value *, 4> Args;
03668         for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
03669           Value *Arg = CI->getArgOperand(i);
03670           // Some intrinsics have a scalar argument - don't replace it with a
03671           // vector.
03672           if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
03673             VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
03674             Arg = VectorArg[Part];
03675           }
03676           Args.push_back(Arg);
03677         }
03678 
03679         Function *VectorF;
03680         if (UseVectorIntrinsic) {
03681           // Use vector version of the intrinsic.
03682           Type *TysForDecl[] = {CI->getType()};
03683           if (VF > 1)
03684             TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
03685           VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
03686         } else {
03687           // Use vector version of the library call.
03688           StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
03689           assert(!VFnName.empty() && "Vector function name is empty.");
03690           VectorF = M->getFunction(VFnName);
03691           if (!VectorF) {
03692             // Generate a declaration
03693             FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
03694             VectorF =
03695                 Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
03696             VectorF->copyAttributesFrom(F);
03697           }
03698         }
03699         assert(VectorF && "Can't create vector function.");
03700         Entry[Part] = Builder.CreateCall(VectorF, Args);
03701       }
03702 
03703       propagateMetadata(Entry, it);
03704       break;
03705     }
03706 
03707     default:
03708       // All other instructions are unsupported. Scalarize them.
03709       scalarizeInstruction(it);
03710       break;
03711     }// end of switch.
03712   }// end of for_each instr.
03713 }
03714 
03715 void InnerLoopVectorizer::updateAnalysis() {
03716   // Forget the original basic block.
03717   SE->forgetLoop(OrigLoop);
03718 
03719   // Update the dominator tree information.
03720   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
03721          "Entry does not dominate exit.");
03722 
03723   for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
03724     DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
03725   DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
03726 
03727   // Due to if predication of stores we might create a sequence of "if(pred)
03728   // a[i] = ...;  " blocks.
03729   for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) {
03730     if (i == 0)
03731       DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
03732     else if (isPredicatedBlock(i)) {
03733       DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]);
03734     } else {
03735       DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]);
03736     }
03737   }
03738 
03739   DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);
03740   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
03741   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
03742   DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
03743 
03744   DEBUG(DT->verifyDomTree());
03745 }
03746 
03747 /// \brief Check whether it is safe to if-convert this phi node.
03748 ///
03749 /// Phi nodes with constant expressions that can trap are not safe to
03750 /// if-convert.
03751 static bool canIfConvertPHINodes(BasicBlock *BB) {
03752   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
03753     PHINode *Phi = dyn_cast<PHINode>(I);
03754     if (!Phi)
03755       return true;
03756     for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p)
03757       if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p)))
03758         if (C->canTrap())
03759           return false;
03760   }
03761   return true;
03762 }
03763 
03764 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
03765   if (!EnableIfConversion) {
03766     emitAnalysis(VectorizationReport() << "if-conversion is disabled");
03767     return false;
03768   }
03769 
03770   assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
03771 
03772   // A list of pointers that we can safely read and write to.
03773   SmallPtrSet<Value *, 8> SafePointers;
03774 
03775   // Collect safe addresses.
03776   for (Loop::block_iterator BI = TheLoop->block_begin(),
03777          BE = TheLoop->block_end(); BI != BE; ++BI) {
03778     BasicBlock *BB = *BI;
03779 
03780     if (blockNeedsPredication(BB))
03781       continue;
03782 
03783     for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
03784       if (LoadInst *LI = dyn_cast<LoadInst>(I))
03785         SafePointers.insert(LI->getPointerOperand());
03786       else if (StoreInst *SI = dyn_cast<StoreInst>(I))
03787         SafePointers.insert(SI->getPointerOperand());
03788     }
03789   }
03790 
03791   // Collect the blocks that need predication.
03792   BasicBlock *Header = TheLoop->getHeader();
03793   for (Loop::block_iterator BI = TheLoop->block_begin(),
03794          BE = TheLoop->block_end(); BI != BE; ++BI) {
03795     BasicBlock *BB = *BI;
03796 
03797     // We don't support switch statements inside loops.
03798     if (!isa<BranchInst>(BB->getTerminator())) {
03799       emitAnalysis(VectorizationReport(BB->getTerminator())
03800                    << "loop contains a switch statement");
03801       return false;
03802     }
03803 
03804     // We must be able to predicate all blocks that need to be predicated.
03805     if (blockNeedsPredication(BB)) {
03806       if (!blockCanBePredicated(BB, SafePointers)) {
03807         emitAnalysis(VectorizationReport(BB->getTerminator())
03808                      << "control flow cannot be substituted for a select");
03809         return false;
03810       }
03811     } else if (BB != Header && !canIfConvertPHINodes(BB)) {
03812       emitAnalysis(VectorizationReport(BB->getTerminator())
03813                    << "control flow cannot be substituted for a select");
03814       return false;
03815     }
03816   }
03817 
03818   // We can if-convert this loop.
03819   return true;
03820 }
03821 
03822 bool LoopVectorizationLegality::canVectorize() {
03823   // We must have a loop in canonical form. Loops with indirectbr in them cannot
03824   // be canonicalized.
03825   if (!TheLoop->getLoopPreheader()) {
03826     emitAnalysis(
03827         VectorizationReport() <<
03828         "loop control flow is not understood by vectorizer");
03829     return false;
03830   }
03831 
03832   // We can only vectorize innermost loops.
03833   if (!TheLoop->getSubLoopsVector().empty()) {
03834     emitAnalysis(VectorizationReport() << "loop is not the innermost loop");
03835     return false;
03836   }
03837 
03838   // We must have a single backedge.
03839   if (TheLoop->getNumBackEdges() != 1) {
03840     emitAnalysis(
03841         VectorizationReport() <<
03842         "loop control flow is not understood by vectorizer");
03843     return false;
03844   }
03845 
03846   // We must have a single exiting block.
03847   if (!TheLoop->getExitingBlock()) {
03848     emitAnalysis(
03849         VectorizationReport() <<
03850         "loop control flow is not understood by vectorizer");
03851     return false;
03852   }
03853 
03854   // We only handle bottom-tested loops, i.e. loops in which the condition is
03855   // checked at the end of each iteration. With that we can assume that all
03856   // instructions in the loop are executed the same number of times.
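        // For example, a loop of the form
        //   do { body; ++i; } while (i < n);
        // is bottom-tested: its latch block is also its single exiting block.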
03857   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
03858     emitAnalysis(
03859         VectorizationReport() <<
03860         "loop control flow is not understood by vectorizer");
03861     return false;
03862   }
03863 
03864   // We need to have a loop header.
03865   DEBUG(dbgs() << "LV: Found a loop: " <<
03866         TheLoop->getHeader()->getName() << '\n');
03867 
03868   // Check if we can if-convert non-single-bb loops.
03869   unsigned NumBlocks = TheLoop->getNumBlocks();
03870   if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
03871     DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
03872     return false;
03873   }
03874 
03875   // ScalarEvolution needs to be able to find the exit count.
03876   const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
03877   if (ExitCount == SE->getCouldNotCompute()) {
03878     emitAnalysis(VectorizationReport() <<
03879                  "could not determine number of loop iterations");
03880     DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
03881     return false;
03882   }
03883 
03884   // Check if we can vectorize the instructions and CFG in this loop.
03885   if (!canVectorizeInstrs()) {
03886     DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
03887     return false;
03888   }
03889 
03890   // Go over each instruction and look at memory deps.
03891   if (!canVectorizeMemory()) {
03892     DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
03893     return false;
03894   }
03895 
03896   // Collect all of the variables that remain uniform after vectorization.
03897   collectLoopUniforms();
03898 
03899   DEBUG(dbgs() << "LV: We can vectorize this loop" <<
03900         (LAI->getRuntimePointerCheck()->Need ? " (with a runtime bound check)" :
03901          "")
03902         <<"!\n");
03903 
03904   // Analyze interleaved memory accesses.
03905   if (EnableInterleavedMemAccesses)
03906     InterleaveInfo.analyzeInterleaving(Strides);
03907 
03908   // Okay! We can vectorize. At this point we don't have any other mem analysis
03909   // which may limit our maximum vectorization factor, so just return true with
03910   // no restrictions.
03911   return true;
03912 }
03913 
03914 static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
03915   if (Ty->isPointerTy())
03916     return DL.getIntPtrType(Ty);
03917 
03918   // It is possible that chars or shorts overflow when we ask for the loop's
03919   // trip count; work around this by widening the type.
03920   if (Ty->getScalarSizeInBits() < 32)
03921     return Type::getInt32Ty(Ty->getContext());
03922 
03923   return Ty;
03924 }
03925 
03926 static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
03927   Ty0 = convertPointerToIntegerType(DL, Ty0);
03928   Ty1 = convertPointerToIntegerType(DL, Ty1);
03929   if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
03930     return Ty0;
03931   return Ty1;
03932 }
03933 
03934 /// \brief Check that the instruction has outside loop users and is not an
03935 /// identified reduction variable.
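      /// For example (hypothetical source), in
      ///   for (i = 0; i < n; ++i) last = a[i];
      ///   use(last);
      /// the value "last" is read after the loop; unless it is a recognized
      /// reduction, such an outside user blocks vectorization.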
03936 static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
03937                                SmallPtrSetImpl<Value *> &Reductions) {
03938   // Reduction instructions are allowed to have exit users. All other
03939   // instructions must not have external users.
03940   if (!Reductions.count(Inst))
03941       // Check that all of the users of the instruction are inside the loop.
03942     for (User *U : Inst->users()) {
03943       Instruction *UI = cast<Instruction>(U);
03944       // This user may be a reduction exit value.
03945       if (!TheLoop->contains(UI)) {
03946         DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
03947         return true;
03948       }
03949     }
03950   return false;
03951 }
03952 
03953 bool LoopVectorizationLegality::canVectorizeInstrs() {
03954   BasicBlock *PreHeader = TheLoop->getLoopPreheader();
03955   BasicBlock *Header = TheLoop->getHeader();
03956 
03957   // Look for the attribute signaling the absence of NaNs.
03958   Function &F = *Header->getParent();
03959   const DataLayout &DL = F.getParent()->getDataLayout();
03960   if (F.hasFnAttribute("no-nans-fp-math"))
03961     HasFunNoNaNAttr =
03962         F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
03963 
03964   // For each block in the loop.
03965   for (Loop::block_iterator bb = TheLoop->block_begin(),
03966        be = TheLoop->block_end(); bb != be; ++bb) {
03967 
03968     // Scan the instructions in the block and look for hazards.
03969     for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
03970          ++it) {
03971 
03972       if (PHINode *Phi = dyn_cast<PHINode>(it)) {
03973         Type *PhiTy = Phi->getType();
03974         // Check that this PHI type is allowed.
03975         if (!PhiTy->isIntegerTy() &&
03976             !PhiTy->isFloatingPointTy() &&
03977             !PhiTy->isPointerTy()) {
03978           emitAnalysis(VectorizationReport(it)
03979                        << "loop control flow is not understood by vectorizer");
03980           DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
03981           return false;
03982         }
03983 
03984         // If this PHINode is not in the header block, then we know that we
03985         // can convert it to select during if-conversion. No need to check if
03986         // the PHIs in this block are induction or reduction variables.
03987         if (*bb != Header) {
03988           // Check that this instruction has no outside users or is an
03989           // identified reduction value with an outside user.
03990           if (!hasOutsideLoopUser(TheLoop, it, AllowedExit))
03991             continue;
03992           emitAnalysis(VectorizationReport(it) <<
03993                        "value could not be identified as "
03994                        "an induction or reduction variable");
03995           return false;
03996         }
03997 
03998         // We only allow if-converted PHIs with exactly two incoming values.
03999         if (Phi->getNumIncomingValues() != 2) {
04000           emitAnalysis(VectorizationReport(it)
04001                        << "control flow not understood by vectorizer");
04002           DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
04003           return false;
04004         }
04005 
04006         // This is the value coming from the preheader.
04007         Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
04008         ConstantInt *StepValue = nullptr;
04009         // Check if this is an induction variable.
04010         InductionKind IK = isInductionVariable(Phi, StepValue);
04011 
04012         if (IK_NoInduction != IK) {
04013           // Get the widest type.
04014           if (!WidestIndTy)
04015             WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
04016           else
04017             WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
04018 
04019           // Int inductions are special because we only allow one IV.
04020           if (IK == IK_IntInduction && StepValue->isOne()) {
04021             // Use the phi node with the widest type as induction. Use the last
04022             // one if there are multiple (no good reason for doing this other
04023             // than it is expedient).
04024             if (!Induction || PhiTy == WidestIndTy)
04025               Induction = Phi;
04026           }
04027 
04028           DEBUG(dbgs() << "LV: Found an induction variable.\n");
04029           Inductions[Phi] = InductionInfo(StartValue, IK, StepValue);
04030 
04031           // Until we explicitly handle the case of an induction variable with
04032           // an outside loop user we have to give up vectorizing this loop.
04033           if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
04034             emitAnalysis(VectorizationReport(it) <<
04035                          "use of induction value outside of the "
04036                          "loop is not handled by vectorizer");
04037             return false;
04038           }
04039 
04040           continue;
04041         }
04042 
04043         if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop,
04044                                                  Reductions[Phi])) {
04045           AllowedExit.insert(Reductions[Phi].getLoopExitInstr());
04046           continue;
04047         }
04048 
04049         emitAnalysis(VectorizationReport(it) <<
04050                      "value that could not be identified as "
04051                      "reduction is used outside the loop");
04052         DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n");
04053         return false;
04054       }// end of PHI handling
04055 
04056       // We handle calls that:
04057       //   * Are debug info intrinsics.
04058       //   * Have a mapping to an IR intrinsic.
04059       //   * Have a vector version available.
04060       CallInst *CI = dyn_cast<CallInst>(it);
04061       if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) &&
04062           !(CI->getCalledFunction() && TLI &&
04063             TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
04064         emitAnalysis(VectorizationReport(it) <<
04065                      "call instruction cannot be vectorized");
04066         DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
04067         return false;
04068       }
04069 
04070       // Intrinsics such as powi, cttz and ctlz are legal to vectorize if the
04071       // second argument is the same (i.e. loop invariant).
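            // For example, powi(x[i], 3) can be vectorized because the exponent
            // is the same for every iteration, whereas powi(x[i], i) cannot,
            // since the scalar operand would differ across the vector lanes.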
04072       if (CI &&
04073           hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
04074         if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) {
04075           emitAnalysis(VectorizationReport(it)
04076                        << "intrinsic instruction cannot be vectorized");
04077           DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
04078           return false;
04079         }
04080       }
04081 
04082       // Check that the instruction return type is vectorizable.
04083       // Also, we can't vectorize extractelement instructions.
04084       if ((!VectorType::isValidElementType(it->getType()) &&
04085            !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
04086         emitAnalysis(VectorizationReport(it)
04087                      << "instruction return type cannot be vectorized");
04088         DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
04089         return false;
04090       }
04091 
04092       // Check that the stored type is vectorizable.
04093       if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
04094         Type *T = ST->getValueOperand()->getType();
04095         if (!VectorType::isValidElementType(T)) {
04096           emitAnalysis(VectorizationReport(ST) <<
04097                        "store instruction cannot be vectorized");
04098           return false;
04099         }
04100         if (EnableMemAccessVersioning)
04101           collectStridedAccess(ST);
04102       }
04103 
04104       if (EnableMemAccessVersioning)
04105         if (LoadInst *LI = dyn_cast<LoadInst>(it))
04106           collectStridedAccess(LI);
04107 
04108       // Reduction instructions are allowed to have exit users.
04109       // All other instructions must not have external users.
04110       if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
04111         emitAnalysis(VectorizationReport(it) <<
04112                      "value cannot be used outside the loop");
04113         return false;
04114       }
04115 
04116     } // next instr.
04117 
04118   }
04119 
04120   if (!Induction) {
04121     DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
04122     if (Inductions.empty()) {
04123       emitAnalysis(VectorizationReport()
04124                    << "loop induction variable could not be identified");
04125       return false;
04126     }
04127   }
04128 
04129   return true;
04130 }
04131 
04132 /// \brief If all of the GEP's indices except the induction operand are loop
04133 /// invariant, return that induction operand; otherwise return the pointer.
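      /// For example (hypothetical IR), for "getelementptr %A, i64 0, i64 %i",
      /// where %A and the constant index are loop invariant, the returned value
      /// is %i.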
04134 static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
04135   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
04136   if (!GEP)
04137     return Ptr;
04138 
04139   unsigned InductionOperand = getGEPInductionOperand(GEP);
04140 
04141   // Check that all of the gep indices are uniform except for our induction
04142   // operand.
04143   for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
04144     if (i != InductionOperand &&
04145         !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
04146       return Ptr;
04147   return GEP->getOperand(InductionOperand);
04148 }
04149 
04150 /// \brief Look for the unique cast use of the passed value with the given type.
04151 static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
04152   Value *UniqueCast = nullptr;
04153   for (User *U : Ptr->users()) {
04154     CastInst *CI = dyn_cast<CastInst>(U);
04155     if (CI && CI->getType() == Ty) {
04156       if (!UniqueCast)
04157         UniqueCast = CI;
04158       else
04159         return nullptr;
04160     }
04161   }
04162   return UniqueCast;
04163 }
04164 
04165 ///\brief Get the stride of a pointer access in a loop.
04166 /// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a
04167 /// pointer to the Value, or null otherwise.
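      /// For example, for an access "a[i * n]" where "n" is a loop-invariant
      /// argument, the returned stride is "n"; the caller can then version the
      /// loop on the runtime condition n == 1.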
04168 static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
04169   const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
04170   if (!PtrTy || PtrTy->isAggregateType())
04171     return nullptr;
04172 
04173   // Try to remove a gep instruction to make the pointer (actually the index at
04174   // this point) easier to analyze. If OrigPtr is equal to Ptr we are analyzing
04175   // the pointer, otherwise we are analyzing the index.
04176   Value *OrigPtr = Ptr;
04177 
04178   // The size of the pointer access.
04179   int64_t PtrAccessSize = 1;
04180 
04181   Ptr = stripGetElementPtr(Ptr, SE, Lp);
04182   const SCEV *V = SE->getSCEV(Ptr);
04183 
04184   if (Ptr != OrigPtr)
04185     // Strip off casts.
04186     while (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V))
04187       V = C->getOperand();
04188 
04189   const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);
04190   if (!S)
04191     return nullptr;
04192 
04193   V = S->getStepRecurrence(*SE);
04194   if (!V)
04195     return nullptr;
04196 
04197   // Strip off the size of access multiplication if we are still analyzing the
04198   // pointer.
04199   if (OrigPtr == Ptr) {
04200     const DataLayout &DL = Lp->getHeader()->getModule()->getDataLayout();
04201     PtrAccessSize = DL.getTypeAllocSize(PtrTy->getElementType());
04202     if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
04203       if (M->getOperand(0)->getSCEVType() != scConstant)
04204         return nullptr;
04205 
04206       const APInt &APStepVal =
04207           cast<SCEVConstant>(M->getOperand(0))->getValue()->getValue();
04208 
04209       // Huge step value - give up.
04210       if (APStepVal.getBitWidth() > 64)
04211         return nullptr;
04212 
04213       int64_t StepVal = APStepVal.getSExtValue();
04214       if (PtrAccessSize != StepVal)
04215         return nullptr;
04216       V = M->getOperand(1);
04217     }
04218   }
04219 
04220   // Strip off casts.
04221   Type *StrippedOffRecurrenceCast = nullptr;
04222   if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) {
04223     StrippedOffRecurrenceCast = C->getType();
04224     V = C->getOperand();
04225   }
04226 
04227   // Look for the loop invariant symbolic value.
04228   const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);
04229   if (!U)
04230     return nullptr;
04231 
04232   Value *Stride = U->getValue();
04233   if (!Lp->isLoopInvariant(Stride))
04234     return nullptr;
04235 
04236   // If we have stripped off the recurrence cast we have to make sure that we
04237   // return the value that is used in this loop so that we can replace it later.
04238   if (StrippedOffRecurrenceCast)
04239     Stride = getUniqueCastUse(Stride, Lp, StrippedOffRecurrenceCast);
04240 
04241   return Stride;
04242 }
04243 
04244 void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
04245   Value *Ptr = nullptr;
04246   if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
04247     Ptr = LI->getPointerOperand();
04248   else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess))
04249     Ptr = SI->getPointerOperand();
04250   else
04251     return;
04252 
04253   Value *Stride = getStrideFromPointer(Ptr, SE, TheLoop);
04254   if (!Stride)
04255     return;
04256 
04257   DEBUG(dbgs() << "LV: Found a strided access that we can version");
04258   DEBUG(dbgs() << "  Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
04259   Strides[Ptr] = Stride;
04260   StrideSet.insert(Stride);
04261 }
04262 
04263 void LoopVectorizationLegality::collectLoopUniforms() {
04264   // We now know that the loop is vectorizable!
04265   // Collect variables that will remain uniform after vectorization.
04266   std::vector<Value*> Worklist;
04267   BasicBlock *Latch = TheLoop->getLoopLatch();
04268 
04269   // Start with the conditional branch and walk up the block.
04270   Worklist.push_back(Latch->getTerminator()->getOperand(0));
04271 
04272   // Also add all consecutive pointer values; these values will be uniform
04273   // after vectorization (and subsequent cleanup) and, until revectorization is
04274   // supported, all dependencies must also be uniform.
04275   for (Loop::block_iterator B = TheLoop->block_begin(),
04276        BE = TheLoop->block_end(); B != BE; ++B)
04277     for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end();
04278          I != IE; ++I)
04279       if (I->getType()->isPointerTy() && isConsecutivePtr(I))
04280         Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
04281 
04282   while (!Worklist.empty()) {
04283     Instruction *I = dyn_cast<Instruction>(Worklist.back());
04284     Worklist.pop_back();
04285 
04286     // Look at instructions inside this loop.
04287     // Stop when reaching PHI nodes.
04288     // TODO: we need to follow values all over the loop, not only in this block.
04289     if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
04290       continue;
04291 
04292     // This is a known uniform.
04293     Uniforms.insert(I);
04294 
04295     // Insert all operands.
04296     Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
04297   }
04298 }
04299 
04300 bool LoopVectorizationLegality::canVectorizeMemory() {
04301   LAI = &LAA->getInfo(TheLoop, Strides);
04302   auto &OptionalReport = LAI->getReport();
04303   if (OptionalReport)
04304     emitAnalysis(VectorizationReport(*OptionalReport));
04305   if (!LAI->canVectorizeMemory())
04306     return false;
04307 
04308   if (LAI->hasStoreToLoopInvariantAddress()) {
04309     emitAnalysis(
04310         VectorizationReport()
04311         << "write to a loop invariant address could not be vectorized");
04312     DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
04313     return false;
04314   }
04315 
04316   if (LAI->getNumRuntimePointerChecks() >
04317       VectorizerParams::RuntimeMemoryCheckThreshold) {
04318     emitAnalysis(VectorizationReport()
04319                  << LAI->getNumRuntimePointerChecks() << " exceeds limit of "
04320                  << VectorizerParams::RuntimeMemoryCheckThreshold
04321                  << " dependent memory operations checked at runtime");
04322     DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
04323     return false;
04324   }
04325   return true;
04326 }
04327 
04328 LoopVectorizationLegality::InductionKind
04329 LoopVectorizationLegality::isInductionVariable(PHINode *Phi,
04330                                                ConstantInt *&StepValue) {
04331   if (!isInductionPHI(Phi, SE, StepValue))
04332     return IK_NoInduction;
04333 
04334   Type *PhiTy = Phi->getType();
04335   // Found an Integer induction variable.
04336   if (PhiTy->isIntegerTy())
04337     return IK_IntInduction;
04338   // Found an Pointer induction variable.
04339   return IK_PtrInduction;
04340 }
04341 
04342 bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
04343   Value *In0 = const_cast<Value*>(V);
04344   PHINode *PN = dyn_cast_or_null<PHINode>(In0);
04345   if (!PN)
04346     return false;
04347 
04348   return Inductions.count(PN);
04349 }
04350 
04351 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB)  {
04352   return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
04353 }
04354 
04355 bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
04356                                            SmallPtrSetImpl<Value *> &SafePtrs) {
04357   
04358   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
04359     // Check that we don't have a constant expression that can trap as operand.
04360     for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
04361          OI != OE; ++OI) {
04362       if (Constant *C = dyn_cast<Constant>(*OI))
04363         if (C->canTrap())
04364           return false;
04365     }
04366     // We might be able to hoist the load.
04367     if (it->mayReadFromMemory()) {
04368       LoadInst *LI = dyn_cast<LoadInst>(it);
04369       if (!LI)
04370         return false;
04371       if (!SafePtrs.count(LI->getPointerOperand())) {
04372         if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) {
04373           MaskedOp.insert(LI);
04374           continue;
04375         }
04376         return false;
04377       }
04378     }
04379 
04380     // We don't predicate stores at the moment.
04381     if (it->mayWriteToMemory()) {
04382       StoreInst *SI = dyn_cast<StoreInst>(it);
04383       // We only support predication of stores in basic blocks with one
04384       // predecessor.
04385       if (!SI)
04386         return false;
04387 
04388       bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
04389       bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();
04390       
04391       if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
04392           !isSinglePredecessor) {
04393         // Build a masked store if it is legal for the target, otherwise scalarize
04394         // the block.
04395         bool isLegalMaskedOp =
04396           isLegalMaskedStore(SI->getValueOperand()->getType(),
04397                              SI->getPointerOperand());
04398         if (isLegalMaskedOp) {
04399           --NumPredStores;
04400           MaskedOp.insert(SI);
04401           continue;
04402         }
04403         return false;
04404       }
04405     }
04406     if (it->mayThrow())
04407       return false;
04408 
04409     // The instructions below can trap.
04410     switch (it->getOpcode()) {
04411     default: continue;
04412     case Instruction::UDiv:
04413     case Instruction::SDiv:
04414     case Instruction::URem:
04415     case Instruction::SRem:
04416       return false;
04417     }
04418   }
04419 
04420   return true;
04421 }
04422 
04423 void InterleavedAccessInfo::collectConstStridedAccesses(
04424     MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
04425     const ValueToValueMap &Strides) {
04426   // Holds load/store instructions in program order.
04427   SmallVector<Instruction *, 16> AccessList;
04428 
04429   for (auto *BB : TheLoop->getBlocks()) {
04430     bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
04431 
04432     for (auto &I : *BB) {
04433       if (!isa<LoadInst>(&I) && !isa<StoreInst>(&I))
04434         continue;
04435       // FIXME: Currently we can't handle mixed accesses and predicated accesses
04436       if (IsPred)
04437         return;
04438 
04439       AccessList.push_back(&I);
04440     }
04441   }
04442 
04443   if (AccessList.empty())
04444     return;
04445 
04446   auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
04447   for (auto I : AccessList) {
04448     LoadInst *LI = dyn_cast<LoadInst>(I);
04449     StoreInst *SI = dyn_cast<StoreInst>(I);
04450 
04451     Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
04452     int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides);
04453 
04454     // The factor of the corresponding interleave group.
04455     unsigned Factor = std::abs(Stride);
04456 
04457     // Ignore the access if the factor is too small or too large.
04458     if (Factor < 2 || Factor > MaxInterleaveGroupFactor)
04459       continue;
04460 
04461     const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
04462     PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
04463     unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType());
04464 
04465     // An alignment of 0 means target ABI alignment.
04466     unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();
04467     if (!Align)
04468       Align = DL.getABITypeAlignment(PtrTy->getElementType());
04469 
04470     StrideAccesses[I] = StrideDescriptor(Stride, Scev, Size, Align);
04471   }
04472 }
04473 
04474 // Analyze interleaved accesses and collect them into interleave groups.
04475 //
04476 // Notice that the vectorization on interleaved groups will change instruction
04477 // orders and may break dependences. But the memory dependence check guarantees
04478 // that there is no overlap between two pointers of different strides, element
04479 // sizes or underlying bases.
04480 //
04481 // For pointers sharing the same stride, element size and underlying base, no
04482 // need to worry about Read-After-Write dependences and Write-After-Read
04483 // dependences.
04484 //
04485 // E.g. The RAW dependence:  A[i] = a;
04486 //                           b = A[i];
04487 // This won't exist as it is a store-load forwarding conflict, which has
04488 // already been checked and forbidden in the dependence check.
04489 //
04490 // E.g. The WAR dependence:  a = A[i];  // (1)
04491 //                           A[i] = b;  // (2)
04492 // The store group of (2) is always inserted at or below (2), and the load group
04493 // of (1) is always inserted at or above (1). The dependence is safe.
04494 void InterleavedAccessInfo::analyzeInterleaving(
04495     const ValueToValueMap &Strides) {
04496   DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
04497 
04498   // Holds all the stride accesses.
04499   MapVector<Instruction *, StrideDescriptor> StrideAccesses;
04500   collectConstStridedAccesses(StrideAccesses, Strides);
04501 
04502   if (StrideAccesses.empty())
04503     return;
04504 
04505   // Holds all interleaved store groups temporarily.
04506   SmallSetVector<InterleaveGroup *, 4> StoreGroups;
04507 
04508   // Search the load-load/write-write pair B-A in bottom-up order and try to
04509   // insert B into the interleave group of A according to 3 rules:
04510   //   1. A and B have the same stride.
04511   //   2. A and B have the same memory object size.
04512   //   3. B belongs to the group according to the distance.
04513   //
04514   // The bottom-up order can avoid breaking the Write-After-Write dependences
04515   // between two pointers of the same base.
04516   // E.g.  A[i]   = a;   (1)
04517   //       A[i]   = b;   (2)
04518   //       A[i+1] = c    (3)
04519   // We form the group (2)+(3) in front, so (1) has to form groups with accesses
04520   // above (1), which guarantees that (1) is always above (2).
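        // For illustration (hypothetical accesses): for two i32 loads A[3*i] and
        // A[3*i+1], Stride is 3 and Size is 4 bytes; the SCEV distance between
        // them is 4, a multiple of the size, so the second member is placed at
        // index getIndex(first) + 4 / 4 = getIndex(first) + 1 in the same group.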
04521   for (auto I = StrideAccesses.rbegin(), E = StrideAccesses.rend(); I != E;
04522        ++I) {
04523     Instruction *A = I->first;
04524     StrideDescriptor DesA = I->second;
04525 
04526     InterleaveGroup *Group = getInterleaveGroup(A);
04527     if (!Group) {
04528       DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n');
04529       Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);
04530     }
04531 
04532     if (A->mayWriteToMemory())
04533       StoreGroups.insert(Group);
04534 
04535     for (auto II = std::next(I); II != E; ++II) {
04536       Instruction *B = II->first;
04537       StrideDescriptor DesB = II->second;
04538 
04539       // Ignore if B is already in a group or B is a different memory operation.
04540       if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory())
04541         continue;
04542 
04543       // Check the rule 1 and 2.
04544       if (DesB.Stride != DesA.Stride || DesB.Size != DesA.Size)
04545         continue;
04546 
04547       // Calculate the distance and prepare for the rule 3.
04548       const SCEVConstant *DistToA =
04549           dyn_cast<SCEVConstant>(SE->getMinusSCEV(DesB.Scev, DesA.Scev));
04550       if (!DistToA)
04551         continue;
04552 
04553       int DistanceToA = DistToA->getValue()->getValue().getSExtValue();
04554 
04555       // Skip if the distance is not a multiple of the size, as the two accesses
04556       // cannot be in the same group.
04557       if (DistanceToA % static_cast<int>(DesA.Size))
04558         continue;
04559 
04560       // The index of B is the index of A plus B's distance from A in size units.
04561       int IndexB =
04562           Group->getIndex(A) + DistanceToA / static_cast<int>(DesA.Size);
04563 
04564       // Try to insert B into the group.
04565       if (Group->insertMember(B, IndexB, DesB.Align)) {
04566         DEBUG(dbgs() << "LV: Inserted:" << *B << '\n'
04567                      << "    into the interleave group with" << *A << '\n');
04568         InterleaveGroupMap[B] = Group;
04569 
04570         // Set the first load in program order as the insert position.
04571         if (B->mayReadFromMemory())
04572           Group->setInsertPos(B);
04573       }
04574     } // Iteration on instruction B
04575   }   // Iteration on instruction A
04576 
04577   // Remove interleaved store groups with gaps.
04578   for (InterleaveGroup *Group : StoreGroups)
04579     if (Group->getNumMembers() != Group->getFactor())
04580       releaseGroup(Group);
04581 }
04582 
04583 LoopVectorizationCostModel::VectorizationFactor
04584 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
04585   // Width 1 means no vectorization.
04586   VectorizationFactor Factor = { 1U, 0U };
04587   if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
04588     emitAnalysis(VectorizationReport() <<
04589                  "runtime pointer checks needed. Enable vectorization of this "
04590                  "loop with '#pragma clang loop vectorize(enable)' when "
04591                  "compiling with -Os");
04592     DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
04593     return Factor;
04594   }
04595 
04596   if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
04597     emitAnalysis(VectorizationReport() <<
04598                  "store that is conditionally executed prevents vectorization");
04599     DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
04600     return Factor;
04601   }
04602 
04603   // Find the trip count.
04604   unsigned TC = SE->getSmallConstantTripCount(TheLoop);
04605   DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
04606 
04607   unsigned WidestType = getWidestType();
04608   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
04609   unsigned MaxSafeDepDist = -1U;
04610   if (Legal->getMaxSafeDepDistBytes() != -1U)
04611     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
04612   WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
04613                     WidestRegister : MaxSafeDepDist);
04614   unsigned MaxVectorSize = WidestRegister / WidestType;
04615   DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
04616   DEBUG(dbgs() << "LV: The Widest register is: "
04617           << WidestRegister << " bits.\n");
04618 
04619   if (MaxVectorSize == 0) {
04620     DEBUG(dbgs() << "LV: The target has no vector registers.\n");
04621     MaxVectorSize = 1;
04622   }
04623 
04624   assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
04625          " into one vector!");
04626 
04627   unsigned VF = MaxVectorSize;
04628 
04629   // If we optimize the program for size, avoid creating the tail loop.
04630   if (OptForSize) {
04631     // If we are unable to calculate the trip count then don't try to vectorize.
04632     if (TC < 2) {
04633       emitAnalysis
04634         (VectorizationReport() <<
04635          "unable to calculate the loop count due to complex control flow");
04636       DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
04637       return Factor;
04638     }
04639 
04640     // Find the maximum SIMD width that can fit within the trip count.
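          // For example (hypothetical values), a trip count of 12 with a
          // MaxVectorSize of 8 gives VF = 12 % 8 = 4; since 12 is a multiple of
          // 4, no scalar tail is needed in that case.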
04641     VF = TC % MaxVectorSize;
04642 
04643     if (VF == 0)
04644       VF = MaxVectorSize;
04645 
04646     // If the trip count that we found modulo the vectorization factor is not
04647     // zero then we require a tail.
04648     if (VF < 2) {
04649       emitAnalysis(VectorizationReport() <<
04650                    "cannot optimize for size and vectorize at the "
04651                    "same time. Enable vectorization of this loop "
04652                    "with '#pragma clang loop vectorize(enable)' "
04653                    "when compiling with -Os");
04654       DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
04655       return Factor;
04656     }
04657   }
04658 
04659   int UserVF = Hints->getWidth();
04660   if (UserVF != 0) {
04661     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
04662     DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
04663 
04664     Factor.Width = UserVF;
04665     return Factor;
04666   }
04667 
04668   float Cost = expectedCost(1);
04669 #ifndef NDEBUG
04670   const float ScalarCost = Cost;
04671 #endif /* NDEBUG */
04672   unsigned Width = 1;
04673   DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
04674 
04675   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
04676   // Ignore scalar width, because the user explicitly wants vectorization.
04677   if (ForceVectorization && VF > 1) {
04678     Width = 2;
04679     Cost = expectedCost(Width) / (float)Width;
04680   }
04681 
04682   for (unsigned i=2; i <= VF; i*=2) {
04683     // Notice that the vector loop needs to be executed fewer times, so
04684     // we need to divide the cost of the vector loops by the width of
04685     // the vector elements.
04686     float VectorCost = expectedCost(i) / (float)i;
04687     DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
04688           (int)VectorCost << ".\n");
04689     if (VectorCost < Cost) {
04690       Cost = VectorCost;
04691       Width = i;
04692     }
04693   }
04694 
04695   DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
04696         << "LV: Vectorization seems to be not beneficial, "
04697         << "but was forced by a user.\n");
04698   DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n");
04699   Factor.Width = Width;
04700   Factor.Cost = Width * Cost;
04701   return Factor;
04702 }
04703 
04704 unsigned LoopVectorizationCostModel::getWidestType() {
04705   unsigned MaxWidth = 8;
04706   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
04707 
04708   // For each block.
04709   for (Loop::block_iterator bb = TheLoop->block_begin(),
04710        be = TheLoop->block_end(); bb != be; ++bb) {
04711     BasicBlock *BB = *bb;
04712 
04713     // For each instruction in the loop.
04714     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
04715       Type *T = it->getType();
04716 
04717       // Ignore ephemeral values.
04718       if (EphValues.count(it))
04719         continue;
04720 
04721       // Only examine Loads, Stores and PHINodes.
04722       if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
04723         continue;
04724 
04725       // Examine PHI nodes that are reduction variables.
04726       if (PHINode *PN = dyn_cast<PHINode>(it))
04727         if (!Legal->getReductionVars()->count(PN))
04728           continue;
04729 
04730       // Examine the stored values.
04731       if (StoreInst *ST = dyn_cast<StoreInst>(it))
04732         T = ST->getValueOperand()->getType();
04733 
04734       // Ignore loaded pointer types and stored pointer types that are not
04735       // consecutive. However, we do want to take consecutive stores/loads of
04736       // pointer vectors into account.
04737       if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
04738         continue;
04739 
04740       MaxWidth = std::max(MaxWidth,
04741                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
04742     }
04743   }
04744 
04745   return MaxWidth;
04746 }
04747 
04748 unsigned
04749 LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
04750                                                unsigned VF,
04751                                                unsigned LoopCost) {
04752 
04753   // -- The unroll heuristics --
04754   // We unroll the loop in order to expose ILP and reduce the loop overhead.
04755   // There are many micro-architectural considerations that we can't predict
04756   // at this level. For example, frontend pressure (on decode or fetch) due to
04757   // code size, or the number and capabilities of the execution ports.
04758   //
04759   // We use the following heuristics to select the unroll factor:
04760   // 1. If the code has reductions, then we unroll in order to break the cross
04761   // iteration dependency.
04762   // 2. If the loop is really small, then we unroll in order to reduce the loop
04763   // overhead.
04764   // 3. We don't unroll if we think that we will spill registers to memory due
04765   // to the increased register pressure.
04766 
04767   // Use the user preference, unless 'auto' is selected.
04768   int UserUF = Hints->getInterleave();
04769   if (UserUF != 0)
04770     return UserUF;
04771 
04772   // When we optimize for size, we don't unroll.
04773   if (OptForSize)
04774     return 1;
04775 
04776   // The max safe dependence distance already bounds the width; do not unroll.
04777   if (Legal->getMaxSafeDepDistBytes() != -1U)
04778     return 1;
04779 
04780   // Do not unroll loops with a relatively small trip count.
04781   unsigned TC = SE->getSmallConstantTripCount(TheLoop);
04782   if (TC > 1 && TC < TinyTripCountUnrollThreshold)
04783     return 1;
04784 
04785   unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
04786   DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters <<
04787         " registers\n");
04788 
04789   if (VF == 1) {
04790     if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
04791       TargetNumRegisters = ForceTargetNumScalarRegs;
04792   } else {
04793     if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
04794       TargetNumRegisters = ForceTargetNumVectorRegs;
04795   }
04796 
04797   LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
04798   // We divide by these constants so assume that we have at least one
04799   // instruction that uses at least one register.
04800   R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
04801   R.NumInstructions = std::max(R.NumInstructions, 1U);
04802 
04803   // We calculate the unroll factor using the following formula.
04804   // Subtract the number of loop invariants from the number of available
04805   // registers. These registers are used by all of the unrolled instances.
04806   // Next, divide the remaining registers by the number of registers that is
04807   // required by the loop, in order to estimate how many parallel instances
04808   // fit without causing spills. All of this is rounded down if necessary to be
04809   // a power of two. We want power of two unroll factors to simplify any
04810   // addressing operations or alignment considerations.
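        // A worked example with made-up numbers: with 16 target registers, 2
        // loop-invariant values, and a maximum of 7 registers live inside the
        // loop, UF = PowerOf2Floor((16 - 2) / 7) = PowerOf2Floor(2) = 2.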
04811   unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
04812                               R.MaxLocalUsers);
04813 
04814   // Don't count the induction variable as unrolled.
04815   if (EnableIndVarRegisterHeur)
04816     UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
04817                        std::max(1U, (R.MaxLocalUsers - 1)));
04818 
04819   // Clamp the unroll factor ranges to reasonable factors.
04820   unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor(VF);
04821 
04822   // Check if the user has overridden the unroll max.
04823   if (VF == 1) {
04824     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
04825       MaxInterleaveSize = ForceTargetMaxScalarInterleaveFactor;
04826   } else {
04827     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
04828       MaxInterleaveSize = ForceTargetMaxVectorInterleaveFactor;
04829   }
04830 
04831   // If we did not calculate the cost for VF (because the user selected the VF)
04832   // then we calculate the cost of VF here.
04833   if (LoopCost == 0)
04834     LoopCost = expectedCost(VF);
04835 
04836   // Clamp the calculated UF to be between the 1 and the max unroll factor
04837   // that the target allows.
04838   if (UF > MaxInterleaveSize)
04839     UF = MaxInterleaveSize;
04840   else if (UF < 1)
04841     UF = 1;
04842 
04843   // Unroll if we vectorized this loop and there is a reduction that could
04844   // benefit from unrolling.
04845   if (VF > 1 && Legal->getReductionVars()->size()) {
04846     DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
04847     return UF;
04848   }
04849 
04850   // Note that if we've already vectorized the loop we will have done the
04851   // runtime check and so unrolling won't require further checks.
04852   bool UnrollingRequiresRuntimePointerCheck =
04853       (VF == 1 && Legal->getRuntimePointerCheck()->Need);
04854 
04855   // We want to unroll small loops in order to reduce the loop overhead and
04856   // potentially expose ILP opportunities.
04857   DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
04858   if (!UnrollingRequiresRuntimePointerCheck &&
04859       LoopCost < SmallLoopCost) {
04860     // We assume that the cost overhead is 1 and we use the cost model
04861     // to estimate the cost of the loop and unroll until the cost of the
04862     // loop overhead is about 5% of the cost of the loop.
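          // For example (made-up costs): with SmallLoopCost = 20 and a LoopCost
          // of 4, SmallUF = min(UF, PowerOf2Floor(20 / 4)) = min(UF, 4).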
04863     unsigned SmallUF = std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
04864 
04865     // Unroll until store/load ports (estimated by max unroll factor) are
04866     // saturated.
04867     unsigned NumStores = Legal->getNumStores();
04868     unsigned NumLoads = Legal->getNumLoads();
04869     unsigned StoresUF = UF / (NumStores ? NumStores : 1);
04870     unsigned LoadsUF = UF /  (NumLoads ? NumLoads : 1);
04871 
04872     // If we have a scalar reduction (vector reductions are already dealt with
04873     // by this point), we can increase the critical path length if the loop
04874     // we're unrolling is inside another loop. Limit, by default to 2, so the
04875     // critical path only gets increased by one reduction operation.
04876     if (Legal->getReductionVars()->size() &&
04877         TheLoop->getLoopDepth() > 1) {
04878       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionUF);
04879       SmallUF = std::min(SmallUF, F);
04880       StoresUF = std::min(StoresUF, F);
04881       LoadsUF = std::min(LoadsUF, F);
04882     }
04883 
04884     if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) {
04885       DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n");
04886       return std::max(StoresUF, LoadsUF);
04887     }
04888 
04889     DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
04890     return SmallUF;
04891   }
04892 
04893   // Unroll if this is a large loop (small loops are already dealt with by this
04894   // point) that could benefit from interleaved unrolling.
04895   bool HasReductions = (Legal->getReductionVars()->size() > 0);
04896   if (TTI.enableAggressiveInterleaving(HasReductions)) {
04897     DEBUG(dbgs() << "LV: Unrolling to expose ILP.\n");
04898     return UF;
04899   }
04900 
04901   DEBUG(dbgs() << "LV: Not Unrolling.\n");
04902   return 1;
04903 }
04904 
04905 LoopVectorizationCostModel::RegisterUsage
04906 LoopVectorizationCostModel::calculateRegisterUsage() {
04907   // This function calculates the register usage by measuring the highest number
04908   // of values that are alive at a single location. Obviously, this is a very
04909   // rough estimation. We scan the loop in a topological order and
04910   // assign a number to each instruction. We use RPO to ensure that defs are
04911   // met before their users. We assume that each instruction that has in-loop
04912   // users starts an interval. We record every time that an in-loop value is
04913   // used, so we have a list of the first and last occurrences of each
04914   // instruction. Next, we transpose this data structure into a multi map that
04915   // holds the list of intervals that *end* at a specific location. This multi
04916   // map allows us to perform a linear search. We scan the instructions linearly
04917   // and record each time that a new interval starts, by placing it in a set.
04918   // If we find this value in the multi-map then we remove it from the set.
04919   // The max register usage is the maximum size of the set.
04920   // We also search for instructions that are defined outside the loop, but are
04921   // used inside the loop. We need this number separately from the max-interval
04922   // usage number because when we unroll, loop-invariant values do not take
04923   // more registers.
04924   LoopBlocksDFS DFS(TheLoop);
04925   DFS.perform(LI);
04926 
04927   RegisterUsage R;
04928   R.NumInstructions = 0;
04929 
04930   // Each 'key' in the map opens a new interval. The values
04931   // of the map are the index of the 'last seen' usage of the
04932   // instruction that is the key.
04933   typedef DenseMap<Instruction*, unsigned> IntervalMap;
04934   // Maps instruction to its index.
04935   DenseMap<unsigned, Instruction*> IdxToInstr;
04936   // Marks the end of each interval.
04937   IntervalMap EndPoint;
04938   // Saves the list of instruction indices that are used in the loop.
04939   SmallSet<Instruction*, 8> Ends;
04940   // Saves the list of values that are used in the loop but are
04941   // defined outside the loop, such as arguments and constants.
04942   SmallPtrSet<Value*, 8> LoopInvariants;
04943 
04944   unsigned Index = 0;
04945   for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
04946        be = DFS.endRPO(); bb != be; ++bb) {
04947     R.NumInstructions += (*bb)->size();
04948     for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
04949          ++it) {
04950       Instruction *I = it;
04951       IdxToInstr[Index++] = I;
04952 
04953       // Save the end location of each USE.
04954       for (unsigned i = 0; i < I->getNumOperands(); ++i) {
04955         Value *U = I->getOperand(i);
04956         Instruction *Instr = dyn_cast<Instruction>(U);
04957 
04958         // Ignore non-instruction values such as arguments, constants, etc.
04959         if (!Instr) continue;
04960 
04961         // If this instruction is outside the loop then record it and continue.
04962         if (!TheLoop->contains(Instr)) {
04963           LoopInvariants.insert(Instr);
04964           continue;
04965         }
04966 
04967         // Overwrite previous end points.
04968         EndPoint[Instr] = Index;
04969         Ends.insert(Instr);
04970       }
04971     }
04972   }
04973 
04974   // Saves the list of intervals that end with the index in 'key'.
04975   typedef SmallVector<Instruction*, 2> InstrList;
04976   DenseMap<unsigned, InstrList> TransposeEnds;
04977 
04978   // Transpose the EndPoints to a list of values that end at each index.
04979   for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
04980        it != e; ++it)
04981     TransposeEnds[it->second].push_back(it->first);
04982 
04983   SmallSet<Instruction*, 8> OpenIntervals;
04984   unsigned MaxUsage = 0;
04985 
04986 
04987   DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
04988   for (unsigned int i = 0; i < Index; ++i) {
04989     Instruction *I = IdxToInstr[i];
04990     // Ignore instructions that are never used within the loop.
04991     if (!Ends.count(I)) continue;
04992 
04993     // Ignore ephemeral values.
04994     if (EphValues.count(I))
04995       continue;
04996 
04997     // Remove all of the instructions that end at this location.
04998     InstrList &List = TransposeEnds[i];
04999     for (unsigned int j=0, e = List.size(); j < e; ++j)
05000       OpenIntervals.erase(List[j]);
05001 
05002     // Count the number of live intervals.
05003     MaxUsage = std::max(MaxUsage, OpenIntervals.size());
05004 
05005     DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
05006           OpenIntervals.size() << '\n');
05007 
05008     // Add the current instruction to the list of open intervals.
05009     OpenIntervals.insert(I);
05010   }
05011 
05012   unsigned Invariant = LoopInvariants.size();
05013   DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n');
05014   DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
05015   DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');
05016 
05017   R.LoopInvariantRegs = Invariant;
05018   R.MaxLocalUsers = MaxUsage;
05019   return R;
05020 }
05021 
05022 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
05023   unsigned Cost = 0;
05024 
05025   // For each block.
05026   for (Loop::block_iterator bb = TheLoop->block_begin(),
05027        be = TheLoop->block_end(); bb != be; ++bb) {
05028     unsigned BlockCost = 0;
05029     BasicBlock *BB = *bb;
05030 
05031     // For each instruction in the old loop.
05032     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
05033       // Skip dbg intrinsics.
05034       if (isa<DbgInfoIntrinsic>(it))
05035         continue;
05036 
05037       // Ignore ephemeral values.
05038       if (EphValues.count(it))
05039         continue;
05040 
05041       unsigned C = getInstructionCost(it, VF);
05042 
05043       // Check if we should override the cost.
05044       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
05045         C = ForceTargetInstructionCost;
05046 
05047       BlockCost += C;
05048       DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
05049             VF << " For instruction: " << *it << '\n');
05050     }
05051 
05052     // We assume that if-converted blocks have a 50% chance of being executed.
05053     // When the code is scalar, some of the blocks are skipped due to control flow.
05054     // When the code is vectorized we execute all code paths.
05055     if (VF == 1 && Legal->blockNeedsPredication(*bb))
05056       BlockCost /= 2;
05057 
05058     Cost += BlockCost;
05059   }
05060 
05061   return Cost;
05062 }
05063 
05064 /// \brief Check whether the address computation for a non-consecutive memory
05065 /// access looks like an unlikely candidate for being merged into the indexing
05066 /// mode.
05067 ///
05068 /// We look for a GEP which has one index that is an induction variable and all
05069 /// other indices are loop invariant. If the stride of this access is also
05070 /// within a small bound we decide that this address computation can likely be
05071 /// merged into the addressing mode.
05072 /// In all other cases, we identify the address computation as complex.
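      /// For example, an access such as A[c0][i][c1], where i is the induction
      /// variable and c0/c1 are loop invariant, yields a GEP whose only varying
      /// index is i; if the resulting constant stride is small (within
      /// MaxMergeDistance), the address computation is treated as cheap.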
05073 static bool isLikelyComplexAddressComputation(Value *Ptr,
05074                                               LoopVectorizationLegality *Legal,
05075                                               ScalarEvolution *SE,
05076                                               const Loop *TheLoop) {
05077   GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
05078   if (!Gep)
05079     return true;
05080 
05081   // We are looking for a gep with all loop invariant indices except for one
05082   // which should be an induction variable.
05083   unsigned NumOperands = Gep->getNumOperands();
05084   for (unsigned i = 1; i < NumOperands; ++i) {
05085     Value *Opd = Gep->getOperand(i);
05086     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
05087         !Legal->isInductionVariable(Opd))
05088       return true;
05089   }
05090 
05091   // Now we know we have a GEP of the form (ptr, %inv, %ind, %inv). Make sure
05092   // that the step can likely be merged into the address computation.
05093   unsigned MaxMergeDistance = 64;
05094 
05095   const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
05096   if (!AddRec)
05097     return true;
05098 
05099   // Check that the step is constant.
05100   const SCEV *Step = AddRec->getStepRecurrence(*SE);
05101   // Extract the constant pointer stride.
05102   const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
05103   if (!C)
05104     return true;
05105 
05106   const APInt &APStepVal = C->getValue()->getValue();
05107 
05108   // Huge step value - give up.
05109   if (APStepVal.getBitWidth() > 64)
05110     return true;
05111 
05112   int64_t StepVal = APStepVal.getSExtValue();
05113 
05114   return StepVal > MaxMergeDistance;
05115 }
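// Illustrative example (assumed IR, not from the original source): a pointer
// such as
//   %gep = getelementptr i32, i32* %base, i64 %iv
// where %iv is the loop's induction variable and the access steps by 4 bytes
// per iteration has a constant SCEV step well under MaxMergeDistance, so the
// helper above returns false (the address computation can likely be folded
// into the addressing mode). A GEP with an index that is neither loop
// invariant nor an induction variable, or a pointer whose SCEV is not an add
// recurrence with a small constant step, is reported as complex (true).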
05116 
05117 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
05118   if (Legal->hasStride(I->getOperand(0)) || Legal->hasStride(I->getOperand(1)))
05119     return true;
05120   return false;
05121 }
05122 
05123 unsigned
05124 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
05125   // If we know that this instruction will remain uniform, check the cost of
05126   // the scalar version.
05127   if (Legal->isUniformAfterVectorization(I))
05128     VF = 1;
05129 
05130   Type *RetTy = I->getType();
05131   Type *VectorTy = ToVectorTy(RetTy, VF);
05132 
05133   // TODO: We need to estimate the cost of intrinsic calls.
05134   switch (I->getOpcode()) {
05135   case Instruction::GetElementPtr:
05136     // We mark this instruction as zero-cost because the cost of GEPs in
05137     // vectorized code depends on whether the corresponding memory instruction
05138     // is scalarized or not. Therefore, we handle GEPs with the memory
05139     // instruction cost.
05140     return 0;
05141   case Instruction::Br: {
05142     return TTI.getCFInstrCost(I->getOpcode());
05143   }
05144   case Instruction::PHI:
05145     // TODO: If-converted PHIs become selects.
05146     return 0;
05147   case Instruction::Add:
05148   case Instruction::FAdd:
05149   case Instruction::Sub:
05150   case Instruction::FSub:
05151   case Instruction::Mul:
05152   case Instruction::FMul:
05153   case Instruction::UDiv:
05154   case Instruction::SDiv:
05155   case Instruction::FDiv:
05156   case Instruction::URem:
05157   case Instruction::SRem:
05158   case Instruction::FRem:
05159   case Instruction::Shl:
05160   case Instruction::LShr:
05161   case Instruction::AShr:
05162   case Instruction::And:
05163   case Instruction::Or:
05164   case Instruction::Xor: {
05165     // Since we will replace the stride by 1, the multiplication should go away.
05166     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
05167       return 0;
05168     // Certain instructions can be cheaper to vectorize if they have a constant
05169     // second vector operand. One example of this are shifts on x86.
05170     TargetTransformInfo::OperandValueKind Op1VK =
05171       TargetTransformInfo::OK_AnyValue;
05172     TargetTransformInfo::OperandValueKind Op2VK =
05173       TargetTransformInfo::OK_AnyValue;
05174     TargetTransformInfo::OperandValueProperties Op1VP =
05175         TargetTransformInfo::OP_None;
05176     TargetTransformInfo::OperandValueProperties Op2VP =
05177         TargetTransformInfo::OP_None;
05178     Value *Op2 = I->getOperand(1);
05179 
05180     // Check for a splat of a constant or for a non-uniform vector of constants.
05181     if (isa<ConstantInt>(Op2)) {
05182       ConstantInt *CInt = cast<ConstantInt>(Op2);
05183       if (CInt && CInt->getValue().isPowerOf2())
05184         Op2VP = TargetTransformInfo::OP_PowerOf2;
05185       Op2VK = TargetTransformInfo::OK_UniformConstantValue;
05186     } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
05187       Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
05188       Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
05189       if (SplatValue) {
05190         ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
05191         if (CInt && CInt->getValue().isPowerOf2())
05192           Op2VP = TargetTransformInfo::OP_PowerOf2;
05193         Op2VK = TargetTransformInfo::OK_UniformConstantValue;
05194       }
05195     }
05196 
05197     return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
05198                                       Op1VP, Op2VP);
05199   }
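// Illustrative note for the operand-kind classification above (not in the
// original source): a shift by a splatted constant such as
//   %r = shl <4 x i32> %v, <i32 8, i32 8, i32 8, i32 8>
// is seen as OK_UniformConstantValue with OP_PowerOf2, which lets targets
// such as x86 report a cheaper cost than for a shift by a variable amount.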
05200   case Instruction::Select: {
05201     SelectInst *SI = cast<SelectInst>(I);
05202     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
05203     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
05204     Type *CondTy = SI->getCondition()->getType();
05205     if (!ScalarCond)
05206       CondTy = VectorType::get(CondTy, VF);
05207 
05208     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
05209   }
05210   case Instruction::ICmp:
05211   case Instruction::FCmp: {
05212     Type *ValTy = I->getOperand(0)->getType();
05213     VectorTy = ToVectorTy(ValTy, VF);
05214     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
05215   }
05216   case Instruction::Store:
05217   case Instruction::Load: {
05218     StoreInst *SI = dyn_cast<StoreInst>(I);
05219     LoadInst *LI = dyn_cast<LoadInst>(I);
05220     Type *ValTy = (SI ? SI->getValueOperand()->getType() :
05221                    LI->getType());
05222     VectorTy = ToVectorTy(ValTy, VF);
05223 
05224     unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
05225     unsigned AS = SI ? SI->getPointerAddressSpace() :
05226       LI->getPointerAddressSpace();
05227     Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
05228     // We add the cost of address computation here instead of with the GEP
05229     // instruction because only here do we know whether the operation is
05230     // scalarized.
05231     if (VF == 1)
05232       return TTI.getAddressComputationCost(VectorTy) +
05233         TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
05234 
05235     // For an interleaved access, calculate the total cost of the whole
05236     // interleave group.
05237     if (Legal->isAccessInterleaved(I)) {
05238       auto Group = Legal->getInterleavedAccessGroup(I);
05239       assert(Group && "Fail to get an interleaved access group.");
05240 
05241       // Only calculate the cost once at the insert position.
05242       if (Group->getInsertPos() != I)
05243         return 0;
05244 
05245       unsigned InterleaveFactor = Group->getFactor();
05246       Type *WideVecTy =
05247           VectorType::get(VectorTy->getVectorElementType(),
05248                           VectorTy->getVectorNumElements() * InterleaveFactor);
05249 
05250       // Holds the indices of existing members in an interleaved load group.
05251       // An interleaved store group doesn't need this as it doesn't allow gaps.
05252       SmallVector<unsigned, 4> Indices;
05253       if (LI) {
05254         for (unsigned i = 0; i < InterleaveFactor; i++)
05255           if (Group->getMember(i))
05256             Indices.push_back(i);
05257       }
05258 
05259       // Calculate the cost of the whole interleaved group.
05260       unsigned Cost = TTI.getInterleavedMemoryOpCost(
05261           I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
05262           Group->getAlignment(), AS);
05263 
05264       if (Group->isReverse())
05265         Cost +=
05266             Group->getNumMembers() *
05267             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
05268 
05269       // FIXME: An interleaved load group with a huge gap could be even more
05270       // expensive than scalar operations, in which case we could ignore such a
05271       // group and use scalar operations instead.
05272       return Cost;
05273     }
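// Illustrative example for the interleaved case above (assumed source, not
// part of the original file): for
//   for (i = 0; i < n; i++) { sum0 += a[2*i]; sum1 += a[2*i+1]; }
// the two loads form an interleave group with factor 2. The cost of the
// single wide load (plus de-interleaving shuffles, as modelled by the target)
// is charged once at the group's insert position; the other member of the
// group returns 0 from this function.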
05274 
05275     // Scalarized loads/stores.
05276     int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
05277     bool Reverse = ConsecutiveStride < 0;
05278     const DataLayout &DL = I->getModule()->getDataLayout();
05279     unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy);
05280     unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF;
05281     if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
05282       bool IsComplexComputation =
05283         isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
05284       unsigned Cost = 0;
05285       // The cost of extracting from the value vector and pointer vector.
05286       Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
05287       for (unsigned i = 0; i < VF; ++i) {
05288         // The cost of extracting the pointer operand.
05289         Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
05290         // In the case of a store, the cost of ExtractElement from the vector.
05291         // In the case of a load, the cost of InsertElement into the returned
05292         // vector.
05293         Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :
05294                                             Instruction::InsertElement,
05295                                             VectorTy, i);
05296       }
05297 
05298       // The cost of the scalar loads/stores.
05299       Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
05300       Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
05301                                        Alignment, AS);
05302       return Cost;
05303     }
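// Illustrative arithmetic for the scalarized path above (not part of the
// original source): at VF = 4, with per-lane extract/insert costs of 1, an
// address computation cost of 1, and a scalar load cost of 1, the extraction
// loop contributes 4 * (1 + 1) = 8 and the address/memory terms contribute
// 4 * 1 + 4 * 1 = 8, for an estimated total of 16.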
05304 
05305     // Wide load/stores.
05306     unsigned Cost = TTI.getAddressComputationCost(VectorTy);
05307     if (Legal->isMaskRequired(I))
05308       Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment,
05309                                         AS);
05310     else
05311       Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
05312 
05313     if (Reverse)
05314       Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
05315                                   VectorTy, 0);
05316     return Cost;
05317   }
05318   case Instruction::ZExt:
05319   case Instruction::SExt:
05320   case Instruction::FPToUI:
05321   case Instruction::FPToSI:
05322   case Instruction::FPExt:
05323   case Instruction::PtrToInt:
05324   case Instruction::IntToPtr:
05325   case Instruction::SIToFP:
05326   case Instruction::UIToFP:
05327   case Instruction::Trunc:
05328   case Instruction::FPTrunc:
05329   case Instruction::BitCast: {
05330     // We optimize the truncation of induction variables.
05331     // The cost of such a truncation is the same as the scalar operation.
05332     if (I->getOpcode() == Instruction::Trunc &&
05333         Legal->isInductionVariable(I->getOperand(0)))
05334       return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
05335                                   I->getOperand(0)->getType());
05336 
05337     Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
05338     return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
05339   }
05340   case Instruction::Call: {
05341     bool NeedToScalarize;
05342     CallInst *CI = cast<CallInst>(I);
05343     unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
05344     if (getIntrinsicIDForCall(CI, TLI))
05345       return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
05346     return CallCost;
05347   }
05348   default: {
05349     // We are scalarizing the instruction. Return the cost of the scalar
05350     // instruction, plus the cost of insert and extract into vector
05351     // elements, times the vector width.
05352     unsigned Cost = 0;
05353 
05354     if (!RetTy->isVoidTy() && VF != 1) {
05355       unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
05356                                                 VectorTy);
05357       unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
05358                                                 VectorTy);
05359 
05360       // The cost of inserting the results plus extracting each one of the
05361       // operands.
05362       Cost += VF * (InsCost + ExtCost * I->getNumOperands());
05363     }
05364 
05365     // The cost of executing VF copies of the scalar instruction. This opcode
05366     // is unknown. Assume that it is the same as 'mul'.
05367     Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
05368     return Cost;
05369   }
05370   }// end of switch.
05371 }
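// Illustrative arithmetic for the default (scalarization) case above (not in
// the original source): for a two-operand instruction with a non-void result
// at VF = 4, with InsCost = ExtCost = 1 and a 'mul' cost of 1, the estimate
// is 4 * (1 + 1 * 2) + 4 * 1 = 16.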
05372 
05373 char LoopVectorize::ID = 0;
05374 static const char lv_name[] = "Loop Vectorization";
05375 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
05376 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
05377 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
05378 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
05379 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)
05380 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
05381 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
05382 INITIALIZE_PASS_DEPENDENCY(LCSSA)
05383 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
05384 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
05385 INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
05386 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
05387 
05388 namespace llvm {
05389   Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
05390     return new LoopVectorize(NoUnrolling, AlwaysVectorize);
05391   }
05392 }
05393 
05394 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
05395   // Check for a store.
05396   if (StoreInst *ST = dyn_cast<StoreInst>(Inst))
05397     return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;
05398 
05399   // Check for a load.
05400   if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
05401     return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;
05402 
05403   return false;
05404 }
05405 
05406 
05407 void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
05408                                              bool IfPredicateStore) {
05409   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
05410   // Holds vector parameters or scalars, in case of uniform vals.
05411   SmallVector<VectorParts, 4> Params;
05412 
05413   setDebugLocFromInst(Builder, Instr);
05414 
05415   // Find all of the vectorized parameters.
05416   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
05417     Value *SrcOp = Instr->getOperand(op);
05418 
05419     // If we are accessing the old induction variable, use the new one.
05420     if (SrcOp == OldInduction) {
05421       Params.push_back(getVectorValue(SrcOp));
05422       continue;
05423     }
05424 
05425     // Try using previously calculated values.
05426     Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
05427 
05428     // If the src is an instruction that appeared earlier in the basic block,
05429     // then it should already be vectorized.
05430     if (SrcInst && OrigLoop->contains(SrcInst)) {
05431       assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
05432       // The parameter is a vector value from earlier.
05433       Params.push_back(WidenMap.get(SrcInst));
05434     } else {
05435       // The parameter is a scalar from outside the loop. Maybe even a constant.
05436       VectorParts Scalars;
05437       Scalars.append(UF, SrcOp);
05438       Params.push_back(Scalars);
05439     }
05440   }
05441 
05442   assert(Params.size() == Instr->getNumOperands() &&
05443          "Invalid number of operands");
05444 
05445   // Does this instruction return a value?
05446   bool IsVoidRetTy = Instr->getType()->isVoidTy();
05447 
05448   Value *UndefVec = IsVoidRetTy ? nullptr :
05449                     UndefValue::get(Instr->getType());
05450   // Create a new entry in the WidenMap and initialize it to Undef or Null.
05451   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
05452 
05453   Instruction *InsertPt = Builder.GetInsertPoint();
05454   BasicBlock *IfBlock = Builder.GetInsertBlock();
05455   BasicBlock *CondBlock = nullptr;
05456 
05457   VectorParts Cond;
05458   Loop *VectorLp = nullptr;
05459   if (IfPredicateStore) {
05460     assert(Instr->getParent()->getSinglePredecessor() &&
05461            "Only support single predecessor blocks");
05462     Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
05463                           Instr->getParent());
05464     VectorLp = LI->getLoopFor(IfBlock);
05465     assert(VectorLp && "Must have a loop for this block");
05466   }
05467 
05468   // For each vector unroll 'part':
05469   for (unsigned Part = 0; Part < UF; ++Part) {
05470     // For each scalar that we create:
05471 
05472     // Start an "if (pred) a[i] = ..." block.
05473     Value *Cmp = nullptr;
05474     if (IfPredicateStore) {
05475       if (Cond[Part]->getType()->isVectorTy())
05476         Cond[Part] =
05477             Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
05478       Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
05479                                ConstantInt::get(Cond[Part]->getType(), 1));
05480       CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
05481       LoopVectorBody.push_back(CondBlock);
05482       VectorLp->addBasicBlockToLoop(CondBlock, *LI);
05483       // Update Builder with newly created basic block.
05484       Builder.SetInsertPoint(InsertPt);
05485     }
05486 
05487     Instruction *Cloned = Instr->clone();
05488     if (!IsVoidRetTy)
05489       Cloned->setName(Instr->getName() + ".cloned");
05490     // Replace the operands of the cloned instruction with extracted scalars.
05491     for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
05492       Value *Op = Params[op][Part];
05493       Cloned->setOperand(op, Op);
05494     }
05495 
05496     // Place the cloned scalar in the new loop.
05497     Builder.Insert(Cloned);
05498 
05499     // If the original scalar returns a value, we need to place it in a vector
05500     // so that future users will be able to use it.
05501     if (!IsVoidRetTy)
05502       VecResults[Part] = Cloned;
05503 
05504     // End if-block.
05505     if (IfPredicateStore) {
05506       BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
05507       LoopVectorBody.push_back(NewIfBlock);
05508       VectorLp->addBasicBlockToLoop(NewIfBlock, *LI);
05509       Builder.SetInsertPoint(InsertPt);
05510       Instruction *OldBr = IfBlock->getTerminator();
05511       BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
05512       OldBr->eraseFromParent();
05513       IfBlock = NewIfBlock;
05514     }
05515   }
05516 }
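// Illustrative sketch of the control flow emitted above for a predicated
// store (not part of the original source): with UF = 2, each unrolled part
// gets a scalar compare of its edge-mask value against 1, a conditional
// branch into a "cond.store" block that holds the cloned scalar store, and an
// "else" continuation block, roughly mirroring "if (pred) a[i] = v;" once per
// unrolled part.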
05517 
05518 void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
05519   StoreInst *SI = dyn_cast<StoreInst>(Instr);
05520   bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));
05521 
05522   return scalarizeInstruction(Instr, IfPredicateStore);
05523 }
05524 
05525 Value *InnerLoopUnroller::reverseVector(Value *Vec) {
05526   return Vec;
05527 }
05528 
05529 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) {
05530   return V;
05531 }
05532 
05533 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) {
05534   // When unrolling and the VF is 1, we only need to add a simple scalar.
05535   Type *ITy = Val->getType();
05536   assert(!ITy->isVectorTy() && "Val must be a scalar");
05537   Constant *C = ConstantInt::get(ITy, StartIdx);
05538   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
05539 }
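// Illustrative note (not part of the original source): assuming the caller
// passes StartIdx = VF * Part, which is simply Part when VF = 1, an unroll
// factor of 3 produces the scalars Val, Val + Step and Val + 2 * Step as the
// per-part induction values, instead of a wide vector of lane offsets.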