LLVM  mainline
BBVectorize.cpp
Go to the documentation of this file.
00001 //===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file implements a basic-block vectorization pass. The algorithm was
00011 // inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral,
00012 // et al. It works by looking for chains of pairable operations and then
00013 // pairing them.
00014 //
00015 //===----------------------------------------------------------------------===//
00016 
00017 #define BBV_NAME "bb-vectorize"
00018 #include "llvm/Transforms/Vectorize.h"
00019 #include "llvm/ADT/DenseMap.h"
00020 #include "llvm/ADT/DenseSet.h"
00021 #include "llvm/ADT/STLExtras.h"
00022 #include "llvm/ADT/SmallSet.h"
00023 #include "llvm/ADT/SmallVector.h"
00024 #include "llvm/ADT/Statistic.h"
00025 #include "llvm/ADT/StringExtras.h"
00026 #include "llvm/Analysis/AliasAnalysis.h"
00027 #include "llvm/Analysis/AliasSetTracker.h"
00028 #include "llvm/Analysis/ScalarEvolution.h"
00029 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
00030 #include "llvm/Analysis/TargetTransformInfo.h"
00031 #include "llvm/Analysis/ValueTracking.h"
00032 #include "llvm/IR/Constants.h"
00033 #include "llvm/IR/DataLayout.h"
00034 #include "llvm/IR/DerivedTypes.h"
00035 #include "llvm/IR/Dominators.h"
00036 #include "llvm/IR/Function.h"
00037 #include "llvm/IR/Instructions.h"
00038 #include "llvm/IR/IntrinsicInst.h"
00039 #include "llvm/IR/Intrinsics.h"
00040 #include "llvm/IR/LLVMContext.h"
00041 #include "llvm/IR/Metadata.h"
00042 #include "llvm/IR/Module.h"
00043 #include "llvm/IR/Type.h"
00044 #include "llvm/IR/ValueHandle.h"
00045 #include "llvm/Pass.h"
00046 #include "llvm/Support/CommandLine.h"
00047 #include "llvm/Support/Debug.h"
00048 #include "llvm/Support/raw_ostream.h"
00049 #include "llvm/Transforms/Utils/Local.h"
00050 #include <algorithm>
00051 using namespace llvm;
00052 
00053 #define DEBUG_TYPE BBV_NAME
00054 
00055 static cl::opt<bool>
00056 IgnoreTargetInfo("bb-vectorize-ignore-target-info",  cl::init(false),
00057   cl::Hidden, cl::desc("Ignore target information"));
00058 
00059 static cl::opt<unsigned>
00060 ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
00061   cl::desc("The required chain depth for vectorization"));
00062 
00063 static cl::opt<bool>
00064 UseChainDepthWithTI("bb-vectorize-use-chain-depth",  cl::init(false),
00065   cl::Hidden, cl::desc("Use the chain depth requirement with"
00066                        " target information"));
00067 
00068 static cl::opt<unsigned>
00069 SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
00070   cl::desc("The maximum search distance for instruction pairs"));
00071 
00072 static cl::opt<bool>
00073 SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden,
00074   cl::desc("Replicating one element to a pair breaks the chain"));
00075 
00076 static cl::opt<unsigned>
00077 VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden,
00078   cl::desc("The size of the native vector registers"));
00079 
00080 static cl::opt<unsigned>
00081 MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
00082   cl::desc("The maximum number of pairing iterations"));
00083 
00084 static cl::opt<bool>
00085 Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden,
00086   cl::desc("Don't try to form non-2^n-length vectors"));
00087 
00088 static cl::opt<unsigned>
00089 MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden,
00090   cl::desc("The maximum number of pairable instructions per group"));
00091 
00092 static cl::opt<unsigned>
00093 MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden,
00094   cl::desc("The maximum number of candidate instruction pairs per group"));
00095 
00096 static cl::opt<unsigned>
00097 MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200),
00098   cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"
00099                        " a full cycle check"));
00100 
00101 static cl::opt<bool>
00102 NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden,
00103   cl::desc("Don't try to vectorize boolean (i1) values"));
00104 
00105 static cl::opt<bool>
00106 NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden,
00107   cl::desc("Don't try to vectorize integer values"));
00108 
00109 static cl::opt<bool>
00110 NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden,
00111   cl::desc("Don't try to vectorize floating-point values"));
00112 
00113 // FIXME: This should default to false once pointer vector support works.
00114 static cl::opt<bool>
00115 NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden,
00116   cl::desc("Don't try to vectorize pointer values"));
00117 
00118 static cl::opt<bool>
00119 NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden,
00120   cl::desc("Don't try to vectorize casting (conversion) operations"));
00121 
00122 static cl::opt<bool>
00123 NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden,
00124   cl::desc("Don't try to vectorize floating-point math intrinsics"));
00125 
00126 static cl::opt<bool>
00127   NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden,
00128   cl::desc("Don't try to vectorize BitManipulation intrinsics"));
00129 
00130 static cl::opt<bool>
00131 NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden,
00132   cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
00133 
00134 static cl::opt<bool>
00135 NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden,
00136   cl::desc("Don't try to vectorize select instructions"));
00137 
00138 static cl::opt<bool>
00139 NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden,
00140   cl::desc("Don't try to vectorize comparison instructions"));
00141 
00142 static cl::opt<bool>
00143 NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden,
00144   cl::desc("Don't try to vectorize getelementptr instructions"));
00145 
00146 static cl::opt<bool>
00147 NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden,
00148   cl::desc("Don't try to vectorize loads and stores"));
00149 
00150 static cl::opt<bool>
00151 AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden,
00152   cl::desc("Only generate aligned loads and stores"));
00153 
00154 static cl::opt<bool>
00155 NoMemOpBoost("bb-vectorize-no-mem-op-boost",
00156   cl::init(false), cl::Hidden,
00157   cl::desc("Don't boost the chain-depth contribution of loads and stores"));
00158 
00159 static cl::opt<bool>
00160 FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden,
00161   cl::desc("Use a fast instruction dependency analysis"));
00162 
00163 #ifndef NDEBUG
00164 static cl::opt<bool>
00165 DebugInstructionExamination("bb-vectorize-debug-instruction-examination",
00166   cl::init(false), cl::Hidden,
00167   cl::desc("When debugging is enabled, output information on the"
00168            " instruction-examination process"));
00169 static cl::opt<bool>
00170 DebugCandidateSelection("bb-vectorize-debug-candidate-selection",
00171   cl::init(false), cl::Hidden,
00172   cl::desc("When debugging is enabled, output information on the"
00173            " candidate-selection process"));
00174 static cl::opt<bool>
00175 DebugPairSelection("bb-vectorize-debug-pair-selection",
00176   cl::init(false), cl::Hidden,
00177   cl::desc("When debugging is enabled, output information on the"
00178            " pair-selection process"));
00179 static cl::opt<bool>
00180 DebugCycleCheck("bb-vectorize-debug-cycle-check",
00181   cl::init(false), cl::Hidden,
00182   cl::desc("When debugging is enabled, output information on the"
00183            " cycle-checking process"));
00184 
00185 static cl::opt<bool>
00186 PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair",
00187   cl::init(false), cl::Hidden,
00188   cl::desc("When debugging is enabled, dump the basic block after"
00189            " every pair is fused"));
00190 #endif
00191 
00192 STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
00193 
00194 namespace {
00195   struct BBVectorize : public BasicBlockPass {
00196     static char ID; // Pass identification, replacement for typeid
00197 
00198     const VectorizeConfig Config;
00199 
00200     BBVectorize(const VectorizeConfig &C = VectorizeConfig())
00201       : BasicBlockPass(ID), Config(C) {
00202       initializeBBVectorizePass(*PassRegistry::getPassRegistry());
00203     }
00204 
00205     BBVectorize(Pass *P, Function &F, const VectorizeConfig &C)
00206       : BasicBlockPass(ID), Config(C) {
00207       AA = &P->getAnalysis<AliasAnalysis>();
00208       DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
00209       SE = &P->getAnalysis<ScalarEvolution>();
00210       TTI = IgnoreTargetInfo
00211                 ? nullptr
00212                 : &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
00213     }
00214 
00215     typedef std::pair<Value *, Value *> ValuePair;
00216     typedef std::pair<ValuePair, int> ValuePairWithCost;
00217     typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
00218     typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
00219     typedef std::pair<VPPair, unsigned> VPPairWithType;
00220 
00221     AliasAnalysis *AA;
00222     DominatorTree *DT;
00223     ScalarEvolution *SE;
00224     const TargetTransformInfo *TTI;
00225 
00226     // FIXME: const correct?
00227 
00228     bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false);
00229 
00230     bool getCandidatePairs(BasicBlock &BB,
00231                        BasicBlock::iterator &Start,
00232                        DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
00233                        DenseSet<ValuePair> &FixedOrderPairs,
00234                        DenseMap<ValuePair, int> &CandidatePairCostSavings,
00235                        std::vector<Value *> &PairableInsts, bool NonPow2Len);
00236 
00237     // FIXME: The current implementation does not account for pairs that
00238     // are connected in multiple ways. For example:
00239     //   C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap)
00240     enum PairConnectionType {
00241       PairConnectionDirect,
00242       PairConnectionSwap,
00243       PairConnectionSplat
00244     };
00245 
00246     void computeConnectedPairs(
00247              DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
00248              DenseSet<ValuePair> &CandidatePairsSet,
00249              std::vector<Value *> &PairableInsts,
00250              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
00251              DenseMap<VPPair, unsigned> &PairConnectionTypes);
00252 
00253     void buildDepMap(BasicBlock &BB,
00254              DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
00255              std::vector<Value *> &PairableInsts,
00256              DenseSet<ValuePair> &PairableInstUsers);
00257 
00258     void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
00259              DenseSet<ValuePair> &CandidatePairsSet,
00260              DenseMap<ValuePair, int> &CandidatePairCostSavings,
00261              std::vector<Value *> &PairableInsts,
00262              DenseSet<ValuePair> &FixedOrderPairs,
00263              DenseMap<VPPair, unsigned> &PairConnectionTypes,
00264              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
00265              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
00266              DenseSet<ValuePair> &PairableInstUsers,
00267              DenseMap<Value *, Value *>& ChosenPairs);
00268 
00269     void fuseChosenPairs(BasicBlock &BB,
00270              std::vector<Value *> &PairableInsts,
00271              DenseMap<Value *, Value *>& ChosenPairs,
00272              DenseSet<ValuePair> &FixedOrderPairs,
00273              DenseMap<VPPair, unsigned> &PairConnectionTypes,
00274              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
00275              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps);
00276 
00277 
00278     bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
00279 
00280     bool areInstsCompatible(Instruction *I, Instruction *J,
00281                        bool IsSimpleLoadStore, bool NonPow2Len,
00282                        int &CostSavings, int &FixedOrder);
00283 
00284     bool trackUsesOfI(DenseSet<Value *> &Users,
00285                       AliasSetTracker &WriteSet, Instruction *I,
00286                       Instruction *J, bool UpdateUsers = true,
00287                       DenseSet<ValuePair> *LoadMoveSetPairs = nullptr);
00288 
00289   void computePairsConnectedTo(
00290              DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
00291              DenseSet<ValuePair> &CandidatePairsSet,
00292              std::vector<Value *> &PairableInsts,
00293              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
00294              DenseMap<VPPair, unsigned> &PairConnectionTypes,
00295              ValuePair P);
00296 
00297     bool pairsConflict(ValuePair P, ValuePair Q,
00298              DenseSet<ValuePair> &PairableInstUsers,
00299              DenseMap<ValuePair, std::vector<ValuePair> >
00300                *PairableInstUserMap = nullptr,
00301              DenseSet<VPPair> *PairableInstUserPairSet = nullptr);
00302 
00303     bool pairWillFormCycle(ValuePair P,
00304              DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers,
00305              DenseSet<ValuePair> &CurrentPairs);
00306 
00307     void pruneDAGFor(
00308              DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
00309              std::vector<Value *> &PairableInsts,
00310              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
00311              DenseSet<ValuePair> &PairableInstUsers,
00312              DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
00313              DenseSet<VPPair> &PairableInstUserPairSet,
00314              DenseMap<Value *, Value *> &ChosenPairs,
00315              DenseMap<ValuePair, size_t> &DAG,
00316              DenseSet<ValuePair> &PrunedDAG, ValuePair J,
00317              bool UseCycleCheck);
00318 
00319     void buildInitialDAGFor(
00320              DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
00321              DenseSet<ValuePair> &CandidatePairsSet,
00322              std::vector<Value *> &PairableInsts,
00323              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
00324              DenseSet<ValuePair> &PairableInstUsers,
00325              DenseMap<Value *, Value *> &ChosenPairs,
00326              DenseMap<ValuePair, size_t> &DAG, ValuePair J);
00327 
00328     void findBestDAGFor(
00329              DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
00330              DenseSet<ValuePair> &CandidatePairsSet,
00331              DenseMap<ValuePair, int> &CandidatePairCostSavings,
00332              std::vector<Value *> &PairableInsts,
00333              DenseSet<ValuePair> &FixedOrderPairs,
00334              DenseMap<VPPair, unsigned> &PairConnectionTypes,
00335              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
00336              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
00337              DenseSet<ValuePair> &PairableInstUsers,
00338              DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
00339              DenseSet<VPPair> &PairableInstUserPairSet,
00340              DenseMap<Value *, Value *> &ChosenPairs,
00341              DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
00342              int &BestEffSize, Value *II, std::vector<Value *>&JJ,
00343              bool UseCycleCheck);
00344 
00345     Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
00346                      Instruction *J, unsigned o);
00347 
00348     void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
00349                      unsigned MaskOffset, unsigned NumInElem,
00350                      unsigned NumInElem1, unsigned IdxOffset,
00351                      std::vector<Constant*> &Mask);
00352 
00353     Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I,
00354                      Instruction *J);
00355 
00356     bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J,
00357                        unsigned o, Value *&LOp, unsigned numElemL,
00358                        Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ,
00359                        unsigned IdxOff = 0);
00360 
00361     Value *getReplacementInput(LLVMContext& Context, Instruction *I,
00362                      Instruction *J, unsigned o, bool IBeforeJ);
00363 
00364     void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
00365                      Instruction *J, SmallVectorImpl<Value *> &ReplacedOperands,
00366                      bool IBeforeJ);
00367 
00368     void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
00369                      Instruction *J, Instruction *K,
00370                      Instruction *&InsertionPt, Instruction *&K1,
00371                      Instruction *&K2);
00372 
00373     void collectPairLoadMoveSet(BasicBlock &BB,
00374                      DenseMap<Value *, Value *> &ChosenPairs,
00375                      DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
00376                      DenseSet<ValuePair> &LoadMoveSetPairs,
00377                      Instruction *I);
00378 
00379     void collectLoadMoveSet(BasicBlock &BB,
00380                      std::vector<Value *> &PairableInsts,
00381                      DenseMap<Value *, Value *> &ChosenPairs,
00382                      DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
00383                      DenseSet<ValuePair> &LoadMoveSetPairs);
00384 
00385     bool canMoveUsesOfIAfterJ(BasicBlock &BB,
00386                      DenseSet<ValuePair> &LoadMoveSetPairs,
00387                      Instruction *I, Instruction *J);
00388 
00389     void moveUsesOfIAfterJ(BasicBlock &BB,
00390                      DenseSet<ValuePair> &LoadMoveSetPairs,
00391                      Instruction *&InsertionPt,
00392                      Instruction *I, Instruction *J);
00393 
00394     bool vectorizeBB(BasicBlock &BB) {
00395       if (skipOptnoneFunction(BB))
00396         return false;
00397       if (!DT->isReachableFromEntry(&BB)) {
00398         DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() <<
00399               " in " << BB.getParent()->getName() << "\n");
00400         return false;
00401       }
00402 
00403       DEBUG(if (TTI) dbgs() << "BBV: using target information\n");
00404 
00405       bool changed = false;
00406       // Iterate a sufficient number of times to merge types of size 1 bit,
00407       // then 2 bits, then 4, etc. up to half of the target vector width of the
00408       // target vector register.
00409       unsigned n = 1;
00410       for (unsigned v = 2;
00411            (TTI || v <= Config.VectorBits) &&
00412            (!Config.MaxIter || n <= Config.MaxIter);
00413            v *= 2, ++n) {
00414         DEBUG(dbgs() << "BBV: fusing loop #" << n <<
00415               " for " << BB.getName() << " in " <<
00416               BB.getParent()->getName() << "...\n");
00417         if (vectorizePairs(BB))
00418           changed = true;
00419         else
00420           break;
00421       }
00422 
00423       if (changed && !Pow2LenOnly) {
00424         ++n;
00425         for (; !Config.MaxIter || n <= Config.MaxIter; ++n) {
00426           DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " <<
00427                 n << " for " << BB.getName() << " in " <<
00428                 BB.getParent()->getName() << "...\n");
00429           if (!vectorizePairs(BB, true)) break;
00430         }
00431       }
00432 
00433       DEBUG(dbgs() << "BBV: done!\n");
00434       return changed;
00435     }
00436 
00437     bool runOnBasicBlock(BasicBlock &BB) override {
00438       // OptimizeNone check deferred to vectorizeBB().
00439 
00440       AA = &getAnalysis<AliasAnalysis>();
00441       DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
00442       SE = &getAnalysis<ScalarEvolution>();
00443       TTI = IgnoreTargetInfo
00444                 ? nullptr
00445                 : &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
00446                       *BB.getParent());
00447 
00448       return vectorizeBB(BB);
00449     }
00450 
00451     void getAnalysisUsage(AnalysisUsage &AU) const override {
00452       BasicBlockPass::getAnalysisUsage(AU);
00453       AU.addRequired<AliasAnalysis>();
00454       AU.addRequired<DominatorTreeWrapperPass>();
00455       AU.addRequired<ScalarEvolution>();
00456       AU.addRequired<TargetTransformInfoWrapperPass>();
00457       AU.addPreserved<AliasAnalysis>();
00458       AU.addPreserved<DominatorTreeWrapperPass>();
00459       AU.addPreserved<ScalarEvolution>();
00460       AU.setPreservesCFG();
00461     }
00462 
00463     static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) {
00464       assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() &&
00465              "Cannot form vector from incompatible scalar types");
00466       Type *STy = ElemTy->getScalarType();
00467 
00468       unsigned numElem;
00469       if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) {
00470         numElem = VTy->getNumElements();
00471       } else {
00472         numElem = 1;
00473       }
00474 
00475       if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) {
00476         numElem += VTy->getNumElements();
00477       } else {
00478         numElem += 1;
00479       }
00480 
00481       return VectorType::get(STy, numElem);
00482     }
00483 
00484     static inline void getInstructionTypes(Instruction *I,
00485                                            Type *&T1, Type *&T2) {
00486       if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
00487         // For stores, it is the value type, not the pointer type that matters
00488         // because the value is what will come from a vector register.
00489   
00490         Value *IVal = SI->getValueOperand();
00491         T1 = IVal->getType();
00492       } else {
00493         T1 = I->getType();
00494       }
00495   
00496       if (CastInst *CI = dyn_cast<CastInst>(I))
00497         T2 = CI->getSrcTy();
00498       else
00499         T2 = T1;
00500 
00501       if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
00502         T2 = SI->getCondition()->getType();
00503       } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
00504         T2 = SI->getOperand(0)->getType();
00505       } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) {
00506         T2 = CI->getOperand(0)->getType();
00507       }
00508     }
00509 
00510     // Returns the weight associated with the provided value. A chain of
00511     // candidate pairs has a length given by the sum of the weights of its
00512     // members (one weight per pair; the weight of each member of the pair
00513     // is assumed to be the same). This length is then compared to the
00514     // chain-length threshold to determine if a given chain is significant
00515     // enough to be vectorized. The length is also used in comparing
00516     // candidate chains where longer chains are considered to be better.
00517     // Note: when this function returns 0, the resulting instructions are
00518     // not actually fused.
00519     inline size_t getDepthFactor(Value *V) {
00520       // InsertElement and ExtractElement have a depth factor of zero. This is
00521       // for two reasons: First, they cannot be usefully fused. Second, because
00522       // the pass generates a lot of these, they can confuse the simple metric
00523       // used to compare the dags in the next iteration. Thus, giving them a
00524       // weight of zero allows the pass to essentially ignore them in
00525       // subsequent iterations when looking for vectorization opportunities
00526       // while still tracking dependency chains that flow through those
00527       // instructions.
00528       if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V))
00529         return 0;
00530 
00531       // Give a load or store half of the required depth so that load/store
00532       // pairs will vectorize.
00533       if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V)))
00534         return Config.ReqChainDepth/2;
00535 
00536       return 1;
00537     }
00538 
00539     // Returns the cost of the provided instruction using TTI.
00540     // This does not handle loads and stores.
00541     unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2,
00542                           TargetTransformInfo::OperandValueKind Op1VK = 
00543                               TargetTransformInfo::OK_AnyValue,
00544                           TargetTransformInfo::OperandValueKind Op2VK =
00545                               TargetTransformInfo::OK_AnyValue) {
00546       switch (Opcode) {
00547       default: break;
00548       case Instruction::GetElementPtr:
00549         // We mark this instruction as zero-cost because scalar GEPs are usually
00550         // lowered to the instruction addressing mode. At the moment we don't
00551         // generate vector GEPs.
00552         return 0;
00553       case Instruction::Br:
00554         return TTI->getCFInstrCost(Opcode);
00555       case Instruction::PHI:
00556         return 0;
00557       case Instruction::Add:
00558       case Instruction::FAdd:
00559       case Instruction::Sub:
00560       case Instruction::FSub:
00561       case Instruction::Mul:
00562       case Instruction::FMul:
00563       case Instruction::UDiv:
00564       case Instruction::SDiv:
00565       case Instruction::FDiv:
00566       case Instruction::URem:
00567       case Instruction::SRem:
00568       case Instruction::FRem:
00569       case Instruction::Shl:
00570       case Instruction::LShr:
00571       case Instruction::AShr:
00572       case Instruction::And:
00573       case Instruction::Or:
00574       case Instruction::Xor:
00575         return TTI->getArithmeticInstrCost(Opcode, T1, Op1VK, Op2VK);
00576       case Instruction::Select:
00577       case Instruction::ICmp:
00578       case Instruction::FCmp:
00579         return TTI->getCmpSelInstrCost(Opcode, T1, T2);
00580       case Instruction::ZExt:
00581       case Instruction::SExt:
00582       case Instruction::FPToUI:
00583       case Instruction::FPToSI:
00584       case Instruction::FPExt:
00585       case Instruction::PtrToInt:
00586       case Instruction::IntToPtr:
00587       case Instruction::SIToFP:
00588       case Instruction::UIToFP:
00589       case Instruction::Trunc:
00590       case Instruction::FPTrunc:
00591       case Instruction::BitCast:
00592       case Instruction::ShuffleVector:
00593         return TTI->getCastInstrCost(Opcode, T1, T2);
00594       }
00595 
00596       return 1;
00597     }
00598 
00599     // This determines the relative offset of two loads or stores, returning
00600     // true if the offset could be determined to be some constant value.
00601     // For example, if OffsetInElmts == 1, then J accesses the memory directly
00602     // after I; if OffsetInElmts == -1 then I accesses the memory
00603     // directly after J.
00604     bool getPairPtrInfo(Instruction *I, Instruction *J,
00605         Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
00606         unsigned &IAddressSpace, unsigned &JAddressSpace,
00607         int64_t &OffsetInElmts, bool ComputeOffset = true) {
00608       OffsetInElmts = 0;
00609       if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
00610         LoadInst *LJ = cast<LoadInst>(J);
00611         IPtr = LI->getPointerOperand();
00612         JPtr = LJ->getPointerOperand();
00613         IAlignment = LI->getAlignment();
00614         JAlignment = LJ->getAlignment();
00615         IAddressSpace = LI->getPointerAddressSpace();
00616         JAddressSpace = LJ->getPointerAddressSpace();
00617       } else {
00618         StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J);
00619         IPtr = SI->getPointerOperand();
00620         JPtr = SJ->getPointerOperand();
00621         IAlignment = SI->getAlignment();
00622         JAlignment = SJ->getAlignment();
00623         IAddressSpace = SI->getPointerAddressSpace();
00624         JAddressSpace = SJ->getPointerAddressSpace();
00625       }
00626 
00627       if (!ComputeOffset)
00628         return true;
00629 
00630       const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
00631       const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
00632 
00633       // If this is a trivial offset, then we'll get something like
00634       // 1*sizeof(type). With target data, which we need anyway, this will get
00635       // constant folded into a number.
00636       const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV);
00637       if (const SCEVConstant *ConstOffSCEV =
00638             dyn_cast<SCEVConstant>(OffsetSCEV)) {
00639         ConstantInt *IntOff = ConstOffSCEV->getValue();
00640         int64_t Offset = IntOff->getSExtValue();
00641         const DataLayout &DL = I->getModule()->getDataLayout();
00642         Type *VTy = IPtr->getType()->getPointerElementType();
00643         int64_t VTyTSS = (int64_t)DL.getTypeStoreSize(VTy);
00644 
00645         Type *VTy2 = JPtr->getType()->getPointerElementType();
00646         if (VTy != VTy2 && Offset < 0) {
00647           int64_t VTy2TSS = (int64_t)DL.getTypeStoreSize(VTy2);
00648           OffsetInElmts = Offset/VTy2TSS;
00649           return (std::abs(Offset) % VTy2TSS) == 0;
00650         }
00651 
00652         OffsetInElmts = Offset/VTyTSS;
00653         return (std::abs(Offset) % VTyTSS) == 0;
00654       }
00655 
00656       return false;
00657     }
00658 
00659     // Returns true if the provided CallInst represents an intrinsic that can
00660     // be vectorized.
00661     bool isVectorizableIntrinsic(CallInst* I) {
00662       Function *F = I->getCalledFunction();
00663       if (!F) return false;
00664 
00665       Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID();
00666       if (!IID) return false;
00667 
00668       switch(IID) {
00669       default:
00670         return false;
00671       case Intrinsic::sqrt:
00672       case Intrinsic::powi:
00673       case Intrinsic::sin:
00674       case Intrinsic::cos:
00675       case Intrinsic::log:
00676       case Intrinsic::log2:
00677       case Intrinsic::log10:
00678       case Intrinsic::exp:
00679       case Intrinsic::exp2:
00680       case Intrinsic::pow:
00681       case Intrinsic::round:
00682       case Intrinsic::copysign:
00683       case Intrinsic::ceil:
00684       case Intrinsic::nearbyint:
00685       case Intrinsic::rint:
00686       case Intrinsic::trunc:
00687       case Intrinsic::floor:
00688       case Intrinsic::fabs:
00689       case Intrinsic::minnum:
00690       case Intrinsic::maxnum:
00691         return Config.VectorizeMath;
00692       case Intrinsic::bswap:
00693       case Intrinsic::ctpop:
00694       case Intrinsic::ctlz:
00695       case Intrinsic::cttz:
00696         return Config.VectorizeBitManipulations;
00697       case Intrinsic::fma:
00698       case Intrinsic::fmuladd:
00699         return Config.VectorizeFMA;
00700       }
00701     }
00702 
00703     bool isPureIEChain(InsertElementInst *IE) {
00704       InsertElementInst *IENext = IE;
00705       do {
00706         if (!isa<UndefValue>(IENext->getOperand(0)) &&
00707             !isa<InsertElementInst>(IENext->getOperand(0))) {
00708           return false;
00709         }
00710       } while ((IENext =
00711                  dyn_cast<InsertElementInst>(IENext->getOperand(0))));
00712 
00713       return true;
00714     }
00715   };
00716 
00717   // This function implements one vectorization iteration on the provided
00718   // basic block. It returns true if the block is changed.
00719   bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) {
00720     bool ShouldContinue;
00721     BasicBlock::iterator Start = BB.getFirstInsertionPt();
00722 
00723     std::vector<Value *> AllPairableInsts;
00724     DenseMap<Value *, Value *> AllChosenPairs;
00725     DenseSet<ValuePair> AllFixedOrderPairs;
00726     DenseMap<VPPair, unsigned> AllPairConnectionTypes;
00727     DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs,
00728                                                  AllConnectedPairDeps;
00729 
00730     do {
00731       std::vector<Value *> PairableInsts;
00732       DenseMap<Value *, std::vector<Value *> > CandidatePairs;
00733       DenseSet<ValuePair> FixedOrderPairs;
00734       DenseMap<ValuePair, int> CandidatePairCostSavings;
00735       ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
00736                                          FixedOrderPairs,
00737                                          CandidatePairCostSavings,
00738                                          PairableInsts, NonPow2Len);
00739       if (PairableInsts.empty()) continue;
00740 
00741       // Build the candidate pair set for faster lookups.
00742       DenseSet<ValuePair> CandidatePairsSet;
00743       for (DenseMap<Value *, std::vector<Value *> >::iterator I =
00744            CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I)
00745         for (std::vector<Value *>::iterator J = I->second.begin(),
00746              JE = I->second.end(); J != JE; ++J)
00747           CandidatePairsSet.insert(ValuePair(I->first, *J));
00748 
00749       // Now we have a map of all of the pairable instructions and we need to
00750       // select the best possible pairing. A good pairing is one such that the
00751       // users of the pair are also paired. This defines a (directed) forest
00752       // over the pairs such that two pairs are connected iff the second pair
00753       // uses the first.
00754 
00755       // Note that it only matters that both members of the second pair use some
00756       // element of the first pair (to allow for splatting).
00757 
00758       DenseMap<ValuePair, std::vector<ValuePair> > ConnectedPairs,
00759                                                    ConnectedPairDeps;
00760       DenseMap<VPPair, unsigned> PairConnectionTypes;
00761       computeConnectedPairs(CandidatePairs, CandidatePairsSet,
00762                             PairableInsts, ConnectedPairs, PairConnectionTypes);
00763       if (ConnectedPairs.empty()) continue;
00764 
00765       for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
00766            I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
00767            I != IE; ++I)
00768         for (std::vector<ValuePair>::iterator J = I->second.begin(),
00769              JE = I->second.end(); J != JE; ++J)
00770           ConnectedPairDeps[*J].push_back(I->first);
00771 
00772       // Build the pairable-instruction dependency map
00773       DenseSet<ValuePair> PairableInstUsers;
00774       buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
00775 
00776       // There is now a graph of the connected pairs. For each variable, pick
00777       // the pairing with the largest dag meeting the depth requirement on at
00778       // least one branch. Then select all pairings that are part of that dag
00779       // and remove them from the list of available pairings and pairable
00780       // variables.
00781 
00782       DenseMap<Value *, Value *> ChosenPairs;
00783       choosePairs(CandidatePairs, CandidatePairsSet,
00784         CandidatePairCostSavings,
00785         PairableInsts, FixedOrderPairs, PairConnectionTypes,
00786         ConnectedPairs, ConnectedPairDeps,
00787         PairableInstUsers, ChosenPairs);
00788 
00789       if (ChosenPairs.empty()) continue;
00790       AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(),
00791                               PairableInsts.end());
00792       AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
00793 
00794       // Only for the chosen pairs, propagate information on fixed-order pairs,
00795       // pair connections, and their types to the data structures used by the
00796       // pair fusion procedures.
00797       for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(),
00798            IE = ChosenPairs.end(); I != IE; ++I) {
00799         if (FixedOrderPairs.count(*I))
00800           AllFixedOrderPairs.insert(*I);
00801         else if (FixedOrderPairs.count(ValuePair(I->second, I->first)))
00802           AllFixedOrderPairs.insert(ValuePair(I->second, I->first));
00803 
00804         for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin();
00805              J != IE; ++J) {
00806           DenseMap<VPPair, unsigned>::iterator K =
00807             PairConnectionTypes.find(VPPair(*I, *J));
00808           if (K != PairConnectionTypes.end()) {
00809             AllPairConnectionTypes.insert(*K);
00810           } else {
00811             K = PairConnectionTypes.find(VPPair(*J, *I));
00812             if (K != PairConnectionTypes.end())
00813               AllPairConnectionTypes.insert(*K);
00814           }
00815         }
00816       }
00817 
00818       for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
00819            I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
00820            I != IE; ++I)
00821         for (std::vector<ValuePair>::iterator J = I->second.begin(),
00822           JE = I->second.end(); J != JE; ++J)
00823           if (AllPairConnectionTypes.count(VPPair(I->first, *J))) {
00824             AllConnectedPairs[I->first].push_back(*J);
00825             AllConnectedPairDeps[*J].push_back(I->first);
00826           }
00827     } while (ShouldContinue);
00828 
00829     if (AllChosenPairs.empty()) return false;
00830     NumFusedOps += AllChosenPairs.size();
00831 
00832     // A set of pairs has now been selected. It is now necessary to replace the
00833     // paired instructions with vector instructions. For this procedure each
00834     // operand must be replaced with a vector operand. This vector is formed
00835     // by using build_vector on the old operands. The replaced values are then
00836     // replaced with a vector_extract on the result.  Subsequent optimization
00837     // passes should coalesce the build/extract combinations.
00838 
00839     fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs,
00840                     AllPairConnectionTypes,
00841                     AllConnectedPairs, AllConnectedPairDeps);
00842 
00843     // It is important to cleanup here so that future iterations of this
00844     // function have less work to do.
00845     (void)SimplifyInstructionsInBlock(&BB, AA->getTargetLibraryInfo());
00846     return true;
00847   }
00848 
00849   // This function returns true if the provided instruction is capable of being
00850   // fused into a vector instruction. This determination is based only on the
00851   // type and other attributes of the instruction.
00852   bool BBVectorize::isInstVectorizable(Instruction *I,
00853                                          bool &IsSimpleLoadStore) {
00854     IsSimpleLoadStore = false;
00855 
00856     if (CallInst *C = dyn_cast<CallInst>(I)) {
00857       if (!isVectorizableIntrinsic(C))
00858         return false;
00859     } else if (LoadInst *L = dyn_cast<LoadInst>(I)) {
00860       // Vectorize simple loads if possbile:
00861       IsSimpleLoadStore = L->isSimple();
00862       if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
00863         return false;
00864     } else if (StoreInst *S = dyn_cast<StoreInst>(I)) {
00865       // Vectorize simple stores if possbile:
00866       IsSimpleLoadStore = S->isSimple();
00867       if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
00868         return false;
00869     } else if (CastInst *C = dyn_cast<CastInst>(I)) {
00870       // We can vectorize casts, but not casts of pointer types, etc.
00871       if (!Config.VectorizeCasts)
00872         return false;
00873 
00874       Type *SrcTy = C->getSrcTy();
00875       if (!SrcTy->isSingleValueType())
00876         return false;
00877 
00878       Type *DestTy = C->getDestTy();
00879       if (!DestTy->isSingleValueType())
00880         return false;
00881     } else if (isa<SelectInst>(I)) {
00882       if (!Config.VectorizeSelect)
00883         return false;
00884     } else if (isa<CmpInst>(I)) {
00885       if (!Config.VectorizeCmp)
00886         return false;
00887     } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) {
00888       if (!Config.VectorizeGEP)
00889         return false;
00890 
00891       // Currently, vector GEPs exist only with one index.
00892       if (G->getNumIndices() != 1)
00893         return false;
00894     } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) ||
00895         isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) {
00896       return false;
00897     }
00898 
00899     Type *T1, *T2;
00900     getInstructionTypes(I, T1, T2);
00901 
00902     // Not every type can be vectorized...
00903     if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) ||
00904         !(VectorType::isValidElementType(T2) || T2->isVectorTy()))
00905       return false;
00906 
00907     if (T1->getScalarSizeInBits() == 1) {
00908       if (!Config.VectorizeBools)
00909         return false;
00910     } else {
00911       if (!Config.VectorizeInts && T1->isIntOrIntVectorTy())
00912         return false;
00913     }
00914 
00915     if (T2->getScalarSizeInBits() == 1) {
00916       if (!Config.VectorizeBools)
00917         return false;
00918     } else {
00919       if (!Config.VectorizeInts && T2->isIntOrIntVectorTy())
00920         return false;
00921     }
00922 
00923     if (!Config.VectorizeFloats
00924         && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
00925       return false;
00926 
00927     // Don't vectorize target-specific types.
00928     if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy())
00929       return false;
00930     if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy())
00931       return false;
00932 
00933     if (!Config.VectorizePointers && (T1->getScalarType()->isPointerTy() ||
00934                                       T2->getScalarType()->isPointerTy()))
00935       return false;
00936 
00937     if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
00938                  T2->getPrimitiveSizeInBits() >= Config.VectorBits))
00939       return false;
00940 
00941     return true;
00942   }
00943 
00944   // This function returns true if the two provided instructions are compatible
00945   // (meaning that they can be fused into a vector instruction). This assumes
00946   // that I has already been determined to be vectorizable and that J is not
00947   // in the use dag of I.
00948   bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
00949                        bool IsSimpleLoadStore, bool NonPow2Len,
00950                        int &CostSavings, int &FixedOrder) {
00951     DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
00952                      " <-> " << *J << "\n");
00953 
00954     CostSavings = 0;
00955     FixedOrder = 0;
00956 
00957     // Loads and stores can be merged if they have different alignments,
00958     // but are otherwise the same.
00959     if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment |
00960                       (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0)))
00961       return false;
00962 
00963     Type *IT1, *IT2, *JT1, *JT2;
00964     getInstructionTypes(I, IT1, IT2);
00965     getInstructionTypes(J, JT1, JT2);
00966     unsigned MaxTypeBits = std::max(
00967       IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(),
00968       IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits());
00969     if (!TTI && MaxTypeBits > Config.VectorBits)
00970       return false;
00971 
00972     // FIXME: handle addsub-type operations!
00973 
00974     if (IsSimpleLoadStore) {
00975       Value *IPtr, *JPtr;
00976       unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
00977       int64_t OffsetInElmts = 0;
00978       if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
00979                          IAddressSpace, JAddressSpace, OffsetInElmts) &&
00980           std::abs(OffsetInElmts) == 1) {
00981         FixedOrder = (int) OffsetInElmts;
00982         unsigned BottomAlignment = IAlignment;
00983         if (OffsetInElmts < 0) BottomAlignment = JAlignment;
00984 
00985         Type *aTypeI = isa<StoreInst>(I) ?
00986           cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
00987         Type *aTypeJ = isa<StoreInst>(J) ?
00988           cast<StoreInst>(J)->getValueOperand()->getType() : J->getType();
00989         Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
00990 
00991         if (Config.AlignedOnly) {
00992           // An aligned load or store is possible only if the instruction
00993           // with the lower offset has an alignment suitable for the
00994           // vector type.
00995           const DataLayout &DL = I->getModule()->getDataLayout();
00996           unsigned VecAlignment = DL.getPrefTypeAlignment(VType);
00997           if (BottomAlignment < VecAlignment)
00998             return false;
00999         }
01000 
01001         if (TTI) {
01002           unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI,
01003                                                 IAlignment, IAddressSpace);
01004           unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ,
01005                                                 JAlignment, JAddressSpace);
01006           unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType,
01007                                                 BottomAlignment,
01008                                                 IAddressSpace);
01009 
01010           ICost += TTI->getAddressComputationCost(aTypeI);
01011           JCost += TTI->getAddressComputationCost(aTypeJ);
01012           VCost += TTI->getAddressComputationCost(VType);
01013 
01014           if (VCost > ICost + JCost)
01015             return false;
01016 
01017           // We don't want to fuse to a type that will be split, even
01018           // if the two input types will also be split and there is no other
01019           // associated cost.
01020           unsigned VParts = TTI->getNumberOfParts(VType);
01021           if (VParts > 1)
01022             return false;
01023           else if (!VParts && VCost == ICost + JCost)
01024             return false;
01025 
01026           CostSavings = ICost + JCost - VCost;
01027         }
01028       } else {
01029         return false;
01030       }
01031     } else if (TTI) {
01032       unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2);
01033       unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
01034       Type *VT1 = getVecTypeForPair(IT1, JT1),
01035            *VT2 = getVecTypeForPair(IT2, JT2);
01036       TargetTransformInfo::OperandValueKind Op1VK =
01037           TargetTransformInfo::OK_AnyValue;
01038       TargetTransformInfo::OperandValueKind Op2VK =
01039           TargetTransformInfo::OK_AnyValue;
01040 
01041       // On some targets (example X86) the cost of a vector shift may vary
01042       // depending on whether the second operand is a Uniform or
01043       // NonUniform Constant.
01044       switch (I->getOpcode()) {
01045       default : break;
01046       case Instruction::Shl:
01047       case Instruction::LShr:
01048       case Instruction::AShr:
01049 
01050         // If both I and J are scalar shifts by constant, then the
01051         // merged vector shift count would be either a constant splat value
01052         // or a non-uniform vector of constants.
01053         if (ConstantInt *CII = dyn_cast<ConstantInt>(I->getOperand(1))) {
01054           if (ConstantInt *CIJ = dyn_cast<ConstantInt>(J->getOperand(1)))
01055             Op2VK = CII == CIJ ? TargetTransformInfo::OK_UniformConstantValue :
01056                                TargetTransformInfo::OK_NonUniformConstantValue;
01057         } else {
01058           // Check for a splat of a constant or for a non uniform vector
01059           // of constants.
01060           Value *IOp = I->getOperand(1);
01061           Value *JOp = J->getOperand(1);
01062           if ((isa<ConstantVector>(IOp) || isa<ConstantDataVector>(IOp)) &&
01063               (isa<ConstantVector>(JOp) || isa<ConstantDataVector>(JOp))) {
01064             Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
01065             Constant *SplatValue = cast<Constant>(IOp)->getSplatValue();
01066             if (SplatValue != nullptr &&
01067                 SplatValue == cast<Constant>(JOp)->getSplatValue())
01068               Op2VK = TargetTransformInfo::OK_UniformConstantValue;
01069           }
01070         }
01071       }
01072 
01073       // Note that this procedure is incorrect for insert and extract element
01074       // instructions (because combining these often results in a shuffle),
01075       // but this cost is ignored (because insert and extract element
01076       // instructions are assigned a zero depth factor and are not really
01077       // fused in general).
01078       unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK);
01079 
01080       if (VCost > ICost + JCost)
01081         return false;
01082 
01083       // We don't want to fuse to a type that will be split, even
01084       // if the two input types will also be split and there is no other
01085       // associated cost.
01086       unsigned VParts1 = TTI->getNumberOfParts(VT1),
01087                VParts2 = TTI->getNumberOfParts(VT2);
01088       if (VParts1 > 1 || VParts2 > 1)
01089         return false;
01090       else if ((!VParts1 || !VParts2) && VCost == ICost + JCost)
01091         return false;
01092 
01093       CostSavings = ICost + JCost - VCost;
01094     }
01095 
01096     // The powi,ctlz,cttz intrinsics are special because only the first
01097     // argument is vectorized, the second arguments must be equal.
01098     CallInst *CI = dyn_cast<CallInst>(I);
01099     Function *FI;
01100     if (CI && (FI = CI->getCalledFunction())) {
01101       Intrinsic::ID IID = (Intrinsic::ID) FI->getIntrinsicID();
01102       if (IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
01103           IID == Intrinsic::cttz) {
01104         Value *A1I = CI->getArgOperand(1),
01105               *A1J = cast<CallInst>(J)->getArgOperand(1);
01106         const SCEV *A1ISCEV = SE->getSCEV(A1I),
01107                    *A1JSCEV = SE->getSCEV(A1J);
01108         return (A1ISCEV == A1JSCEV);
01109       }
01110 
01111       if (IID && TTI) {
01112         SmallVector<Type*, 4> Tys;
01113         for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
01114           Tys.push_back(CI->getArgOperand(i)->getType());
01115         unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys);
01116 
01117         Tys.clear();
01118         CallInst *CJ = cast<CallInst>(J);
01119         for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i)
01120           Tys.push_back(CJ->getArgOperand(i)->getType());
01121         unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys);
01122 
01123         Tys.clear();
01124         assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
01125                "Intrinsic argument counts differ");
01126         for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
01127           if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
01128                IID == Intrinsic::cttz) && i == 1)
01129             Tys.push_back(CI->getArgOperand(i)->getType());
01130           else
01131             Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(),
01132                                             CJ->getArgOperand(i)->getType()));
01133         }
01134 
01135         Type *RetTy = getVecTypeForPair(IT1, JT1);
01136         unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys);
01137 
01138         if (VCost > ICost + JCost)
01139           return false;
01140 
01141         // We don't want to fuse to a type that will be split, even
01142         // if the two input types will also be split and there is no other
01143         // associated cost.
01144         unsigned RetParts = TTI->getNumberOfParts(RetTy);
01145         if (RetParts > 1)
01146           return false;
01147         else if (!RetParts && VCost == ICost + JCost)
01148           return false;
01149 
01150         for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
01151           if (!Tys[i]->isVectorTy())
01152             continue;
01153 
01154           unsigned NumParts = TTI->getNumberOfParts(Tys[i]);
01155           if (NumParts > 1)
01156             return false;
01157           else if (!NumParts && VCost == ICost + JCost)
01158             return false;
01159         }
01160 
01161         CostSavings = ICost + JCost - VCost;
01162       }
01163     }
01164 
01165     return true;
01166   }
01167 
01168   // Figure out whether or not J uses I and update the users and write-set
01169   // structures associated with I. Specifically, Users represents the set of
01170   // instructions that depend on I. WriteSet represents the set
01171   // of memory locations that are dependent on I. If UpdateUsers is true,
01172   // and J uses I, then Users is updated to contain J and WriteSet is updated
01173   // to contain any memory locations to which J writes. The function returns
01174   // true if J uses I. By default, alias analysis is used to determine
01175   // whether J reads from memory that overlaps with a location in WriteSet.
01176   // If LoadMoveSet is not null, then it is a previously-computed map
01177   // where the key is the memory-based user instruction and the value is
01178   // the instruction to be compared with I. So, if LoadMoveSet is provided,
01179   // then the alias analysis is not used. This is necessary because this
01180   // function is called during the process of moving instructions during
01181   // vectorization and the results of the alias analysis are not stable during
01182   // that process.
01183   bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users,
01184                        AliasSetTracker &WriteSet, Instruction *I,
01185                        Instruction *J, bool UpdateUsers,
01186                        DenseSet<ValuePair> *LoadMoveSetPairs) {
01187     bool UsesI = false;
01188 
01189     // This instruction may already be marked as a user due, for example, to
01190     // being a member of a selected pair.
01191     if (Users.count(J))
01192       UsesI = true;
01193 
01194     if (!UsesI)
01195       for (User::op_iterator JU = J->op_begin(), JE = J->op_end();
01196            JU != JE; ++JU) {
01197         Value *V = *JU;
01198         if (I == V || Users.count(V)) {
01199           UsesI = true;
01200           break;
01201         }
01202       }
01203     if (!UsesI && J->mayReadFromMemory()) {
01204       if (LoadMoveSetPairs) {
01205         UsesI = LoadMoveSetPairs->count(ValuePair(J, I));
01206       } else {
01207         for (AliasSetTracker::iterator W = WriteSet.begin(),
01208              WE = WriteSet.end(); W != WE; ++W) {
01209           if (W->aliasesUnknownInst(J, *AA)) {
01210             UsesI = true;
01211             break;
01212           }
01213         }
01214       }
01215     }
01216 
01217     if (UsesI && UpdateUsers) {
01218       if (J->mayWriteToMemory()) WriteSet.add(J);
01219       Users.insert(J);
01220     }
01221 
01222     return UsesI;
01223   }
01224 
01225   // This function iterates over all instruction pairs in the provided
01226   // basic block and collects all candidate pairs for vectorization.
01227   bool BBVectorize::getCandidatePairs(BasicBlock &BB,
01228                        BasicBlock::iterator &Start,
01229                        DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
01230                        DenseSet<ValuePair> &FixedOrderPairs,
01231                        DenseMap<ValuePair, int> &CandidatePairCostSavings,
01232                        std::vector<Value *> &PairableInsts, bool NonPow2Len) {
01233     size_t TotalPairs = 0;
01234     BasicBlock::iterator E = BB.end();
01235     if (Start == E) return false;
01236 
01237     bool ShouldContinue = false, IAfterStart = false;
01238     for (BasicBlock::iterator I = Start++; I != E; ++I) {
01239       if (I == Start) IAfterStart = true;
01240 
01241       bool IsSimpleLoadStore;
01242       if (!isInstVectorizable(I, IsSimpleLoadStore)) continue;
01243 
01244       // Look for an instruction with which to pair instruction *I...
01245       DenseSet<Value *> Users;
01246       AliasSetTracker WriteSet(*AA);
01247       if (I->mayWriteToMemory()) WriteSet.add(I);
01248 
01249       bool JAfterStart = IAfterStart;
01250       BasicBlock::iterator J = std::next(I);
01251       for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) {
01252         if (J == Start) JAfterStart = true;
01253 
01254         // Determine if J uses I, if so, exit the loop.
01255         bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep);
01256         if (Config.FastDep) {
01257           // Note: For this heuristic to be effective, independent operations
01258           // must tend to be intermixed. This is likely to be true from some
01259           // kinds of grouped loop unrolling (but not the generic LLVM pass),
01260           // but otherwise may require some kind of reordering pass.
01261 
01262           // When using fast dependency analysis,
01263           // stop searching after first use:
01264           if (UsesI) break;
01265         } else {
01266           if (UsesI) continue;
01267         }
01268 
01269         // J does not use I, and comes before the first use of I, so it can be
01270         // merged with I if the instructions are compatible.
01271         int CostSavings, FixedOrder;
01272         if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len,
01273             CostSavings, FixedOrder)) continue;
01274 
01275         // J is a candidate for merging with I.
01276         if (PairableInsts.empty() ||
01277              PairableInsts[PairableInsts.size()-1] != I) {
01278           PairableInsts.push_back(I);
01279         }
01280 
01281         CandidatePairs[I].push_back(J);
01282         ++TotalPairs;
01283         if (TTI)
01284           CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J),
01285                                                             CostSavings));
01286 
01287         if (FixedOrder == 1)
01288           FixedOrderPairs.insert(ValuePair(I, J));
01289         else if (FixedOrder == -1)
01290           FixedOrderPairs.insert(ValuePair(J, I));
01291 
01292         // The next call to this function must start after the last instruction
01293         // selected during this invocation.
01294         if (JAfterStart) {
01295           Start = std::next(J);
01296           IAfterStart = JAfterStart = false;
01297         }
01298 
01299         DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "
01300                      << *I << " <-> " << *J << " (cost savings: " <<
01301                      CostSavings << ")\n");
01302 
01303         // If we have already found too many pairs, break here and this function
01304         // will be called again starting after the last instruction selected
01305         // during this invocation.
01306         if (PairableInsts.size() >= Config.MaxInsts ||
01307             TotalPairs >= Config.MaxPairs) {
01308           ShouldContinue = true;
01309           break;
01310         }
01311       }
01312 
01313       if (ShouldContinue)
01314         break;
01315     }
01316 
01317     DEBUG(dbgs() << "BBV: found " << PairableInsts.size()
01318            << " instructions with candidate pairs\n");
01319 
01320     return ShouldContinue;
01321   }
01322 
01323   // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that
01324   // it looks for pairs such that both members have an input which is an
01325   // output of PI or PJ.
01326   void BBVectorize::computePairsConnectedTo(
01327                   DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
01328                   DenseSet<ValuePair> &CandidatePairsSet,
01329                   std::vector<Value *> &PairableInsts,
01330                   DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
01331                   DenseMap<VPPair, unsigned> &PairConnectionTypes,
01332                   ValuePair P) {
01333     StoreInst *SI, *SJ;
01334 
01335     // For each possible pairing for this variable, look at the uses of
01336     // the first value...
01337     for (Value::user_iterator I = P.first->user_begin(),
01338                               E = P.first->user_end();
01339          I != E; ++I) {
01340       User *UI = *I;
01341       if (isa<LoadInst>(UI)) {
01342         // A pair cannot be connected to a load because the load only takes one
01343         // operand (the address) and it is a scalar even after vectorization.
01344         continue;
01345       } else if ((SI = dyn_cast<StoreInst>(UI)) &&
01346                  P.first == SI->getPointerOperand()) {
01347         // Similarly, a pair cannot be connected to a store through its
01348         // pointer operand.
01349         continue;
01350       }
01351 
01352       // For each use of the first variable, look for uses of the second
01353       // variable...
01354       for (User *UJ : P.second->users()) {
01355         if ((SJ = dyn_cast<StoreInst>(UJ)) &&
01356             P.second == SJ->getPointerOperand())
01357           continue;
01358 
01359         // Look for <I, J>:
01360         if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
01361           VPPair VP(P, ValuePair(UI, UJ));
01362           ConnectedPairs[VP.first].push_back(VP.second);
01363           PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect));
01364         }
01365 
01366         // Look for <J, I>:
01367         if (CandidatePairsSet.count(ValuePair(UJ, UI))) {
01368           VPPair VP(P, ValuePair(UJ, UI));
01369           ConnectedPairs[VP.first].push_back(VP.second);
01370           PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap));
01371         }
01372       }
01373 
01374       if (Config.SplatBreaksChain) continue;
01375       // Look for cases where just the first value in the pair is used by
01376       // both members of another pair (splatting).
01377       for (Value::user_iterator J = P.first->user_begin(); J != E; ++J) {
01378         User *UJ = *J;
01379         if ((SJ = dyn_cast<StoreInst>(UJ)) &&
01380             P.first == SJ->getPointerOperand())
01381           continue;
01382 
01383         if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
01384           VPPair VP(P, ValuePair(UI, UJ));
01385           ConnectedPairs[VP.first].push_back(VP.second);
01386           PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
01387         }
01388       }
01389     }
01390 
01391     if (Config.SplatBreaksChain) return;
01392     // Look for cases where just the second value in the pair is used by
01393     // both members of another pair (splatting).
01394     for (Value::user_iterator I = P.second->user_begin(),
01395                               E = P.second->user_end();
01396          I != E; ++I) {
01397       User *UI = *I;
01398       if (isa<LoadInst>(UI))
01399         continue;
01400       else if ((SI = dyn_cast<StoreInst>(UI)) &&
01401                P.second == SI->getPointerOperand())
01402         continue;
01403 
01404       for (Value::user_iterator J = P.second->user_begin(); J != E; ++J) {
01405         User *UJ = *J;
01406         if ((SJ = dyn_cast<StoreInst>(UJ)) &&
01407             P.second == SJ->getPointerOperand())
01408           continue;
01409 
01410         if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
01411           VPPair VP(P, ValuePair(UI, UJ));
01412           ConnectedPairs[VP.first].push_back(VP.second);
01413           PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
01414         }
01415       }
01416     }
01417   }
01418 
01419   // This function figures out which pairs are connected.  Two pairs are
01420   // connected if some output of the first pair forms an input to both members
01421   // of the second pair.
01422   void BBVectorize::computeConnectedPairs(
01423                   DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
01424                   DenseSet<ValuePair> &CandidatePairsSet,
01425                   std::vector<Value *> &PairableInsts,
01426                   DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
01427                   DenseMap<VPPair, unsigned> &PairConnectionTypes) {
01428     for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
01429          PE = PairableInsts.end(); PI != PE; ++PI) {
01430       DenseMap<Value *, std::vector<Value *> >::iterator PP =
01431         CandidatePairs.find(*PI);
01432       if (PP == CandidatePairs.end())
01433         continue;
01434 
01435       for (std::vector<Value *>::iterator P = PP->second.begin(),
01436            E = PP->second.end(); P != E; ++P)
01437         computePairsConnectedTo(CandidatePairs, CandidatePairsSet,
01438                                 PairableInsts, ConnectedPairs,
01439                                 PairConnectionTypes, ValuePair(*PI, *P));
01440     }
01441 
01442     DEBUG(size_t TotalPairs = 0;
01443           for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I =
01444                ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I)
01445             TotalPairs += I->second.size();
01446           dbgs() << "BBV: found " << TotalPairs
01447                  << " pair connections.\n");
01448   }
01449 
01450   // This function builds a set of use tuples such that <A, B> is in the set
01451   // if B is in the use dag of A. If B is in the use dag of A, then B
01452   // depends on the output of A.
01453   void BBVectorize::buildDepMap(
01454                       BasicBlock &BB,
01455                       DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
01456                       std::vector<Value *> &PairableInsts,
01457                       DenseSet<ValuePair> &PairableInstUsers) {
01458     DenseSet<Value *> IsInPair;
01459     for (DenseMap<Value *, std::vector<Value *> >::iterator C =
01460          CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) {
01461       IsInPair.insert(C->first);
01462       IsInPair.insert(C->second.begin(), C->second.end());
01463     }
01464 
01465     // Iterate through the basic block, recording all users of each
01466     // pairable instruction.
01467 
01468     BasicBlock::iterator E = BB.end(), EL =
01469       BasicBlock::iterator(cast<Instruction>(PairableInsts.back()));
01470     for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
01471       if (IsInPair.find(I) == IsInPair.end()) continue;
01472 
01473       DenseSet<Value *> Users;
01474       AliasSetTracker WriteSet(*AA);
01475       if (I->mayWriteToMemory()) WriteSet.add(I);
01476 
01477       for (BasicBlock::iterator J = std::next(I); J != E; ++J) {
01478         (void) trackUsesOfI(Users, WriteSet, I, J);
01479 
01480         if (J == EL)
01481           break;
01482       }
01483 
01484       for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end();
01485            U != E; ++U) {
01486         if (IsInPair.find(*U) == IsInPair.end()) continue;
01487         PairableInstUsers.insert(ValuePair(I, *U));
01488       }
01489 
01490       if (I == EL)
01491         break;
01492     }
01493   }
01494 
01495   // Returns true if an input to pair P is an output of pair Q and also an
01496   // input of pair Q is an output of pair P. If this is the case, then these
01497   // two pairs cannot be simultaneously fused.
01498   bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q,
01499              DenseSet<ValuePair> &PairableInstUsers,
01500              DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap,
01501              DenseSet<VPPair> *PairableInstUserPairSet) {
01502     // Two pairs are in conflict if they are mutual Users of eachother.
01503     bool QUsesP = PairableInstUsers.count(ValuePair(P.first,  Q.first))  ||
01504                   PairableInstUsers.count(ValuePair(P.first,  Q.second)) ||
01505                   PairableInstUsers.count(ValuePair(P.second, Q.first))  ||
01506                   PairableInstUsers.count(ValuePair(P.second, Q.second));
01507     bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first,  P.first))  ||
01508                   PairableInstUsers.count(ValuePair(Q.first,  P.second)) ||
01509                   PairableInstUsers.count(ValuePair(Q.second, P.first))  ||
01510                   PairableInstUsers.count(ValuePair(Q.second, P.second));
01511     if (PairableInstUserMap) {
01512       // FIXME: The expensive part of the cycle check is not so much the cycle
01513       // check itself but this edge insertion procedure. This needs some
01514       // profiling and probably a different data structure.
01515       if (PUsesQ) {
01516         if (PairableInstUserPairSet->insert(VPPair(Q, P)).second)
01517           (*PairableInstUserMap)[Q].push_back(P);
01518       }
01519       if (QUsesP) {
01520         if (PairableInstUserPairSet->insert(VPPair(P, Q)).second)
01521           (*PairableInstUserMap)[P].push_back(Q);
01522       }
01523     }
01524 
01525     return (QUsesP && PUsesQ);
01526   }
01527 
01528   // This function walks the use graph of current pairs to see if, starting
01529   // from P, the walk returns to P.
01530   bool BBVectorize::pairWillFormCycle(ValuePair P,
01531              DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
01532              DenseSet<ValuePair> &CurrentPairs) {
01533     DEBUG(if (DebugCycleCheck)
01534             dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> "
01535                    << *P.second << "\n");
01536     // A lookup table of visisted pairs is kept because the PairableInstUserMap
01537     // contains non-direct associations.
01538     DenseSet<ValuePair> Visited;
01539     SmallVector<ValuePair, 32> Q;
01540     // General depth-first post-order traversal:
01541     Q.push_back(P);
01542     do {
01543       ValuePair QTop = Q.pop_back_val();
01544       Visited.insert(QTop);
01545 
01546       DEBUG(if (DebugCycleCheck)
01547               dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> "
01548                      << *QTop.second << "\n");
01549       DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
01550         PairableInstUserMap.find(QTop);
01551       if (QQ == PairableInstUserMap.end())
01552         continue;
01553 
01554       for (std::vector<ValuePair>::iterator C = QQ->second.begin(),
01555            CE = QQ->second.end(); C != CE; ++C) {
01556         if (*C == P) {
01557           DEBUG(dbgs()
01558                  << "BBV: rejected to prevent non-trivial cycle formation: "
01559                  << QTop.first << " <-> " << C->second << "\n");
01560           return true;
01561         }
01562 
01563         if (CurrentPairs.count(*C) && !Visited.count(*C))
01564           Q.push_back(*C);
01565       }
01566     } while (!Q.empty());
01567 
01568     return false;
01569   }
01570 
01571   // This function builds the initial dag of connected pairs with the
01572   // pair J at the root.
01573   void BBVectorize::buildInitialDAGFor(
01574                   DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
01575                   DenseSet<ValuePair> &CandidatePairsSet,
01576                   std::vector<Value *> &PairableInsts,
01577                   DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
01578                   DenseSet<ValuePair> &PairableInstUsers,
01579                   DenseMap<Value *, Value *> &ChosenPairs,
01580                   DenseMap<ValuePair, size_t> &DAG, ValuePair J) {
01581     // Each of these pairs is viewed as the root node of a DAG. The DAG
01582     // is then walked (depth-first). As this happens, we keep track of
01583     // the pairs that compose the DAG and the maximum depth of the DAG.
01584     SmallVector<ValuePairWithDepth, 32> Q;
01585     // General depth-first post-order traversal:
01586     Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
01587     do {
01588       ValuePairWithDepth QTop = Q.back();
01589 
01590       // Push each child onto the queue:
01591       bool MoreChildren = false;
01592       size_t MaxChildDepth = QTop.second;
01593       DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
01594         ConnectedPairs.find(QTop.first);
01595       if (QQ != ConnectedPairs.end())
01596         for (std::vector<ValuePair>::iterator k = QQ->second.begin(),
01597              ke = QQ->second.end(); k != ke; ++k) {
01598           // Make sure that this child pair is still a candidate:
01599           if (CandidatePairsSet.count(*k)) {
01600             DenseMap<ValuePair, size_t>::iterator C = DAG.find(*k);
01601             if (C == DAG.end()) {
01602               size_t d = getDepthFactor(k->first);
01603               Q.push_back(ValuePairWithDepth(*k, QTop.second+d));
01604               MoreChildren = true;
01605             } else {
01606               MaxChildDepth = std::max(MaxChildDepth, C->second);
01607             }
01608           }
01609         }
01610 
01611       if (!MoreChildren) {
01612         // Record the current pair as part of the DAG:
01613         DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth));
01614         Q.pop_back();
01615       }
01616     } while (!Q.empty());
01617   }
01618 
01619   // Given some initial dag, prune it by removing conflicting pairs (pairs
01620   // that cannot be simultaneously chosen for vectorization).
01621   void BBVectorize::pruneDAGFor(
01622               DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
01623               std::vector<Value *> &PairableInsts,
01624               DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
01625               DenseSet<ValuePair> &PairableInstUsers,
01626               DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
01627               DenseSet<VPPair> &PairableInstUserPairSet,
01628               DenseMap<Value *, Value *> &ChosenPairs,
01629               DenseMap<ValuePair, size_t> &DAG,
01630               DenseSet<ValuePair> &PrunedDAG, ValuePair J,
01631               bool UseCycleCheck) {
01632     SmallVector<ValuePairWithDepth, 32> Q;
01633     // General depth-first post-order traversal:
01634     Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
01635     do {
01636       ValuePairWithDepth QTop = Q.pop_back_val();
01637       PrunedDAG.insert(QTop.first);
01638 
01639       // Visit each child, pruning as necessary...
01640       SmallVector<ValuePairWithDepth, 8> BestChildren;
01641       DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
01642         ConnectedPairs.find(QTop.first);
01643       if (QQ == ConnectedPairs.end())
01644         continue;
01645 
01646       for (std::vector<ValuePair>::iterator K = QQ->second.begin(),
01647            KE = QQ->second.end(); K != KE; ++K) {
01648         DenseMap<ValuePair, size_t>::iterator C = DAG.find(*K);
01649         if (C == DAG.end()) continue;
01650 
01651         // This child is in the DAG, now we need to make sure it is the
01652         // best of any conflicting children. There could be multiple
01653         // conflicting children, so first, determine if we're keeping
01654         // this child, then delete conflicting children as necessary.
01655 
01656         // It is also necessary to guard against pairing-induced
01657         // dependencies. Consider instructions a .. x .. y .. b
01658         // such that (a,b) are to be fused and (x,y) are to be fused
01659         // but a is an input to x and b is an output from y. This
01660         // means that y cannot be moved after b but x must be moved
01661         // after b for (a,b) to be fused. In other words, after
01662         // fusing (a,b) we have y .. a/b .. x where y is an input
01663         // to a/b and x is an output to a/b: x and y can no longer
01664         // be legally fused. To prevent this condition, we must
01665         // make sure that a child pair added to the DAG is not
01666         // both an input and output of an already-selected pair.
01667 
01668         // Pairing-induced dependencies can also form from more complicated
01669         // cycles. The pair vs. pair conflicts are easy to check, and so
01670         // that is done explicitly for "fast rejection", and because for
01671         // child vs. child conflicts, we may prefer to keep the current
01672         // pair in preference to the already-selected child.
01673         DenseSet<ValuePair> CurrentPairs;
01674 
01675         bool CanAdd = true;
01676         for (SmallVectorImpl<ValuePairWithDepth>::iterator C2
01677               = BestChildren.begin(), E2 = BestChildren.end();
01678              C2 != E2; ++C2) {
01679           if (C2->first.first == C->first.first ||
01680               C2->first.first == C->first.second ||
01681               C2->first.second == C->first.first ||
01682               C2->first.second == C->first.second ||
01683               pairsConflict(C2->first, C->first, PairableInstUsers,
01684                             UseCycleCheck ? &PairableInstUserMap : nullptr,
01685                             UseCycleCheck ? &PairableInstUserPairSet
01686                                           : nullptr)) {
01687             if (C2->second >= C->second) {
01688               CanAdd = false;
01689               break;
01690             }
01691 
01692             CurrentPairs.insert(C2->first);
01693           }
01694         }
01695         if (!CanAdd) continue;
01696 
01697         // Even worse, this child could conflict with another node already
01698         // selected for the DAG. If that is the case, ignore this child.
01699         for (DenseSet<ValuePair>::iterator T = PrunedDAG.begin(),
01700              E2 = PrunedDAG.end(); T != E2; ++T) {
01701           if (T->first == C->first.first ||
01702               T->first == C->first.second ||
01703               T->second == C->first.first ||
01704               T->second == C->first.second ||
01705               pairsConflict(*T, C->first, PairableInstUsers,
01706                             UseCycleCheck ? &PairableInstUserMap : nullptr,
01707                             UseCycleCheck ? &PairableInstUserPairSet
01708                                           : nullptr)) {
01709             CanAdd = false;
01710             break;
01711           }
01712 
01713           CurrentPairs.insert(*T);
01714         }
01715         if (!CanAdd) continue;
01716 
01717         // And check the queue too...
01718         for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = Q.begin(),
01719              E2 = Q.end(); C2 != E2; ++C2) {
01720           if (C2->first.first == C->first.first ||
01721               C2->first.first == C->first.second ||
01722               C2->first.second == C->first.first ||
01723               C2->first.second == C->first.second ||
01724               pairsConflict(C2->first, C->first, PairableInstUsers,
01725                             UseCycleCheck ? &PairableInstUserMap : nullptr,
01726                             UseCycleCheck ? &PairableInstUserPairSet
01727                                           : nullptr)) {
01728             CanAdd = false;
01729             break;
01730           }
01731 
01732           CurrentPairs.insert(C2->first);
01733         }
01734         if (!CanAdd) continue;
01735 
01736         // Last but not least, check for a conflict with any of the
01737         // already-chosen pairs.
01738         for (DenseMap<Value *, Value *>::iterator C2 =
01739               ChosenPairs.begin(), E2 = ChosenPairs.end();
01740              C2 != E2; ++C2) {
01741           if (pairsConflict(*C2, C->first, PairableInstUsers,
01742                             UseCycleCheck ? &PairableInstUserMap : nullptr,
01743                             UseCycleCheck ? &PairableInstUserPairSet
01744                                           : nullptr)) {
01745             CanAdd = false;
01746             break;
01747           }
01748 
01749           CurrentPairs.insert(*C2);
01750         }
01751         if (!CanAdd) continue;
01752 
01753         // To check for non-trivial cycles formed by the addition of the
01754         // current pair we've formed a list of all relevant pairs, now use a
01755         // graph walk to check for a cycle. We start from the current pair and
01756         // walk the use dag to see if we again reach the current pair. If we
01757         // do, then the current pair is rejected.
01758 
01759         // FIXME: It may be more efficient to use a topological-ordering
01760         // algorithm to improve the cycle check. This should be investigated.
01761         if (UseCycleCheck &&
01762             pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs))
01763           continue;
01764 
01765         // This child can be added, but we may have chosen it in preference
01766         // to an already-selected child. Check for this here, and if a
01767         // conflict is found, then remove the previously-selected child
01768         // before adding this one in its place.
01769         for (SmallVectorImpl<ValuePairWithDepth>::iterator C2
01770               = BestChildren.begin(); C2 != BestChildren.end();) {
01771           if (C2->first.first == C->first.first ||
01772               C2->first.first == C->first.second ||
01773               C2->first.second == C->first.first ||
01774               C2->first.second == C->first.second ||
01775               pairsConflict(C2->first, C->first, PairableInstUsers))
01776             C2 = BestChildren.erase(C2);
01777           else
01778             ++C2;
01779         }
01780 
01781         BestChildren.push_back(ValuePairWithDepth(C->first, C->second));
01782       }
01783 
01784       for (SmallVectorImpl<ValuePairWithDepth>::iterator C
01785             = BestChildren.begin(), E2 = BestChildren.end();
01786            C != E2; ++C) {
01787         size_t DepthF = getDepthFactor(C->first.first);
01788         Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF));
01789       }
01790     } while (!Q.empty());
01791   }
01792 
01793   // This function finds the best dag of mututally-compatible connected
01794   // pairs, given the choice of root pairs as an iterator range.
01795   void BBVectorize::findBestDAGFor(
01796               DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
01797               DenseSet<ValuePair> &CandidatePairsSet,
01798               DenseMap<ValuePair, int> &CandidatePairCostSavings,
01799               std::vector<Value *> &PairableInsts,
01800               DenseSet<ValuePair> &FixedOrderPairs,
01801               DenseMap<VPPair, unsigned> &PairConnectionTypes,
01802               DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
01803               DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
01804               DenseSet<ValuePair> &PairableInstUsers,
01805               DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
01806               DenseSet<VPPair> &PairableInstUserPairSet,
01807               DenseMap<Value *, Value *> &ChosenPairs,
01808               DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
01809               int &BestEffSize, Value *II, std::vector<Value *>&JJ,
01810               bool UseCycleCheck) {
01811     for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end();
01812          J != JE; ++J) {
01813       ValuePair IJ(II, *J);
01814       if (!CandidatePairsSet.count(IJ))
01815         continue;
01816 
01817       // Before going any further, make sure that this pair does not
01818       // conflict with any already-selected pairs (see comment below
01819       // near the DAG pruning for more details).
01820       DenseSet<ValuePair> ChosenPairSet;
01821       bool DoesConflict = false;
01822       for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(),
01823            E = ChosenPairs.end(); C != E; ++C) {
01824         if (pairsConflict(*C, IJ, PairableInstUsers,
01825                           UseCycleCheck ? &PairableInstUserMap : nullptr,
01826                           UseCycleCheck ? &PairableInstUserPairSet : nullptr)) {
01827           DoesConflict = true;
01828           break;
01829         }
01830 
01831         ChosenPairSet.insert(*C);
01832       }
01833       if (DoesConflict) continue;
01834 
01835       if (UseCycleCheck &&
01836           pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet))
01837         continue;
01838 
01839       DenseMap<ValuePair, size_t> DAG;
01840       buildInitialDAGFor(CandidatePairs, CandidatePairsSet,
01841                           PairableInsts, ConnectedPairs,
01842                           PairableInstUsers, ChosenPairs, DAG, IJ);
01843 
01844       // Because we'll keep the child with the largest depth, the largest
01845       // depth is still the same in the unpruned DAG.
01846       size_t MaxDepth = DAG.lookup(IJ);
01847 
01848       DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {"
01849                    << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
01850                    MaxDepth << " and size " << DAG.size() << "\n");
01851 
01852       // At this point the DAG has been constructed, but, may contain
01853       // contradictory children (meaning that different children of
01854       // some dag node may be attempting to fuse the same instruction).
01855       // So now we walk the dag again, in the case of a conflict,
01856       // keep only the child with the largest depth. To break a tie,
01857       // favor the first child.
01858 
01859       DenseSet<ValuePair> PrunedDAG;
01860       pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs,
01861                    PairableInstUsers, PairableInstUserMap,
01862                    PairableInstUserPairSet,
01863                    ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck);
01864 
01865       int EffSize = 0;
01866       if (TTI) {
01867         DenseSet<Value *> PrunedDAGInstrs;
01868         for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
01869              E = PrunedDAG.end(); S != E; ++S) {
01870           PrunedDAGInstrs.insert(S->first);
01871           PrunedDAGInstrs.insert(S->second);
01872         }
01873 
01874         // The set of pairs that have already contributed to the total cost.
01875         DenseSet<ValuePair> IncomingPairs;
01876 
01877         // If the cost model were perfect, this might not be necessary; but we
01878         // need to make sure that we don't get stuck vectorizing our own
01879         // shuffle chains.
01880         bool HasNontrivialInsts = false;
01881 
01882         // The node weights represent the cost savings associated with
01883         // fusing the pair of instructions.
01884         for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
01885              E = PrunedDAG.end(); S != E; ++S) {
01886           if (!isa<ShuffleVectorInst>(S->first) &&
01887               !isa<InsertElementInst>(S->first) &&
01888               !isa<ExtractElementInst>(S->first))
01889             HasNontrivialInsts = true;
01890 
01891           bool FlipOrder = false;
01892 
01893           if (getDepthFactor(S->first)) {
01894             int ESContrib = CandidatePairCostSavings.find(*S)->second;
01895             DEBUG(if (DebugPairSelection) dbgs() << "\tweight {"
01896                    << *S->first << " <-> " << *S->second << "} = " <<
01897                    ESContrib << "\n");
01898             EffSize += ESContrib;
01899           }
01900 
01901           // The edge weights contribute in a negative sense: they represent
01902           // the cost of shuffles.
01903           DenseMap<ValuePair, std::vector<ValuePair> >::iterator SS =
01904             ConnectedPairDeps.find(*S);
01905           if (SS != ConnectedPairDeps.end()) {
01906             unsigned NumDepsDirect = 0, NumDepsSwap = 0;
01907             for (std::vector<ValuePair>::iterator T = SS->second.begin(),
01908                  TE = SS->second.end(); T != TE; ++T) {
01909               VPPair Q(*S, *T);
01910               if (!PrunedDAG.count(Q.second))
01911                 continue;
01912               DenseMap<VPPair, unsigned>::iterator R =
01913                 PairConnectionTypes.find(VPPair(Q.second, Q.first));
01914               assert(R != PairConnectionTypes.end() &&
01915                      "Cannot find pair connection type");
01916               if (R->second == PairConnectionDirect)
01917                 ++NumDepsDirect;
01918               else if (R->second == PairConnectionSwap)
01919                 ++NumDepsSwap;
01920             }
01921 
01922             // If there are more swaps than direct connections, then
01923             // the pair order will be flipped during fusion. So the real
01924             // number of swaps is the minimum number.
01925             FlipOrder = !FixedOrderPairs.count(*S) &&
01926               ((NumDepsSwap > NumDepsDirect) ||
01927                 FixedOrderPairs.count(ValuePair(S->second, S->first)));
01928 
01929             for (std::vector<ValuePair>::iterator T = SS->second.begin(),
01930                  TE = SS->second.end(); T != TE; ++T) {
01931               VPPair Q(*S, *T);
01932               if (!PrunedDAG.count(Q.second))
01933                 continue;
01934               DenseMap<VPPair, unsigned>::iterator R =
01935                 PairConnectionTypes.find(VPPair(Q.second, Q.first));
01936               assert(R != PairConnectionTypes.end() &&
01937                      "Cannot find pair connection type");
01938               Type *Ty1 = Q.second.first->getType(),
01939                    *Ty2 = Q.second.second->getType();
01940               Type *VTy = getVecTypeForPair(Ty1, Ty2);
01941               if ((R->second == PairConnectionDirect && FlipOrder) ||
01942                   (R->second == PairConnectionSwap && !FlipOrder)  ||
01943                   R->second == PairConnectionSplat) {
01944                 int ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
01945                                                    VTy, VTy);
01946 
01947                 if (VTy->getVectorNumElements() == 2) {
01948                   if (R->second == PairConnectionSplat)
01949                     ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
01950                       TargetTransformInfo::SK_Broadcast, VTy));
01951                   else
01952                     ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
01953                       TargetTransformInfo::SK_Reverse, VTy));
01954                 }
01955 
01956                 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
01957                   *Q.second.first << " <-> " << *Q.second.second <<
01958                     "} -> {" <<
01959                   *S->first << " <-> " << *S->second << "} = " <<
01960                    ESContrib << "\n");
01961                 EffSize -= ESContrib;
01962               }
01963             }
01964           }
01965 
01966           // Compute the cost of outgoing edges. We assume that edges outgoing
01967           // to shuffles, inserts or extracts can be merged, and so contribute
01968           // no additional cost.
01969           if (!S->first->getType()->isVoidTy()) {
01970             Type *Ty1 = S->first->getType(),
01971                  *Ty2 = S->second->getType();
01972             Type *VTy = getVecTypeForPair(Ty1, Ty2);
01973 
01974             bool NeedsExtraction = false;
01975             for (User *U : S->first->users()) {
01976               if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) {
01977                 // Shuffle can be folded if it has no other input
01978                 if (isa<UndefValue>(SI->getOperand(1)))
01979                   continue;
01980               }
01981               if (isa<ExtractElementInst>(U))
01982                 continue;
01983               if (PrunedDAGInstrs.count(U))
01984                 continue;
01985               NeedsExtraction = true;
01986               break;
01987             }
01988 
01989             if (NeedsExtraction) {
01990               int ESContrib;
01991               if (Ty1->isVectorTy()) {
01992                 ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
01993                                                Ty1, VTy);
01994                 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
01995                   TargetTransformInfo::SK_ExtractSubvector, VTy, 0, Ty1));
01996               } else
01997                 ESContrib = (int) TTI->getVectorInstrCost(
01998                                     Instruction::ExtractElement, VTy, 0);
01999 
02000               DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
02001                 *S->first << "} = " << ESContrib << "\n");
02002               EffSize -= ESContrib;
02003             }
02004 
02005             NeedsExtraction = false;
02006             for (User *U : S->second->users()) {
02007               if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) {
02008                 // Shuffle can be folded if it has no other input
02009                 if (isa<UndefValue>(SI->getOperand(1)))
02010                   continue;
02011               }
02012               if (isa<ExtractElementInst>(U))
02013                 continue;
02014               if (PrunedDAGInstrs.count(U))
02015                 continue;
02016               NeedsExtraction = true;
02017               break;
02018             }
02019 
02020             if (NeedsExtraction) {
02021               int ESContrib;
02022               if (Ty2->isVectorTy()) {
02023                 ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
02024                                                Ty2, VTy);
02025                 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
02026                   TargetTransformInfo::SK_ExtractSubvector, VTy,
02027                   Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2));
02028               } else
02029                 ESContrib = (int) TTI->getVectorInstrCost(
02030                                     Instruction::ExtractElement, VTy, 1);
02031               DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
02032                 *S->second << "} = " << ESContrib << "\n");
02033               EffSize -= ESContrib;
02034             }
02035           }
02036 
02037           // Compute the cost of incoming edges.
02038           if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) {
02039             Instruction *S1 = cast<Instruction>(S->first),
02040                         *S2 = cast<Instruction>(S->second);
02041             for (unsigned o = 0; o < S1->getNumOperands(); ++o) {
02042               Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o);
02043 
02044               // Combining constants into vector constants (or small vector
02045               // constants into larger ones are assumed free).
02046               if (isa<Constant>(O1) && isa<Constant>(O2))
02047                 continue;
02048 
02049               if (FlipOrder)
02050                 std::swap(O1, O2);
02051 
02052               ValuePair VP  = ValuePair(O1, O2);
02053               ValuePair VPR = ValuePair(O2, O1);
02054 
02055               // Internal edges are not handled here.
02056               if (PrunedDAG.count(VP) || PrunedDAG.count(VPR))
02057                 continue;
02058 
02059               Type *Ty1 = O1->getType(),
02060                    *Ty2 = O2->getType();
02061               Type *VTy = getVecTypeForPair(Ty1, Ty2);
02062 
02063               // Combining vector operations of the same type is also assumed
02064               // folded with other operations.
02065               if (Ty1 == Ty2) {
02066                 // If both are insert elements, then both can be widened.
02067                 InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1),
02068                                   *IEO2 = dyn_cast<InsertElementInst>(O2);
02069                 if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2))
02070                   continue;
02071                 // If both are extract elements, and both have the same input
02072                 // type, then they can be replaced with a shuffle
02073                 ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1),
02074                                    *EIO2 = dyn_cast<ExtractElementInst>(O2);
02075                 if (EIO1 && EIO2 &&
02076                     EIO1->getOperand(0)->getType() ==
02077                       EIO2->getOperand(0)->getType())
02078                   continue;
02079                 // If both are a shuffle with equal operand types and only two
02080                 // unqiue operands, then they can be replaced with a single
02081                 // shuffle
02082                 ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1),
02083                                   *SIO2 = dyn_cast<ShuffleVectorInst>(O2);
02084                 if (SIO1 && SIO2 &&
02085                     SIO1->getOperand(0)->getType() ==
02086                       SIO2->getOperand(0)->getType()) {
02087                   SmallSet<Value *, 4> SIOps;
02088                   SIOps.insert(SIO1->getOperand(0));
02089                   SIOps.insert(SIO1->getOperand(1));
02090                   SIOps.insert(SIO2->getOperand(0));
02091                   SIOps.insert(SIO2->getOperand(1));
02092                   if (SIOps.size() <= 2)
02093                     continue;
02094                 }
02095               }
02096 
02097               int ESContrib;
02098               // This pair has already been formed.
02099               if (IncomingPairs.count(VP)) {
02100                 continue;
02101               } else if (IncomingPairs.count(VPR)) {
02102                 ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
02103                                                VTy, VTy);
02104 
02105                 if (VTy->getVectorNumElements() == 2)
02106                   ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
02107                     TargetTransformInfo::SK_Reverse, VTy));
02108               } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) {
02109                 ESContrib = (int) TTI->getVectorInstrCost(
02110                                     Instruction::InsertElement, VTy, 0);
02111                 ESContrib += (int) TTI->getVectorInstrCost(
02112                                      Instruction::InsertElement, VTy, 1);
02113               } else if (!Ty1->isVectorTy()) {
02114                 // O1 needs to be inserted into a vector of size O2, and then
02115                 // both need to be shuffled together.
02116                 ESContrib = (int) TTI->getVectorInstrCost(
02117                                     Instruction::InsertElement, Ty2, 0);
02118                 ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
02119                                                 VTy, Ty2);
02120               } else if (!Ty2->isVectorTy()) {
02121                 // O2 needs to be inserted into a vector of size O1, and then
02122                 // both need to be shuffled together.
02123                 ESContrib = (int) TTI->getVectorInstrCost(
02124                                     Instruction::InsertElement, Ty1, 0);
02125                 ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
02126                                                 VTy, Ty1);
02127               } else {
02128                 Type *TyBig = Ty1, *TySmall = Ty2;
02129                 if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements())
02130                   std::swap(TyBig, TySmall);
02131 
02132                 ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
02133                                                VTy, TyBig);
02134                 if (TyBig != TySmall)
02135                   ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
02136                                                   TyBig, TySmall);
02137               }
02138 
02139               DEBUG(if (DebugPairSelection) dbgs() << "\tcost {"
02140                      << *O1 << " <-> " << *O2 << "} = " <<
02141                      ESContrib << "\n");
02142               EffSize -= ESContrib;
02143               IncomingPairs.insert(VP);
02144             }
02145           }
02146         }
02147 
02148         if (!HasNontrivialInsts) {
02149           DEBUG(if (DebugPairSelection) dbgs() <<
02150                 "\tNo non-trivial instructions in DAG;"
02151                 " override to zero effective size\n");
02152           EffSize = 0;
02153         }
02154       } else {
02155         for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
02156              E = PrunedDAG.end(); S != E; ++S)
02157           EffSize += (int) getDepthFactor(S->first);
02158       }
02159 
02160       DEBUG(if (DebugPairSelection)
02161              dbgs() << "BBV: found pruned DAG for pair {"
02162              << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
02163              MaxDepth << " and size " << PrunedDAG.size() <<
02164             " (effective size: " << EffSize << ")\n");
02165       if (((TTI && !UseChainDepthWithTI) ||
02166             MaxDepth >= Config.ReqChainDepth) &&
02167           EffSize > 0 && EffSize > BestEffSize) {
02168         BestMaxDepth = MaxDepth;
02169         BestEffSize = EffSize;
02170         BestDAG = PrunedDAG;
02171       }
02172     }
02173   }
02174 
02175   // Given the list of candidate pairs, this function selects those
02176   // that will be fused into vector instructions.
02177   void BBVectorize::choosePairs(
02178                 DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
02179                 DenseSet<ValuePair> &CandidatePairsSet,
02180                 DenseMap<ValuePair, int> &CandidatePairCostSavings,
02181                 std::vector<Value *> &PairableInsts,
02182                 DenseSet<ValuePair> &FixedOrderPairs,
02183                 DenseMap<VPPair, unsigned> &PairConnectionTypes,
02184                 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
02185                 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
02186                 DenseSet<ValuePair> &PairableInstUsers,
02187                 DenseMap<Value *, Value *>& ChosenPairs) {
02188     bool UseCycleCheck =
02189      CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck;
02190 
02191     DenseMap<Value *, std::vector<Value *> > CandidatePairs2;
02192     for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(),
02193          E = CandidatePairsSet.end(); I != E; ++I) {
02194       std::vector<Value *> &JJ = CandidatePairs2[I->second];
02195       if (JJ.empty()) JJ.reserve(32);
02196       JJ.push_back(I->first);
02197     }
02198 
02199     DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap;
02200     DenseSet<VPPair> PairableInstUserPairSet;
02201     for (std::vector<Value *>::iterator I = PairableInsts.begin(),
02202          E = PairableInsts.end(); I != E; ++I) {
02203       // The number of possible pairings for this variable:
02204       size_t NumChoices = CandidatePairs.lookup(*I).size();
02205       if (!NumChoices) continue;
02206 
02207       std::vector<Value *> &JJ = CandidatePairs[*I];
02208 
02209       // The best pair to choose and its dag:
02210       size_t BestMaxDepth = 0;
02211       int BestEffSize = 0;
02212       DenseSet<ValuePair> BestDAG;
02213       findBestDAGFor(CandidatePairs, CandidatePairsSet,
02214                       CandidatePairCostSavings,
02215                       PairableInsts, FixedOrderPairs, PairConnectionTypes,
02216                       ConnectedPairs, ConnectedPairDeps,
02217                       PairableInstUsers, PairableInstUserMap,
02218                       PairableInstUserPairSet, ChosenPairs,
02219                       BestDAG, BestMaxDepth, BestEffSize, *I, JJ,
02220                       UseCycleCheck);
02221 
02222       if (BestDAG.empty())
02223         continue;
02224 
02225       // A dag has been chosen (or not) at this point. If no dag was
02226       // chosen, then this instruction, I, cannot be paired (and is no longer
02227       // considered).
02228 
02229       DEBUG(dbgs() << "BBV: selected pairs in the best DAG for: "
02230                    << *cast<Instruction>(*I) << "\n");
02231 
02232       for (DenseSet<ValuePair>::iterator S = BestDAG.begin(),
02233            SE2 = BestDAG.end(); S != SE2; ++S) {
02234         // Insert the members of this dag into the list of chosen pairs.
02235         ChosenPairs.insert(ValuePair(S->first, S->second));
02236         DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " <<
02237                *S->second << "\n");
02238 
02239         // Remove all candidate pairs that have values in the chosen dag.
02240         std::vector<Value *> &KK = CandidatePairs[S->first];
02241         for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end();
02242              K != KE; ++K) {
02243           if (*K == S->second)
02244             continue;
02245 
02246           CandidatePairsSet.erase(ValuePair(S->first, *K));
02247         }
02248 
02249         std::vector<Value *> &LL = CandidatePairs2[S->second];
02250         for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end();
02251              L != LE; ++L) {
02252           if (*L == S->first)
02253             continue;
02254 
02255           CandidatePairsSet.erase(ValuePair(*L, S->second));
02256         }
02257 
02258         std::vector<Value *> &MM = CandidatePairs[S->second];
02259         for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end();
02260              M != ME; ++M) {
02261           assert(*M != S->first && "Flipped pair in candidate list?");
02262           CandidatePairsSet.erase(ValuePair(S->second, *M));
02263         }
02264 
02265         std::vector<Value *> &NN = CandidatePairs2[S->first];
02266         for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end();
02267              N != NE; ++N) {
02268           assert(*N != S->second && "Flipped pair in candidate list?");
02269           CandidatePairsSet.erase(ValuePair(*N, S->first));
02270         }
02271       }
02272     }
02273 
02274     DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n");
02275   }
02276 
02277   std::string getReplacementName(Instruction *I, bool IsInput, unsigned o,
02278                      unsigned n = 0) {
02279     if (!I->hasName())
02280       return "";
02281 
02282     return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) +
02283              (n > 0 ? "." + utostr(n) : "")).str();
02284   }
02285 
02286   // Returns the value that is to be used as the pointer input to the vector
02287   // instruction that fuses I with J.
02288   Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context,
02289                      Instruction *I, Instruction *J, unsigned o) {
02290     Value *IPtr, *JPtr;
02291     unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
02292     int64_t OffsetInElmts;
02293 
02294     // Note: the analysis might fail here, that is why the pair order has
02295     // been precomputed (OffsetInElmts must be unused here).
02296     (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
02297                           IAddressSpace, JAddressSpace,
02298                           OffsetInElmts, false);
02299 
02300     // The pointer value is taken to be the one with the lowest offset.
02301     Value *VPtr = IPtr;
02302 
02303     Type *ArgTypeI = IPtr->getType()->getPointerElementType();
02304     Type *ArgTypeJ = JPtr->getType()->getPointerElementType();
02305     Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
02306     Type *VArgPtrType
02307       = PointerType::get(VArgType,
02308                          IPtr->getType()->getPointerAddressSpace());
02309     return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
02310                         /* insert before */ I);
02311   }
02312 
02313   void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
02314                      unsigned MaskOffset, unsigned NumInElem,
02315                      unsigned NumInElem1, unsigned IdxOffset,
02316                      std::vector<Constant*> &Mask) {
02317     unsigned NumElem1 = J->getType()->getVectorNumElements();
02318     for (unsigned v = 0; v < NumElem1; ++v) {
02319       int m = cast<ShuffleVectorInst>(J)->getMaskValue(v);
02320       if (m < 0) {
02321         Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context));
02322       } else {
02323         unsigned mm = m + (int) IdxOffset;
02324         if (m >= (int) NumInElem1)
02325           mm += (int) NumInElem;
02326 
02327         Mask[v+MaskOffset] =
02328           ConstantInt::get(Type::getInt32Ty(Context), mm);
02329       }
02330     }
02331   }
02332 
02333   // Returns the value that is to be used as the vector-shuffle mask to the
02334   // vector instruction that fuses I with J.
02335   Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context,
02336                      Instruction *I, Instruction *J) {
02337     // This is the shuffle mask. We need to append the second
02338     // mask to the first, and the numbers need to be adjusted.
02339 
02340     Type *ArgTypeI = I->getType();
02341     Type *ArgTypeJ = J->getType();
02342     Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
02343 
02344     unsigned NumElemI = ArgTypeI->getVectorNumElements();
02345 
02346     // Get the total number of elements in the fused vector type.
02347     // By definition, this must equal the number of elements in
02348     // the final mask.
02349     unsigned NumElem = VArgType->getVectorNumElements();
02350     std::vector<Constant*> Mask(NumElem);
02351 
02352     Type *OpTypeI = I->getOperand(0)->getType();
02353     unsigned NumInElemI = OpTypeI->getVectorNumElements();
02354     Type *OpTypeJ = J->getOperand(0)->getType();
02355     unsigned NumInElemJ = OpTypeJ->getVectorNumElements();
02356 
02357     // The fused vector will be:
02358     // -----------------------------------------------------
02359     // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ |
02360     // -----------------------------------------------------
02361     // from which we'll extract NumElem total elements (where the first NumElemI
02362     // of them come from the mask in I and the remainder come from the mask
02363     // in J.
02364 
02365     // For the mask from the first pair...
02366     fillNewShuffleMask(Context, I, 0,        NumInElemJ, NumInElemI,
02367                        0,          Mask);
02368 
02369     // For the mask from the second pair...
02370     fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ,
02371                        NumInElemI, Mask);
02372 
02373     return ConstantVector::get(Mask);
02374   }
02375 
02376   bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I,
02377                                   Instruction *J, unsigned o, Value *&LOp,
02378                                   unsigned numElemL,
02379                                   Type *ArgTypeL, Type *ArgTypeH,
02380                                   bool IBeforeJ, unsigned IdxOff) {
02381     bool ExpandedIEChain = false;
02382     if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) {
02383       // If we have a pure insertelement chain, then this can be rewritten
02384       // into a chain that directly builds the larger type.
02385       if (isPureIEChain(LIE)) {
02386         SmallVector<Value *, 8> VectElemts(numElemL,
02387           UndefValue::get(ArgTypeL->getScalarType()));
02388         InsertElementInst *LIENext = LIE;
02389         do {
02390           unsigned Idx =
02391             cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue();
02392           VectElemts[Idx] = LIENext->getOperand(1);
02393         } while ((LIENext =
02394                    dyn_cast<InsertElementInst>(LIENext->getOperand(0))));
02395 
02396         LIENext = nullptr;
02397         Value *LIEPrev = UndefValue::get(ArgTypeH);
02398         for (unsigned i = 0; i < numElemL; ++i) {
02399           if (isa<UndefValue>(VectElemts[i])) continue;
02400           LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i],
02401                              ConstantInt::get(Type::getInt32Ty(Context),
02402                                               i + IdxOff),
02403                              getReplacementName(IBeforeJ ? I : J,
02404                                                 true, o, i+1));
02405           LIENext->insertBefore(IBeforeJ ? J : I);
02406           LIEPrev = LIENext;
02407         }
02408 
02409         LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH);
02410         ExpandedIEChain = true;
02411       }
02412     }
02413 
02414     return ExpandedIEChain;
02415   }
02416 
02417   static unsigned getNumScalarElements(Type *Ty) {
02418     if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
02419       return VecTy->getNumElements();
02420     return 1;
02421   }
02422 
02423   // Returns the value to be used as the specified operand of the vector
02424   // instruction that fuses I with J.
02425   Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
02426                      Instruction *J, unsigned o, bool IBeforeJ) {
02427     Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
02428     Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
02429 
02430     // Compute the fused vector type for this operand
02431     Type *ArgTypeI = I->getOperand(o)->getType();
02432     Type *ArgTypeJ = J->getOperand(o)->getType();
02433     VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
02434 
02435     Instruction *L = I, *H = J;
02436     Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ;
02437 
02438     unsigned numElemL = getNumScalarElements(ArgTypeL);
02439     unsigned numElemH = getNumScalarElements(ArgTypeH);
02440 
02441     Value *LOp = L->getOperand(o);
02442     Value *HOp = H->getOperand(o);
02443     unsigned numElem = VArgType->getNumElements();
02444 
02445     // First, we check if we can reuse the "original" vector outputs (if these
02446     // exist). We might need a shuffle.
02447     ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp);
02448     ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp);
02449     ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp);
02450     ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp);
02451 
02452     // FIXME: If we're fusing shuffle instructions, then we can't apply this
02453     // optimization. The input vectors to the shuffle might be a different
02454     // length from the shuffle outputs. Unfortunately, the replacement
02455     // shuffle mask has already been formed, and the mask entries are sensitive
02456     // to the sizes of the inputs.
02457     bool IsSizeChangeShuffle =
02458       isa<ShuffleVectorInst>(L) &&
02459         (LOp->getType() != L->getType() || HOp->getType() != H->getType());
02460 
02461     if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) {
02462       // We can have at most two unique vector inputs.
02463       bool CanUseInputs = true;
02464       Value *I1, *I2 = nullptr;
02465       if (LEE) {
02466         I1 = LEE->getOperand(0);
02467       } else {
02468         I1 = LSV->getOperand(0);
02469         I2 = LSV->getOperand(1);
02470         if (I2 == I1 || isa<UndefValue>(I2))
02471           I2 = nullptr;
02472       }
02473   
02474       if (HEE) {
02475         Value *I3 = HEE->getOperand(0);
02476         if (!I2 && I3 != I1)
02477           I2 = I3;
02478         else if (I3 != I1 && I3 != I2)
02479           CanUseInputs = false;
02480       } else {
02481         Value *I3 = HSV->getOperand(0);
02482         if (!I2 && I3 != I1)
02483           I2 = I3;
02484         else if (I3 != I1 && I3 != I2)
02485           CanUseInputs = false;
02486 
02487         if (CanUseInputs) {
02488           Value *I4 = HSV->getOperand(1);
02489           if (!isa<UndefValue>(I4)) {
02490             if (!I2 && I4 != I1)
02491               I2 = I4;
02492             else if (I4 != I1 && I4 != I2)
02493               CanUseInputs = false;
02494           }
02495         }
02496       }
02497 
02498       if (CanUseInputs) {
02499         unsigned LOpElem =
02500           cast<Instruction>(LOp)->getOperand(0)->getType()
02501             ->getVectorNumElements();
02502 
02503         unsigned HOpElem =
02504           cast<Instruction>(HOp)->getOperand(0)->getType()
02505             ->getVectorNumElements();
02506 
02507         // We have one or two input vectors. We need to map each index of the
02508         // operands to the index of the original vector.
02509         SmallVector<std::pair<int, int>, 8>  II(numElem);
02510         for (unsigned i = 0; i < numElemL; ++i) {
02511           int Idx, INum;
02512           if (LEE) {
02513             Idx =
02514               cast<ConstantInt>(LEE->getOperand(1))->getSExtValue();
02515             INum = LEE->getOperand(0) == I1 ? 0 : 1;
02516           } else {
02517             Idx = LSV->getMaskValue(i);
02518             if (Idx < (int) LOpElem) {
02519               INum = LSV->getOperand(0) == I1 ? 0 : 1;
02520             } else {
02521               Idx -= LOpElem;
02522               INum = LSV->getOperand(1) == I1 ? 0 : 1;
02523             }
02524           }
02525 
02526           II[i] = std::pair<int, int>(Idx, INum);
02527         }
02528         for (unsigned i = 0; i < numElemH; ++i) {
02529           int Idx, INum;
02530           if (HEE) {
02531             Idx =
02532               cast<ConstantInt>(HEE->getOperand(1))->getSExtValue();
02533             INum = HEE->getOperand(0) == I1 ? 0 : 1;
02534           } else {
02535             Idx = HSV->getMaskValue(i);
02536             if (Idx < (int) HOpElem) {
02537               INum = HSV->getOperand(0) == I1 ? 0 : 1;
02538             } else {
02539               Idx -= HOpElem;
02540               INum = HSV->getOperand(1) == I1 ? 0 : 1;
02541             }
02542           }
02543 
02544           II[i + numElemL] = std::pair<int, int>(Idx, INum);
02545         }
02546 
02547         // We now have an array which tells us from which index of which
02548         // input vector each element of the operand comes.
02549         VectorType *I1T = cast<VectorType>(I1->getType());
02550         unsigned I1Elem = I1T->getNumElements();
02551 
02552         if (!I2) {
02553           // In this case there is only one underlying vector input. Check for
02554           // the trivial case where we can use the input directly.
02555           if (I1Elem == numElem) {
02556             bool ElemInOrder = true;
02557             for (unsigned i = 0; i < numElem; ++i) {
02558               if (II[i].first != (int) i && II[i].first != -1) {
02559                 ElemInOrder = false;
02560                 break;
02561               }
02562             }
02563 
02564             if (ElemInOrder)
02565               return I1;
02566           }
02567 
02568           // A shuffle is needed.
02569           std::vector<Constant *> Mask(numElem);
02570           for (unsigned i = 0; i < numElem; ++i) {
02571             int Idx = II[i].first;
02572             if (Idx == -1)
02573               Mask[i] = UndefValue::get(Type::getInt32Ty(Context));
02574             else
02575               Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
02576           }
02577 
02578           Instruction *S =
02579             new ShuffleVectorInst(I1, UndefValue::get(I1T),
02580                                   ConstantVector::get(Mask),
02581                                   getReplacementName(IBeforeJ ? I : J,
02582                                                      true, o));
02583           S->insertBefore(IBeforeJ ? J : I);
02584           return S;
02585         }
02586 
02587         VectorType *I2T = cast<VectorType>(I2->getType());
02588         unsigned I2Elem = I2T->getNumElements();
02589 
02590         // This input comes from two distinct vectors. The first step is to
02591         // make sure that both vectors are the same length. If not, the
02592         // smaller one will need to grow before they can be shuffled together.
02593         if (I1Elem < I2Elem) {
02594           std::vector<Constant *> Mask(I2Elem);
02595           unsigned v = 0;
02596           for (; v < I1Elem; ++v)
02597             Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
02598           for (; v < I2Elem; ++v)
02599             Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
02600 
02601           Instruction *NewI1 =
02602             new ShuffleVectorInst(I1, UndefValue::get(I1T),
02603                                   ConstantVector::get(Mask),
02604                                   getReplacementName(IBeforeJ ? I : J,
02605                                                      true, o, 1));
02606           NewI1->insertBefore(IBeforeJ ? J : I);
02607           I1 = NewI1;
02608           I1Elem = I2Elem;
02609         } else if (I1Elem > I2Elem) {
02610           std::vector<Constant *> Mask(I1Elem);
02611           unsigned v = 0;
02612           for (; v < I2Elem; ++v)
02613             Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
02614           for (; v < I1Elem; ++v)
02615             Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
02616 
02617           Instruction *NewI2 =
02618             new ShuffleVectorInst(I2, UndefValue::get(I2T),
02619                                   ConstantVector::get(Mask),
02620                                   getReplacementName(IBeforeJ ? I : J,
02621                                                      true, o, 1));
02622           NewI2->insertBefore(IBeforeJ ? J : I);
02623           I2 = NewI2;
02624         }
02625 
02626         // Now that both I1 and I2 are the same length we can shuffle them
02627         // together (and use the result).
02628         std::vector<Constant *> Mask(numElem);
02629         for (unsigned v = 0; v < numElem; ++v) {
02630           if (II[v].first == -1) {
02631             Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
02632           } else {
02633             int Idx = II[v].first + II[v].second * I1Elem;
02634             Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
02635           }
02636         }
02637 
02638         Instruction *NewOp =
02639           new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask),
02640                                 getReplacementName(IBeforeJ ? I : J, true, o));
02641         NewOp->insertBefore(IBeforeJ ? J : I);
02642         return NewOp;
02643       }
02644     }
02645 
02646     Type *ArgType = ArgTypeL;
02647     if (numElemL < numElemH) {
02648       if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH,
02649                                          ArgTypeL, VArgType, IBeforeJ, 1)) {
02650         // This is another short-circuit case: we're combining a scalar into
02651         // a vector that is formed by an IE chain. We've just expanded the IE
02652         // chain, now insert the scalar and we're done.
02653 
02654         Instruction *S = InsertElementInst::Create(HOp, LOp, CV0,
02655                            getReplacementName(IBeforeJ ? I : J, true, o));
02656         S->insertBefore(IBeforeJ ? J : I);
02657         return S;
02658       } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL,
02659                                 ArgTypeH, IBeforeJ)) {
02660         // The two vector inputs to the shuffle must be the same length,
02661         // so extend the smaller vector to be the same length as the larger one.
02662         Instruction *NLOp;
02663         if (numElemL > 1) {
02664   
02665           std::vector<Constant *> Mask(numElemH);
02666           unsigned v = 0;
02667           for (; v < numElemL; ++v)
02668             Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
02669           for (; v < numElemH; ++v)
02670             Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
02671     
02672           NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
02673                                        ConstantVector::get(Mask),
02674                                        getReplacementName(IBeforeJ ? I : J,
02675                                                           true, o, 1));
02676         } else {
02677           NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0,
02678                                            getReplacementName(IBeforeJ ? I : J,
02679                                                               true, o, 1));
02680         }
02681   
02682         NLOp->insertBefore(IBeforeJ ? J : I);
02683         LOp = NLOp;
02684       }
02685 
02686       ArgType = ArgTypeH;
02687     } else if (numElemL > numElemH) {
02688       if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL,
02689                                          ArgTypeH, VArgType, IBeforeJ)) {
02690         Instruction *S =
02691           InsertElementInst::Create(LOp, HOp, 
02692                                     ConstantInt::get(Type::getInt32Ty(Context),
02693                                                      numElemL),
02694                                     getReplacementName(IBeforeJ ? I : J,
02695                                                        true, o));
02696         S->insertBefore(IBeforeJ ? J : I);
02697         return S;
02698       } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH,
02699                                 ArgTypeL, IBeforeJ)) {
02700         Instruction *NHOp;
02701         if (numElemH > 1) {
02702           std::vector<Constant *> Mask(numElemL);
02703           unsigned v = 0;
02704           for (; v < numElemH; ++v)
02705             Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
02706           for (; v < numElemL; ++v)
02707             Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
02708     
02709           NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
02710                                        ConstantVector::get(Mask),
02711                                        getReplacementName(IBeforeJ ? I : J,
02712                                                           true, o, 1));
02713         } else {
02714           NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0,
02715                                            getReplacementName(IBeforeJ ? I : J,
02716                                                               true, o, 1));
02717         }
02718 
02719         NHOp->insertBefore(IBeforeJ ? J : I);
02720         HOp = NHOp;
02721       }
02722     }
02723 
02724     if (ArgType->isVectorTy()) {
02725       unsigned numElem = VArgType->getVectorNumElements();
02726       std::vector<Constant*> Mask(numElem);
02727       for (unsigned v = 0; v < numElem; ++v) {
02728         unsigned Idx = v;
02729         // If the low vector was expanded, we need to skip the extra
02730         // undefined entries.
02731         if (v >= numElemL && numElemH > numElemL)
02732           Idx += (numElemH - numElemL);
02733         Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
02734       }
02735 
02736       Instruction *BV = new ShuffleVectorInst(LOp, HOp,
02737                           ConstantVector::get(Mask),
02738                           getReplacementName(IBeforeJ ? I : J, true, o));
02739       BV->insertBefore(IBeforeJ ? J : I);
02740       return BV;
02741     }
02742 
02743     Instruction *BV1 = InsertElementInst::Create(
02744                                           UndefValue::get(VArgType), LOp, CV0,
02745                                           getReplacementName(IBeforeJ ? I : J,
02746                                                              true, o, 1));
02747     BV1->insertBefore(IBeforeJ ? J : I);
02748     Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1,
02749                                           getReplacementName(IBeforeJ ? I : J,
02750                                                              true, o, 2));
02751     BV2->insertBefore(IBeforeJ ? J : I);
02752     return BV2;
02753   }
02754 
02755   // This function creates an array of values that will be used as the inputs
02756   // to the vector instruction that fuses I with J.
02757   void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
02758                      Instruction *I, Instruction *J,
02759                      SmallVectorImpl<Value *> &ReplacedOperands,
02760                      bool IBeforeJ) {
02761     unsigned NumOperands = I->getNumOperands();
02762 
02763     for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) {
02764       // Iterate backward so that we look at the store pointer
02765       // first and know whether or not we need to flip the inputs.
02766 
02767       if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) {
02768         // This is the pointer for a load/store instruction.
02769         ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o);
02770         continue;
02771       } else if (isa<CallInst>(I)) {
02772         Function *F = cast<CallInst>(I)->getCalledFunction();
02773         Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID();
02774         if (o == NumOperands-1) {
02775           BasicBlock &BB = *I->getParent();
02776 
02777           Module *M = BB.getParent()->getParent();
02778           Type *ArgTypeI = I->getType();
02779           Type *ArgTypeJ = J->getType();
02780           Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
02781 
02782           ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType);
02783           continue;
02784         } else if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
02785                     IID == Intrinsic::cttz) && o == 1) {
02786           // The second argument of powi/ctlz/cttz is a single integer/constant
02787           // and we've already checked that both arguments are equal.
02788           // As a result, we just keep I's second argument.
02789           ReplacedOperands[o] = I->getOperand(o);
02790           continue;
02791         }
02792       } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) {
02793         ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J);
02794         continue;
02795       }
02796 
02797       ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ);
02798     }
02799   }
02800 
02801   // This function creates two values that represent the outputs of the
02802   // original I and J instructions. These are generally vector shuffles
02803   // or extracts. In many cases, these will end up being unused and, thus,
02804   // eliminated by later passes.
02805   void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
02806                      Instruction *J, Instruction *K,
02807                      Instruction *&InsertionPt,
02808                      Instruction *&K1, Instruction *&K2) {
02809     if (isa<StoreInst>(I)) {
02810       AA->replaceWithNewValue(I, K);
02811       AA->replaceWithNewValue(J, K);
02812     } else {
02813       Type *IType = I->getType();
02814       Type *JType = J->getType();
02815 
02816       VectorType *VType = getVecTypeForPair(IType, JType);
02817       unsigned numElem = VType->getNumElements();
02818 
02819       unsigned numElemI = getNumScalarElements(IType);
02820       unsigned numElemJ = getNumScalarElements(JType);
02821 
02822       if (IType->isVectorTy()) {
02823         std::vector<Constant*> Mask1(numElemI), Mask2(numElemI);
02824         for (unsigned v = 0; v < numElemI; ++v) {
02825           Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
02826           Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v);
02827         }
02828 
02829         K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
02830                                    ConstantVector::get( Mask1),
02831                                    getReplacementName(K, false, 1));
02832       } else {
02833         Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
02834         K1 = ExtractElementInst::Create(K, CV0,
02835                                           getReplacementName(K, false, 1));
02836       }
02837 
02838       if (JType->isVectorTy()) {
02839         std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ);
02840         for (unsigned v = 0; v < numElemJ; ++v) {
02841           Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
02842           Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v);
02843         }
02844 
02845         K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
02846                                    ConstantVector::get( Mask2),
02847                                    getReplacementName(K, false, 2));
02848       } else {
02849         Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1);
02850         K2 = ExtractElementInst::Create(K, CV1,
02851                                           getReplacementName(K, false, 2));
02852       }
02853 
02854       K1->insertAfter(K);
02855       K2->insertAfter(K1);
02856       InsertionPt = K2;
02857     }
02858   }
02859 
02860   // Move all uses of the function I (including pairing-induced uses) after J.
02861   bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB,
02862                      DenseSet<ValuePair> &LoadMoveSetPairs,
02863                      Instruction *I, Instruction *J) {
02864     // Skip to the first instruction past I.
02865     BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
02866 
02867     DenseSet<Value *> Users;
02868     AliasSetTracker WriteSet(*AA);
02869     if (I->mayWriteToMemory()) WriteSet.add(I);
02870 
02871     for (; cast<Instruction>(L) != J; ++L)
02872       (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs);
02873 
02874     assert(cast<Instruction>(L) == J &&
02875       "Tracking has not proceeded far enough to check for dependencies");
02876     // If J is now in the use set of I, then trackUsesOfI will return true
02877     // and we have a dependency cycle (and the fusing operation must abort).
02878     return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs);
02879   }
02880 
02881   // Move all uses of the function I (including pairing-induced uses) after J.
02882   void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB,
02883                      DenseSet<ValuePair> &LoadMoveSetPairs,
02884                      Instruction *&InsertionPt,
02885                      Instruction *I, Instruction *J) {
02886     // Skip to the first instruction past I.
02887     BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
02888 
02889     DenseSet<Value *> Users;
02890     AliasSetTracker WriteSet(*AA);
02891     if (I->mayWriteToMemory()) WriteSet.add(I);
02892 
02893     for (; cast<Instruction>(L) != J;) {
02894       if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) {
02895         // Move this instruction
02896         Instruction *InstToMove = L; ++L;
02897 
02898         DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<
02899                         " to after " << *InsertionPt << "\n");
02900         InstToMove->removeFromParent();
02901         InstToMove->insertAfter(InsertionPt);
02902         InsertionPt = InstToMove;
02903       } else {
02904         ++L;
02905       }
02906     }
02907   }
02908 
02909   // Collect all load instruction that are in the move set of a given first
02910   // pair member.  These loads depend on the first instruction, I, and so need
02911   // to be moved after J (the second instruction) when the pair is fused.
02912   void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB,
02913                      DenseMap<Value *, Value *> &ChosenPairs,
02914                      DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
02915                      DenseSet<ValuePair> &LoadMoveSetPairs,
02916                      Instruction *I) {
02917     // Skip to the first instruction past I.
02918     BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
02919 
02920     DenseSet<Value *> Users;
02921     AliasSetTracker WriteSet(*AA);
02922     if (I->mayWriteToMemory()) WriteSet.add(I);
02923 
02924     // Note: We cannot end the loop when we reach J because J could be moved
02925     // farther down the use chain by another instruction pairing. Also, J
02926     // could be before I if this is an inverted input.
02927     for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) {
02928       if (trackUsesOfI(Users, WriteSet, I, L)) {
02929         if (L->mayReadFromMemory()) {
02930           LoadMoveSet[L].push_back(I);
02931           LoadMoveSetPairs.insert(ValuePair(L, I));
02932         }
02933       }
02934     }
02935   }
02936 
02937   // In cases where both load/stores and the computation of their pointers
02938   // are chosen for vectorization, we can end up in a situation where the
02939   // aliasing analysis starts returning different query results as the
02940   // process of fusing instruction pairs continues. Because the algorithm
02941   // relies on finding the same use dags here as were found earlier, we'll
02942   // need to precompute the necessary aliasing information here and then
02943   // manually update it during the fusion process.
02944   void BBVectorize::collectLoadMoveSet(BasicBlock &BB,
02945                      std::vector<Value *> &PairableInsts,
02946                      DenseMap<Value *, Value *> &ChosenPairs,
02947                      DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
02948                      DenseSet<ValuePair> &LoadMoveSetPairs) {
02949     for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
02950          PIE = PairableInsts.end(); PI != PIE; ++PI) {
02951       DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI);
02952       if (P == ChosenPairs.end()) continue;
02953 
02954       Instruction *I = cast<Instruction>(P->first);
02955       collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet,
02956                              LoadMoveSetPairs, I);
02957     }
02958   }
02959 
02960   // This function fuses the chosen instruction pairs into vector instructions,
02961   // taking care preserve any needed scalar outputs and, then, it reorders the
02962   // remaining instructions as needed (users of the first member of the pair
02963   // need to be moved to after the location of the second member of the pair
02964   // because the vector instruction is inserted in the location of the pair's
02965   // second member).
02966   void BBVectorize::fuseChosenPairs(BasicBlock &BB,
02967              std::vector<Value *> &PairableInsts,
02968              DenseMap<Value *, Value *> &ChosenPairs,
02969              DenseSet<ValuePair> &FixedOrderPairs,
02970              DenseMap<VPPair, unsigned> &PairConnectionTypes,
02971              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
02972              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) {
02973     LLVMContext& Context = BB.getContext();
02974 
02975     // During the vectorization process, the order of the pairs to be fused
02976     // could be flipped. So we'll add each pair, flipped, into the ChosenPairs
02977     // list. After a pair is fused, the flipped pair is removed from the list.
02978     DenseSet<ValuePair> FlippedPairs;
02979     for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(),
02980          E = ChosenPairs.end(); P != E; ++P)
02981       FlippedPairs.insert(ValuePair(P->second, P->first));
02982     for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(),
02983          E = FlippedPairs.end(); P != E; ++P)
02984       ChosenPairs.insert(*P);
02985 
02986     DenseMap<Value *, std::vector<Value *> > LoadMoveSet;
02987     DenseSet<ValuePair> LoadMoveSetPairs;
02988     collectLoadMoveSet(BB, PairableInsts, ChosenPairs,
02989                        LoadMoveSet, LoadMoveSetPairs);
02990 
02991     DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
02992 
02993     for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
02994       DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI);
02995       if (P == ChosenPairs.end()) {
02996         ++PI;
02997         continue;
02998       }
02999 
03000       if (getDepthFactor(P->first) == 0) {
03001         // These instructions are not really fused, but are tracked as though
03002         // they are. Any case in which it would be interesting to fuse them
03003         // will be taken care of by InstCombine.
03004         --NumFusedOps;
03005         ++PI;
03006         continue;
03007       }
03008 
03009       Instruction *I = cast<Instruction>(P->first),
03010         *J = cast<Instruction>(P->second);
03011 
03012       DEBUG(dbgs() << "BBV: fusing: " << *I <<
03013              " <-> " << *J << "\n");
03014 
03015       // Remove the pair and flipped pair from the list.
03016       DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second);
03017       assert(FP != ChosenPairs.end() && "Flipped pair not found in list");
03018       ChosenPairs.erase(FP);
03019       ChosenPairs.erase(P);
03020 
03021       if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) {
03022         DEBUG(dbgs() << "BBV: fusion of: " << *I <<
03023                " <-> " << *J <<
03024                " aborted because of non-trivial dependency cycle\n");
03025         --NumFusedOps;
03026         ++PI;
03027         continue;
03028       }
03029 
03030       // If the pair must have the other order, then flip it.
03031       bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I));
03032       if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) {
03033         // This pair does not have a fixed order, and so we might want to
03034         // flip it if that will yield fewer shuffles. We count the number
03035         // of dependencies connected via swaps, and those directly connected,
03036         // and flip the order if the number of swaps is greater.
03037         bool OrigOrder = true;
03038         DenseMap<ValuePair, std::vector<ValuePair> >::iterator IJ =
03039           ConnectedPairDeps.find(ValuePair(I, J));
03040         if (IJ == ConnectedPairDeps.end()) {
03041           IJ = ConnectedPairDeps.find(ValuePair(J, I));
03042           OrigOrder = false;
03043         }
03044 
03045         if (IJ != ConnectedPairDeps.end()) {
03046           unsigned NumDepsDirect = 0, NumDepsSwap = 0;
03047           for (std::vector<ValuePair>::iterator T = IJ->second.begin(),
03048                TE = IJ->second.end(); T != TE; ++T) {
03049             VPPair Q(IJ->first, *T);
03050             DenseMap<VPPair, unsigned>::iterator R =
03051               PairConnectionTypes.find(VPPair(Q.second, Q.first));
03052             assert(R != PairConnectionTypes.end() &&
03053                    "Cannot find pair connection type");
03054             if (R->second == PairConnectionDirect)
03055               ++NumDepsDirect;
03056             else if (R->second == PairConnectionSwap)
03057               ++NumDepsSwap;
03058           }
03059 
03060           if (!OrigOrder)
03061             std::swap(NumDepsDirect, NumDepsSwap);
03062 
03063           if (NumDepsSwap > NumDepsDirect) {
03064             FlipPairOrder = true;
03065             DEBUG(dbgs() << "BBV: reordering pair: " << *I <<
03066                             " <-> " << *J << "\n");
03067           }
03068         }
03069       }
03070 
03071       Instruction *L = I, *H = J;
03072       if (FlipPairOrder)
03073         std::swap(H, L);
03074 
03075       // If the pair being fused uses the opposite order from that in the pair
03076       // connection map, then we need to flip the types.
03077       DenseMap<ValuePair, std::vector<ValuePair> >::iterator HL =
03078         ConnectedPairs.find(ValuePair(H, L));
03079       if (HL != ConnectedPairs.end())
03080         for (std::vector<ValuePair>::iterator T = HL->second.begin(),
03081              TE = HL->second.end(); T != TE; ++T) {
03082           VPPair Q(HL->first, *T);
03083           DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q);
03084           assert(R != PairConnectionTypes.end() &&
03085                  "Cannot find pair connection type");
03086           if (R->second == PairConnectionDirect)
03087             R->second = PairConnectionSwap;
03088           else if (R->second == PairConnectionSwap)
03089             R->second = PairConnectionDirect;
03090         }
03091 
03092       bool LBeforeH = !FlipPairOrder;
03093       unsigned NumOperands = I->getNumOperands();
03094       SmallVector<Value *, 3> ReplacedOperands(NumOperands);
03095       getReplacementInputsForPair(Context, L, H, ReplacedOperands,
03096                                   LBeforeH);
03097 
03098       // Make a copy of the original operation, change its type to the vector
03099       // type and replace its operands with the vector operands.
03100       Instruction *K = L->clone();
03101       if (L->hasName())
03102         K->takeName(L);
03103       else if (H->hasName())
03104         K->takeName(H);
03105 
03106       if (!isa<StoreInst>(K))
03107         K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
03108 
03109       unsigned KnownIDs[] = {
03110         LLVMContext::MD_tbaa,
03111         LLVMContext::MD_alias_scope,
03112         LLVMContext::MD_noalias,
03113         LLVMContext::MD_fpmath
03114       };
03115       combineMetadata(K, H, KnownIDs);
03116       K->intersectOptionalDataWith(H);
03117 
03118       for (unsigned o = 0; o < NumOperands; ++o)
03119         K->setOperand(o, ReplacedOperands[o]);
03120 
03121       K->insertAfter(J);
03122 
03123       // Instruction insertion point:
03124       Instruction *InsertionPt = K;
03125       Instruction *K1 = nullptr, *K2 = nullptr;
03126       replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2);
03127 
03128       // The use dag of the first original instruction must be moved to after
03129       // the location of the second instruction. The entire use dag of the
03130       // first instruction is disjoint from the input dag of the second
03131       // (by definition), and so commutes with it.
03132 
03133       moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J);
03134 
03135       if (!isa<StoreInst>(I)) {
03136         L->replaceAllUsesWith(K1);
03137         H->replaceAllUsesWith(K2);
03138         AA->replaceWithNewValue(L, K1);
03139         AA->replaceWithNewValue(H, K2);
03140       }
03141 
03142       // Instructions that may read from memory may be in the load move set.
03143       // Once an instruction is fused, we no longer need its move set, and so
03144       // the values of the map never need to be updated. However, when a load
03145       // is fused, we need to merge the entries from both instructions in the
03146       // pair in case those instructions were in the move set of some other
03147       // yet-to-be-fused pair. The loads in question are the keys of the map.
03148       if (I->mayReadFromMemory()) {
03149         std::vector<ValuePair> NewSetMembers;
03150         DenseMap<Value *, std::vector<Value *> >::iterator II =
03151           LoadMoveSet.find(I);
03152         if (II != LoadMoveSet.end())
03153           for (std::vector<Value *>::iterator N = II->second.begin(),
03154                NE = II->second.end(); N != NE; ++N)
03155             NewSetMembers.push_back(ValuePair(K, *N));
03156         DenseMap<Value *, std::vector<Value *> >::iterator JJ =
03157           LoadMoveSet.find(J);
03158         if (JJ != LoadMoveSet.end())
03159           for (std::vector<Value *>::iterator N = JJ->second.begin(),
03160                NE = JJ->second.end(); N != NE; ++N)
03161             NewSetMembers.push_back(ValuePair(K, *N));
03162         for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(),
03163              AE = NewSetMembers.end(); A != AE; ++A) {
03164           LoadMoveSet[A->first].push_back(A->second);
03165           LoadMoveSetPairs.insert(*A);
03166         }
03167       }
03168 
03169       // Before removing I, set the iterator to the next instruction.
03170       PI = std::next(BasicBlock::iterator(I));
03171       if (cast<Instruction>(PI) == J)
03172         ++PI;
03173 
03174       SE->forgetValue(I);
03175       SE->forgetValue(J);
03176       I->eraseFromParent();
03177       J->eraseFromParent();
03178 
03179       DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" <<
03180                                                BB << "\n");
03181     }
03182 
03183     DEBUG(dbgs() << "BBV: final: \n" << BB << "\n");
03184   }
03185 }
03186 
03187 char BBVectorize::ID = 0;
03188 static const char bb_vectorize_name[] = "Basic-Block Vectorization";
03189 INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
03190 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
03191 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
03192 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
03193 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
03194 INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
03195 
03196 BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) {
03197   return new BBVectorize(C);
03198 }
03199 
03200 bool
03201 llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) {
03202   BBVectorize BBVectorizer(P, *BB.getParent(), C);
03203   return BBVectorizer.vectorizeBB(BB);
03204 }
03205 
03206 //===----------------------------------------------------------------------===//
03207 VectorizeConfig::VectorizeConfig() {
03208   VectorBits = ::VectorBits;
03209   VectorizeBools = !::NoBools;
03210   VectorizeInts = !::NoInts;
03211   VectorizeFloats = !::NoFloats;
03212   VectorizePointers = !::NoPointers;
03213   VectorizeCasts = !::NoCasts;
03214   VectorizeMath = !::NoMath;
03215   VectorizeBitManipulations = !::NoBitManipulation;
03216   VectorizeFMA = !::NoFMA;
03217   VectorizeSelect = !::NoSelect;
03218   VectorizeCmp = !::NoCmp;
03219   VectorizeGEP = !::NoGEP;
03220   VectorizeMemOps = !::NoMemOps;
03221   AlignedOnly = ::AlignedOnly;
03222   ReqChainDepth= ::ReqChainDepth;
03223   SearchLimit = ::SearchLimit;
03224   MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck;
03225   SplatBreaksChain = ::SplatBreaksChain;
03226   MaxInsts = ::MaxInsts;
03227   MaxPairs = ::MaxPairs;
03228   MaxIter = ::MaxIter;
03229   Pow2LenOnly = ::Pow2LenOnly;
03230   NoMemOpBoost = ::NoMemOpBoost;
03231   FastDep = ::FastDep;
03232 }