LLVM  3.7.0
BBVectorize.cpp
Go to the documentation of this file.
1 //===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements a basic-block vectorization pass. The algorithm was
11 // inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral,
12 // et al. It works by looking for chains of pairable operations and then
13 // pairing them.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #define BBV_NAME "bb-vectorize"
19 #include "llvm/ADT/DenseMap.h"
20 #include "llvm/ADT/DenseSet.h"
21 #include "llvm/ADT/STLExtras.h"
22 #include "llvm/ADT/SmallSet.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/ADT/Statistic.h"
25 #include "llvm/ADT/StringExtras.h"
32 #include "llvm/IR/Constants.h"
33 #include "llvm/IR/DataLayout.h"
34 #include "llvm/IR/DerivedTypes.h"
35 #include "llvm/IR/Dominators.h"
36 #include "llvm/IR/Function.h"
37 #include "llvm/IR/Instructions.h"
38 #include "llvm/IR/IntrinsicInst.h"
39 #include "llvm/IR/Intrinsics.h"
40 #include "llvm/IR/LLVMContext.h"
41 #include "llvm/IR/Metadata.h"
42 #include "llvm/IR/Module.h"
43 #include "llvm/IR/Type.h"
44 #include "llvm/IR/ValueHandle.h"
45 #include "llvm/Pass.h"
47 #include "llvm/Support/Debug.h"
50 #include <algorithm>
51 using namespace llvm;
52 
53 #define DEBUG_TYPE BBV_NAME
54 
// Hidden command-line options that bound how far the pass searches and how
// large a candidate group may grow. Each option is a file-local cl::opt whose
// default is the value given to cl::init.

// When set, the pass constructors and runOnBasicBlock leave TTI null instead
// of querying TargetTransformInfo.
55 static cl::opt<bool>
56 IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false),
57  cl::Hidden, cl::desc("Ignore target information"));
58 
// Default required chain depth is 6.
59 static cl::opt<unsigned>
60 ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
61  cl::desc("The required chain depth for vectorization"));
62 
63 static cl::opt<bool>
64 UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false),
65  cl::Hidden, cl::desc("Use the chain depth requirement with"
66  " target information"));
67 
// Default search distance is 400.
68 static cl::opt<unsigned>
69 SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
70  cl::desc("The maximum search distance for instruction pairs"));
71 
72 static cl::opt<bool>
73 SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden,
74  cl::desc("Replicating one element to a pair breaks the chain"));
75 
// Default native vector register width is 128 bits.
76 static cl::opt<unsigned>
77 VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden,
78  cl::desc("The size of the native vector registers"));
79 
// NOTE(review): vectorizeBB treats Config.MaxIter == 0 as no limit; presumably
// Config.MaxIter is seeded from this option - confirm in VectorizeConfig.
80 static cl::opt<unsigned>
81 MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
82  cl::desc("The maximum number of pairing iterations"));
83 
84 static cl::opt<bool>
85 Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden,
86  cl::desc("Don't try to form non-2^n-length vectors"));
87 
// Default cap of 500 pairable instructions per group.
88 static cl::opt<unsigned>
89 MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden,
90  cl::desc("The maximum number of pairable instructions per group"));
91 
// Default cap of 3000 candidate pairs per group.
92 static cl::opt<unsigned>
93 MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden,
94  cl::desc("The maximum number of candidate instruction pairs per group"));
95 
// Default threshold of 200 candidate pairs for the full cycle check.
96 static cl::opt<unsigned>
97 MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200),
98  cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"
99  " a full cycle check"));
100 
// Hidden command-line switches that turn off vectorization for a particular
// value type or instruction kind. Every switch defaults to false except
// NoPointers, which defaults to true (see the FIXME on that option).
101 static cl::opt<bool>
102 NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden,
103  cl::desc("Don't try to vectorize boolean (i1) values"));
104 
105 static cl::opt<bool>
106 NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden,
107  cl::desc("Don't try to vectorize integer values"));
108 
109 static cl::opt<bool>
110 NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden,
111  cl::desc("Don't try to vectorize floating-point values"));
112 
113 // FIXME: This should default to false once pointer vector support works.
114 static cl::opt<bool>
115 NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden,
116  cl::desc("Don't try to vectorize pointer values"));
117 
118 static cl::opt<bool>
119 NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden,
120  cl::desc("Don't try to vectorize casting (conversion) operations"));
121 
122 static cl::opt<bool>
123 NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden,
124  cl::desc("Don't try to vectorize floating-point math intrinsics"));
125 
126 static cl::opt<bool>
127  NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden,
128  cl::desc("Don't try to vectorize BitManipulation intrinsics"));
129 
130 static cl::opt<bool>
131 NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden,
132  cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
133 
134 static cl::opt<bool>
135 NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden,
136  cl::desc("Don't try to vectorize select instructions"));
137 
138 static cl::opt<bool>
139 NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden,
140  cl::desc("Don't try to vectorize comparison instructions"));
141 
142 static cl::opt<bool>
143 NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden,
144  cl::desc("Don't try to vectorize getelementptr instructions"));
145 
146 static cl::opt<bool>
147 NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden,
148  cl::desc("Don't try to vectorize loads and stores"));
149 
// Hidden command-line switches for memory-operation handling and for the
// dependency analysis. All three default to false.
150 static cl::opt<bool>
151 AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden,
152  cl::desc("Only generate aligned loads and stores"));
153 
// NOTE(review): getDepthFactor reads Config.NoMemOpBoost when weighting loads
// and stores; presumably that field is seeded from this option - confirm.
154 static cl::opt<bool>
155 NoMemOpBoost("bb-vectorize-no-mem-op-boost",
156  cl::init(false), cl::Hidden,
157  cl::desc("Don't boost the chain-depth contribution of loads and stores"));
158 
159 static cl::opt<bool>
160 FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden,
161  cl::desc("Use a fast instruction dependency analysis"));
162 
// Diagnostic switches that exist only when NDEBUG is not defined. Each one
// defaults to false and enables extra output for one phase of the pass.
163 #ifndef NDEBUG
164 static cl::opt<bool>
165 DebugInstructionExamination("bb-vectorize-debug-instruction-examination",
166  cl::init(false), cl::Hidden,
167  cl::desc("When debugging is enabled, output information on the"
168  " instruction-examination process"));
169 static cl::opt<bool>
170 DebugCandidateSelection("bb-vectorize-debug-candidate-selection",
171  cl::init(false), cl::Hidden,
172  cl::desc("When debugging is enabled, output information on the"
173  " candidate-selection process"));
174 static cl::opt<bool>
175 DebugPairSelection("bb-vectorize-debug-pair-selection",
176  cl::init(false), cl::Hidden,
177  cl::desc("When debugging is enabled, output information on the"
178  " pair-selection process"));
179 static cl::opt<bool>
180 DebugCycleCheck("bb-vectorize-debug-cycle-check",
181  cl::init(false), cl::Hidden,
182  cl::desc("When debugging is enabled, output information on the"
183  " cycle-checking process"));
184 
185 static cl::opt<bool>
186 PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair",
187  cl::init(false), cl::Hidden,
188  cl::desc("When debugging is enabled, dump the basic block after"
189  " every pair is fused"));
190 #endif
191 
// Statistic counter; vectorizePairs adds the number of chosen pairs to it on
// each iteration that fuses something.
192 STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
193 
194 namespace {
195  struct BBVectorize : public BasicBlockPass {
196  static char ID; // Pass identification, replacement for typeid
197 
198  const VectorizeConfig Config;
199 
// Constructs the pass with configuration C, which defaults to a
// default-constructed VectorizeConfig. The analysis pointers (AA, DT, SE, TTI)
// are not assigned here; runOnBasicBlock sets them before any work is done.
// NOTE(review): this listing skips from line 201 to line 203, so a statement
// in the constructor body may be missing here - confirm against upstream.
200  BBVectorize(const VectorizeConfig &C = VectorizeConfig())
201  : BasicBlockPass(ID), Config(C) {
203  }
204 
205  BBVectorize(Pass *P, Function &F, const VectorizeConfig &C)
206  : BasicBlockPass(ID), Config(C) {
207  AA = &P->getAnalysis<AliasAnalysis>();
208  DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
209  SE = &P->getAnalysis<ScalarEvolution>();
210  TTI = IgnoreTargetInfo
211  ? nullptr
212  : &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
213  }
214 
// Shorthand pair types used throughout the pass.
// ValuePair: two values considered together as a candidate pair.
215  typedef std::pair<Value *, Value *> ValuePair;
// ValuePairWithCost: a pair plus a signed integer cost.
216  typedef std::pair<ValuePair, int> ValuePairWithCost;
// ValuePairWithDepth: a pair plus an unsigned depth.
217  typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
218  typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
// VPPairWithType: a pair of pairs plus an unsigned tag.
// NOTE(review): the tag is presumably a PairConnectionType value - confirm.
219  typedef std::pair<VPPair, unsigned> VPPairWithType;
220 
// Analysis results used by the pass. They are assigned in the (Pass *,
// Function &, Config) constructor and in runOnBasicBlock. TTI is null when
// the IgnoreTargetInfo option is set.
221  AliasAnalysis *AA;
222  DominatorTree *DT;
223  ScalarEvolution *SE;
224  const TargetTransformInfo *TTI;
225 
226  // FIXME: const correct?
227 
228  bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false);
229 
230  bool getCandidatePairs(BasicBlock &BB,
231  BasicBlock::iterator &Start,
232  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
233  DenseSet<ValuePair> &FixedOrderPairs,
234  DenseMap<ValuePair, int> &CandidatePairCostSavings,
235  std::vector<Value *> &PairableInsts, bool NonPow2Len);
236 
// Tags describing how one candidate pair is connected to another. The values
// are stored as unsigned in the PairConnectionTypes maps passed between the
// pass phases.
// NOTE(review): presumably Direct means the operands line up in order, Swap
// means they line up in reversed order, and Splat means a single element
// feeds both members - confirm against computePairsConnectedTo.
237  // FIXME: The current implementation does not account for pairs that
238  // are connected in multiple ways. For example:
239  // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap)
240  enum PairConnectionType {
241  PairConnectionDirect,
242  PairConnectionSwap,
243  PairConnectionSplat
244  };
245 
246  void computeConnectedPairs(
247  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
248  DenseSet<ValuePair> &CandidatePairsSet,
249  std::vector<Value *> &PairableInsts,
250  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
251  DenseMap<VPPair, unsigned> &PairConnectionTypes);
252 
253  void buildDepMap(BasicBlock &BB,
254  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
255  std::vector<Value *> &PairableInsts,
256  DenseSet<ValuePair> &PairableInstUsers);
257 
258  void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
259  DenseSet<ValuePair> &CandidatePairsSet,
260  DenseMap<ValuePair, int> &CandidatePairCostSavings,
261  std::vector<Value *> &PairableInsts,
262  DenseSet<ValuePair> &FixedOrderPairs,
263  DenseMap<VPPair, unsigned> &PairConnectionTypes,
264  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
265  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
266  DenseSet<ValuePair> &PairableInstUsers,
267  DenseMap<Value *, Value *>& ChosenPairs);
268 
269  void fuseChosenPairs(BasicBlock &BB,
270  std::vector<Value *> &PairableInsts,
271  DenseMap<Value *, Value *>& ChosenPairs,
272  DenseSet<ValuePair> &FixedOrderPairs,
273  DenseMap<VPPair, unsigned> &PairConnectionTypes,
274  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
275  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps);
276 
277 
278  bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
279 
280  bool areInstsCompatible(Instruction *I, Instruction *J,
281  bool IsSimpleLoadStore, bool NonPow2Len,
282  int &CostSavings, int &FixedOrder);
283 
284  bool trackUsesOfI(DenseSet<Value *> &Users,
285  AliasSetTracker &WriteSet, Instruction *I,
286  Instruction *J, bool UpdateUsers = true,
287  DenseSet<ValuePair> *LoadMoveSetPairs = nullptr);
288 
289  void computePairsConnectedTo(
290  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
291  DenseSet<ValuePair> &CandidatePairsSet,
292  std::vector<Value *> &PairableInsts,
293  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
294  DenseMap<VPPair, unsigned> &PairConnectionTypes,
295  ValuePair P);
296 
297  bool pairsConflict(ValuePair P, ValuePair Q,
298  DenseSet<ValuePair> &PairableInstUsers,
299  DenseMap<ValuePair, std::vector<ValuePair> >
300  *PairableInstUserMap = nullptr,
301  DenseSet<VPPair> *PairableInstUserPairSet = nullptr);
302 
303  bool pairWillFormCycle(ValuePair P,
304  DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers,
305  DenseSet<ValuePair> &CurrentPairs);
306 
307  void pruneDAGFor(
308  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
309  std::vector<Value *> &PairableInsts,
310  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
311  DenseSet<ValuePair> &PairableInstUsers,
312  DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
313  DenseSet<VPPair> &PairableInstUserPairSet,
314  DenseMap<Value *, Value *> &ChosenPairs,
316  DenseSet<ValuePair> &PrunedDAG, ValuePair J,
317  bool UseCycleCheck);
318 
319  void buildInitialDAGFor(
320  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
321  DenseSet<ValuePair> &CandidatePairsSet,
322  std::vector<Value *> &PairableInsts,
323  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
324  DenseSet<ValuePair> &PairableInstUsers,
325  DenseMap<Value *, Value *> &ChosenPairs,
326  DenseMap<ValuePair, size_t> &DAG, ValuePair J);
327 
328  void findBestDAGFor(
329  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
330  DenseSet<ValuePair> &CandidatePairsSet,
331  DenseMap<ValuePair, int> &CandidatePairCostSavings,
332  std::vector<Value *> &PairableInsts,
333  DenseSet<ValuePair> &FixedOrderPairs,
334  DenseMap<VPPair, unsigned> &PairConnectionTypes,
335  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
336  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
337  DenseSet<ValuePair> &PairableInstUsers,
338  DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
339  DenseSet<VPPair> &PairableInstUserPairSet,
340  DenseMap<Value *, Value *> &ChosenPairs,
341  DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
342  int &BestEffSize, Value *II, std::vector<Value *>&JJ,
343  bool UseCycleCheck);
344 
345  Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
346  Instruction *J, unsigned o);
347 
348  void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
349  unsigned MaskOffset, unsigned NumInElem,
350  unsigned NumInElem1, unsigned IdxOffset,
351  std::vector<Constant*> &Mask);
352 
353  Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I,
354  Instruction *J);
355 
356  bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J,
357  unsigned o, Value *&LOp, unsigned numElemL,
358  Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ,
359  unsigned IdxOff = 0);
360 
361  Value *getReplacementInput(LLVMContext& Context, Instruction *I,
362  Instruction *J, unsigned o, bool IBeforeJ);
363 
364  void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
365  Instruction *J, SmallVectorImpl<Value *> &ReplacedOperands,
366  bool IBeforeJ);
367 
368  void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
369  Instruction *J, Instruction *K,
370  Instruction *&InsertionPt, Instruction *&K1,
371  Instruction *&K2);
372 
373  void collectPairLoadMoveSet(BasicBlock &BB,
374  DenseMap<Value *, Value *> &ChosenPairs,
375  DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
376  DenseSet<ValuePair> &LoadMoveSetPairs,
377  Instruction *I);
378 
379  void collectLoadMoveSet(BasicBlock &BB,
380  std::vector<Value *> &PairableInsts,
381  DenseMap<Value *, Value *> &ChosenPairs,
382  DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
383  DenseSet<ValuePair> &LoadMoveSetPairs);
384 
385  bool canMoveUsesOfIAfterJ(BasicBlock &BB,
386  DenseSet<ValuePair> &LoadMoveSetPairs,
387  Instruction *I, Instruction *J);
388 
389  void moveUsesOfIAfterJ(BasicBlock &BB,
390  DenseSet<ValuePair> &LoadMoveSetPairs,
391  Instruction *&InsertionPt,
392  Instruction *I, Instruction *J);
393 
394  bool vectorizeBB(BasicBlock &BB) {
395  if (skipOptnoneFunction(BB))
396  return false;
397  if (!DT->isReachableFromEntry(&BB)) {
398  DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() <<
399  " in " << BB.getParent()->getName() << "\n");
400  return false;
401  }
402 
403  DEBUG(if (TTI) dbgs() << "BBV: using target information\n");
404 
405  bool changed = false;
406  // Iterate a sufficient number of times to merge types of size 1 bit,
407  // then 2 bits, then 4, etc. up to half of the target vector width of the
408  // target vector register.
409  unsigned n = 1;
410  for (unsigned v = 2;
411  (TTI || v <= Config.VectorBits) &&
412  (!Config.MaxIter || n <= Config.MaxIter);
413  v *= 2, ++n) {
414  DEBUG(dbgs() << "BBV: fusing loop #" << n <<
415  " for " << BB.getName() << " in " <<
416  BB.getParent()->getName() << "...\n");
417  if (vectorizePairs(BB))
418  changed = true;
419  else
420  break;
421  }
422 
423  if (changed && !Pow2LenOnly) {
424  ++n;
425  for (; !Config.MaxIter || n <= Config.MaxIter; ++n) {
426  DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " <<
427  n << " for " << BB.getName() << " in " <<
428  BB.getParent()->getName() << "...\n");
429  if (!vectorizePairs(BB, true)) break;
430  }
431  }
432 
433  DEBUG(dbgs() << "BBV: done!\n");
434  return changed;
435  }
436 
437  bool runOnBasicBlock(BasicBlock &BB) override {
438  // OptimizeNone check deferred to vectorizeBB().
439 
440  AA = &getAnalysis<AliasAnalysis>();
441  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
442  SE = &getAnalysis<ScalarEvolution>();
443  TTI = IgnoreTargetInfo
444  ? nullptr
445  : &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
446  *BB.getParent());
447 
448  return vectorizeBB(BB);
449  }
450 
// Declares this pass's analysis requirements and what it preserves. The only
// statement visible here marks the CFG as preserved.
// NOTE(review): this listing skips from line 451 to line 460, so further
// statements are presumably missing from this body - confirm against
// upstream before relying on this text.
451  void getAnalysisUsage(AnalysisUsage &AU) const override {
460  AU.setPreservesCFG();
461  }
462 
463  static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) {
464  assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() &&
465  "Cannot form vector from incompatible scalar types");
466  Type *STy = ElemTy->getScalarType();
467 
468  unsigned numElem;
469  if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) {
470  numElem = VTy->getNumElements();
471  } else {
472  numElem = 1;
473  }
474 
475  if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) {
476  numElem += VTy->getNumElements();
477  } else {
478  numElem += 1;
479  }
480 
481  return VectorType::get(STy, numElem);
482  }
483 
484  static inline void getInstructionTypes(Instruction *I,
485  Type *&T1, Type *&T2) {
486  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
487  // For stores, it is the value type, not the pointer type that matters
488  // because the value is what will come from a vector register.
489 
490  Value *IVal = SI->getValueOperand();
491  T1 = IVal->getType();
492  } else {
493  T1 = I->getType();
494  }
495 
496  if (CastInst *CI = dyn_cast<CastInst>(I))
497  T2 = CI->getSrcTy();
498  else
499  T2 = T1;
500 
501  if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
502  T2 = SI->getCondition()->getType();
503  } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
504  T2 = SI->getOperand(0)->getType();
505  } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) {
506  T2 = CI->getOperand(0)->getType();
507  }
508  }
509 
510  // Returns the weight associated with the provided value. A chain of
511  // candidate pairs has a length given by the sum of the weights of its
512  // members (one weight per pair; the weight of each member of the pair
513  // is assumed to be the same). This length is then compared to the
514  // chain-length threshold to determine if a given chain is significant
515  // enough to be vectorized. The length is also used in comparing
516  // candidate chains where longer chains are considered to be better.
517  // Note: when this function returns 0, the resulting instructions are
518  // not actually fused.
519  inline size_t getDepthFactor(Value *V) {
520  // InsertElement and ExtractElement have a depth factor of zero. This is
521  // for two reasons: First, they cannot be usefully fused. Second, because
522  // the pass generates a lot of these, they can confuse the simple metric
523  // used to compare the dags in the next iteration. Thus, giving them a
524  // weight of zero allows the pass to essentially ignore them in
525  // subsequent iterations when looking for vectorization opportunities
526  // while still tracking dependency chains that flow through those
527  // instructions.
528  if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V))
529  return 0;
530 
531  // Give a load or store half of the required depth so that load/store
532  // pairs will vectorize.
533  if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V)))
534  return Config.ReqChainDepth/2;
535 
536  return 1;
537  }
538 
// Dispatches on Opcode to the matching TargetTransformInfo cost query and
// returns its result. Opcodes not listed in the switch cost 1.
// NOTE(review): this listing skips from line 541 to line 546, so the rest of
// the parameter list (which presumably declares Op1VK and Op2VK, used below)
// and anything else before the switch is missing here - confirm against
// upstream.
// NOTE(review): TTI is dereferenced without a null check; callers presumably
// only reach this when target information is in use - confirm.
539  // Returns the cost of the provided instruction using TTI.
540  // This does not handle loads and stores.
541  unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2,
546  switch (Opcode) {
547  default: break;
548  case Instruction::GetElementPtr:
549  // We mark this instruction as zero-cost because scalar GEPs are usually
550  // lowered to the instruction addressing mode. At the moment we don't
551  // generate vector GEPs.
552  return 0;
553  case Instruction::Br:
554  return TTI->getCFInstrCost(Opcode);
555  case Instruction::PHI:
556  return 0;
// Binary arithmetic, shift and bitwise operations: costed on T1 with the
// operand value kinds.
557  case Instruction::Add:
558  case Instruction::FAdd:
559  case Instruction::Sub:
560  case Instruction::FSub:
561  case Instruction::Mul:
562  case Instruction::FMul:
563  case Instruction::UDiv:
564  case Instruction::SDiv:
565  case Instruction::FDiv:
566  case Instruction::URem:
567  case Instruction::SRem:
568  case Instruction::FRem:
569  case Instruction::Shl:
570  case Instruction::LShr:
571  case Instruction::AShr:
572  case Instruction::And:
573  case Instruction::Or:
574  case Instruction::Xor:
575  return TTI->getArithmeticInstrCost(Opcode, T1, Op1VK, Op2VK);
// Select and comparisons: costed on T1 together with T2.
576  case Instruction::Select:
577  case Instruction::ICmp:
578  case Instruction::FCmp:
579  return TTI->getCmpSelInstrCost(Opcode, T1, T2);
// Conversions, and shufflevector as well, go through the cast cost query.
580  case Instruction::ZExt:
581  case Instruction::SExt:
582  case Instruction::FPToUI:
583  case Instruction::FPToSI:
584  case Instruction::FPExt:
585  case Instruction::PtrToInt:
586  case Instruction::IntToPtr:
587  case Instruction::SIToFP:
588  case Instruction::UIToFP:
589  case Instruction::Trunc:
590  case Instruction::FPTrunc:
591  case Instruction::BitCast:
592  case Instruction::ShuffleVector:
593  return TTI->getCastInstrCost(Opcode, T1, T2);
594  }
595 
// Reached only through the default label above.
596  return 1;
597  }
598 
599  // This determines the relative offset of two loads or stores, returning
600  // true if the offset could be determined to be some constant value.
601  // For example, if OffsetInElmts == 1, then J accesses the memory directly
602  // after I; if OffsetInElmts == -1 then I accesses the memory
603  // directly after J.
604  bool getPairPtrInfo(Instruction *I, Instruction *J,
605  Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
606  unsigned &IAddressSpace, unsigned &JAddressSpace,
607  int64_t &OffsetInElmts, bool ComputeOffset = true) {
608  OffsetInElmts = 0;
609  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
610  LoadInst *LJ = cast<LoadInst>(J);
611  IPtr = LI->getPointerOperand();
612  JPtr = LJ->getPointerOperand();
613  IAlignment = LI->getAlignment();
614  JAlignment = LJ->getAlignment();
615  IAddressSpace = LI->getPointerAddressSpace();
616  JAddressSpace = LJ->getPointerAddressSpace();
617  } else {
618  StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J);
619  IPtr = SI->getPointerOperand();
620  JPtr = SJ->getPointerOperand();
621  IAlignment = SI->getAlignment();
622  JAlignment = SJ->getAlignment();
623  IAddressSpace = SI->getPointerAddressSpace();
624  JAddressSpace = SJ->getPointerAddressSpace();
625  }
626 
627  if (!ComputeOffset)
628  return true;
629 
630  const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
631  const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
632 
633  // If this is a trivial offset, then we'll get something like
634  // 1*sizeof(type). With target data, which we need anyway, this will get
635  // constant folded into a number.
636  const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV);
637  if (const SCEVConstant *ConstOffSCEV =
638  dyn_cast<SCEVConstant>(OffsetSCEV)) {
639  ConstantInt *IntOff = ConstOffSCEV->getValue();
640  int64_t Offset = IntOff->getSExtValue();
641  const DataLayout &DL = I->getModule()->getDataLayout();
642  Type *VTy = IPtr->getType()->getPointerElementType();
643  int64_t VTyTSS = (int64_t)DL.getTypeStoreSize(VTy);
644 
645  Type *VTy2 = JPtr->getType()->getPointerElementType();
646  if (VTy != VTy2 && Offset < 0) {
647  int64_t VTy2TSS = (int64_t)DL.getTypeStoreSize(VTy2);
648  OffsetInElmts = Offset/VTy2TSS;
649  return (std::abs(Offset) % VTy2TSS) == 0;
650  }
651 
652  OffsetInElmts = Offset/VTyTSS;
653  return (std::abs(Offset) % VTyTSS) == 0;
654  }
655 
656  return false;
657  }
658 
659  // Returns true if the provided CallInst represents an intrinsic that can
660  // be vectorized.
661  bool isVectorizableIntrinsic(CallInst* I) {
662  Function *F = I->getCalledFunction();
663  if (!F) return false;
664 
665  Intrinsic::ID IID = F->getIntrinsicID();
666  if (!IID) return false;
667 
668  switch(IID) {
669  default:
670  return false;
671  case Intrinsic::sqrt:
672  case Intrinsic::powi:
673  case Intrinsic::sin:
674  case Intrinsic::cos:
675  case Intrinsic::log:
676  case Intrinsic::log2:
677  case Intrinsic::log10:
678  case Intrinsic::exp:
679  case Intrinsic::exp2:
680  case Intrinsic::pow:
681  case Intrinsic::round:
682  case Intrinsic::copysign:
683  case Intrinsic::ceil:
684  case Intrinsic::nearbyint:
685  case Intrinsic::rint:
686  case Intrinsic::trunc:
687  case Intrinsic::floor:
688  case Intrinsic::fabs:
689  case Intrinsic::minnum:
690  case Intrinsic::maxnum:
691  return Config.VectorizeMath;
692  case Intrinsic::bswap:
693  case Intrinsic::ctpop:
694  case Intrinsic::ctlz:
695  case Intrinsic::cttz:
696  return Config.VectorizeBitManipulations;
697  case Intrinsic::fma:
698  case Intrinsic::fmuladd:
699  return Config.VectorizeFMA;
700  }
701  }
702 
703  bool isPureIEChain(InsertElementInst *IE) {
704  InsertElementInst *IENext = IE;
705  do {
706  if (!isa<UndefValue>(IENext->getOperand(0)) &&
707  !isa<InsertElementInst>(IENext->getOperand(0))) {
708  return false;
709  }
710  } while ((IENext =
711  dyn_cast<InsertElementInst>(IENext->getOperand(0))));
712 
713  return true;
714  }
715  };
716 
// Runs one vectorization iteration over BB: collect candidate pairs, work out
// how pairs are connected, choose the pairs to fuse, fuse them, and simplify
// the block. NonPow2Len is forwarded to getCandidatePairs. Returns false,
// leaving the block untouched, when no pairs were chosen.
// NOTE(review): this listing omits several lines of this function (721, 732,
// 758 and 806). They presumably declare Start, CandidatePairs, ConnectedPairs
// and the iterator K, all of which are used below - confirm against upstream.
717  // This function implements one vectorization iteration on the provided
718  // basic block. It returns true if the block is changed.
719  bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) {
720  bool ShouldContinue;
722 
// Accumulators that outlive the loop below; they gather the results of every
// loop iteration and are what fuseChosenPairs finally receives.
723  std::vector<Value *> AllPairableInsts;
724  DenseMap<Value *, Value *> AllChosenPairs;
725  DenseSet<ValuePair> AllFixedOrderPairs;
726  DenseMap<VPPair, unsigned> AllPairConnectionTypes;
727  DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs,
728  AllConnectedPairDeps;
729 
// Each pass through this loop handles whatever getCandidatePairs collects
// starting at Start; the loop repeats for as long as that call returns true.
// Every continue below jumps to the while condition, so an unproductive
// iteration still moves on to the next one when ShouldContinue is set.
730  do {
731  std::vector<Value *> PairableInsts;
733  DenseSet<ValuePair> FixedOrderPairs;
734  DenseMap<ValuePair, int> CandidatePairCostSavings;
735  ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
736  FixedOrderPairs,
737  CandidatePairCostSavings,
738  PairableInsts, NonPow2Len);
739  if (PairableInsts.empty()) continue;
740 
741  // Build the candidate pair set for faster lookups.
742  DenseSet<ValuePair> CandidatePairsSet;
743  for (DenseMap<Value *, std::vector<Value *> >::iterator I =
744  CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I)
745  for (std::vector<Value *>::iterator J = I->second.begin(),
746  JE = I->second.end(); J != JE; ++J)
747  CandidatePairsSet.insert(ValuePair(I->first, *J));
748 
749  // Now we have a map of all of the pairable instructions and we need to
750  // select the best possible pairing. A good pairing is one such that the
751  // users of the pair are also paired. This defines a (directed) forest
752  // over the pairs such that two pairs are connected iff the second pair
753  // uses the first.
754 
755  // Note that it only matters that both members of the second pair use some
756  // element of the first pair (to allow for splatting).
757 
759  ConnectedPairDeps;
760  DenseMap<VPPair, unsigned> PairConnectionTypes;
761  computeConnectedPairs(CandidatePairs, CandidatePairsSet,
762  PairableInsts, ConnectedPairs, PairConnectionTypes);
763  if (ConnectedPairs.empty()) continue;
764 
// ConnectedPairDeps is the reverse of ConnectedPairs: for every edge from
// pair P to pair Q recorded there, P is appended to the list kept for Q.
765  for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
766  I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
767  I != IE; ++I)
768  for (std::vector<ValuePair>::iterator J = I->second.begin(),
769  JE = I->second.end(); J != JE; ++J)
770  ConnectedPairDeps[*J].push_back(I->first);
771 
772  // Build the pairable-instruction dependency map
773  DenseSet<ValuePair> PairableInstUsers;
774  buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
775 
776  // There is now a graph of the connected pairs. For each variable, pick
777  // the pairing with the largest dag meeting the depth requirement on at
778  // least one branch. Then select all pairings that are part of that dag
779  // and remove them from the list of available pairings and pairable
780  // variables.
781 
782  DenseMap<Value *, Value *> ChosenPairs;
783  choosePairs(CandidatePairs, CandidatePairsSet,
784  CandidatePairCostSavings,
785  PairableInsts, FixedOrderPairs, PairConnectionTypes,
786  ConnectedPairs, ConnectedPairDeps,
787  PairableInstUsers, ChosenPairs);
788 
789  if (ChosenPairs.empty()) continue;
790  AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(),
791  PairableInsts.end());
792  AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
793 
794  // Only for the chosen pairs, propagate information on fixed-order pairs,
795  // pair connections, and their types to the data structures used by the
796  // pair fusion procedures.
797  for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(),
798  IE = ChosenPairs.end(); I != IE; ++I) {
// A fixed-order entry may be recorded in either orientation; keep whichever
// orientation was recorded.
799  if (FixedOrderPairs.count(*I))
800  AllFixedOrderPairs.insert(*I);
801  else if (FixedOrderPairs.count(ValuePair(I->second, I->first)))
802  AllFixedOrderPairs.insert(ValuePair(I->second, I->first));
803 
// Copy the connection type between every two chosen pairs, looking the
// connection up as (I, J) first and as (J, I) only if that is absent.
804  for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin();
805  J != IE; ++J) {
807  PairConnectionTypes.find(VPPair(*I, *J));
808  if (K != PairConnectionTypes.end()) {
809  AllPairConnectionTypes.insert(*K);
810  } else {
811  K = PairConnectionTypes.find(VPPair(*J, *I));
812  if (K != PairConnectionTypes.end())
813  AllPairConnectionTypes.insert(*K);
814  }
815  }
816  }
817 
// Keep only those connection edges whose type was copied above, recording
// each kept edge in both the forward and the reverse accumulator.
818  for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
819  I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
820  I != IE; ++I)
821  for (std::vector<ValuePair>::iterator J = I->second.begin(),
822  JE = I->second.end(); J != JE; ++J)
823  if (AllPairConnectionTypes.count(VPPair(I->first, *J))) {
824  AllConnectedPairs[I->first].push_back(*J);
825  AllConnectedPairDeps[*J].push_back(I->first);
826  }
827  } while (ShouldContinue);
828 
829  if (AllChosenPairs.empty()) return false;
// The statistic counts one fused operation per chosen pair.
830  NumFusedOps += AllChosenPairs.size();
831 
832  // A set of pairs has now been selected. It is now necessary to replace the
833  // paired instructions with vector instructions. For this procedure each
834  // operand must be replaced with a vector operand. This vector is formed
835  // by using build_vector on the old operands. The replaced values are then
836  // replaced with a vector_extract on the result. Subsequent optimization
837  // passes should coalesce the build/extract combinations.
838 
839  fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs,
840  AllPairConnectionTypes,
841  AllConnectedPairs, AllConnectedPairDeps);
842 
843  // It is important to cleanup here so that future iterations of this
844  // function have less work to do.
845  (void)SimplifyInstructionsInBlock(&BB, AA->getTargetLibraryInfo());
846  return true;
847  }
848 
849  // This function returns true if the provided instruction is capable of being
850  // fused into a vector instruction. This determination is based only on the
851  // type and other attributes of the instruction.
852  bool BBVectorize::isInstVectorizable(Instruction *I,
853  bool &IsSimpleLoadStore) {
854  IsSimpleLoadStore = false;
855 
856  if (CallInst *C = dyn_cast<CallInst>(I)) {
857  if (!isVectorizableIntrinsic(C))
858  return false;
859  } else if (LoadInst *L = dyn_cast<LoadInst>(I)) {
860  // Vectorize simple loads if possbile:
861  IsSimpleLoadStore = L->isSimple();
862  if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
863  return false;
864  } else if (StoreInst *S = dyn_cast<StoreInst>(I)) {
865  // Vectorize simple stores if possbile:
866  IsSimpleLoadStore = S->isSimple();
867  if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
868  return false;
869  } else if (CastInst *C = dyn_cast<CastInst>(I)) {
870  // We can vectorize casts, but not casts of pointer types, etc.
871  if (!Config.VectorizeCasts)
872  return false;
873 
874  Type *SrcTy = C->getSrcTy();
875  if (!SrcTy->isSingleValueType())
876  return false;
877 
878  Type *DestTy = C->getDestTy();
879  if (!DestTy->isSingleValueType())
880  return false;
881  } else if (isa<SelectInst>(I)) {
882  if (!Config.VectorizeSelect)
883  return false;
884  } else if (isa<CmpInst>(I)) {
885  if (!Config.VectorizeCmp)
886  return false;
887  } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) {
888  if (!Config.VectorizeGEP)
889  return false;
890 
891  // Currently, vector GEPs exist only with one index.
892  if (G->getNumIndices() != 1)
893  return false;
894  } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) ||
895  isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) {
896  return false;
897  }
898 
899  Type *T1, *T2;
900  getInstructionTypes(I, T1, T2);
901 
902  // Not every type can be vectorized...
903  if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) ||
905  return false;
906 
907  if (T1->getScalarSizeInBits() == 1) {
908  if (!Config.VectorizeBools)
909  return false;
910  } else {
911  if (!Config.VectorizeInts && T1->isIntOrIntVectorTy())
912  return false;
913  }
914 
915  if (T2->getScalarSizeInBits() == 1) {
916  if (!Config.VectorizeBools)
917  return false;
918  } else {
919  if (!Config.VectorizeInts && T2->isIntOrIntVectorTy())
920  return false;
921  }
922 
923  if (!Config.VectorizeFloats
924  && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
925  return false;
926 
927  // Don't vectorize target-specific types.
928  if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy())
929  return false;
930  if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy())
931  return false;
932 
933  if (!Config.VectorizePointers && (T1->getScalarType()->isPointerTy() ||
934  T2->getScalarType()->isPointerTy()))
935  return false;
936 
937  if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
938  T2->getPrimitiveSizeInBits() >= Config.VectorBits))
939  return false;
940 
941  return true;
942  }
943 
944  // This function returns true if the two provided instructions are compatible
945  // (meaning that they can be fused into a vector instruction). This assumes
946  // that I has already been determined to be vectorizable and that J is not
947  // in the use dag of I.
  //
  // Outputs: CostSavings is reset to 0 and, on the paths where TTI is
  // available, set to ICost + JCost - VCost for the pair. FixedOrder is reset
  // to 0 and, for a simple load/store pair, set to OffsetInElmts as reported
  // by getPairPtrInfo (which must be +1 or -1 for the pair to be accepted).
  //
  // NOTE(review): this listing skips several embedded line numbers inside this
  // function (959, 967-968, 1036-1039, 1056, 1064, 1068, 1112); the statements
  // on those lines are not visible here and each gap is flagged below.
948  bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
949  bool IsSimpleLoadStore, bool NonPow2Len,
950  int &CostSavings, int &FixedOrder) {
951  DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
952  " <-> " << *J << "\n");
953 
954  CostSavings = 0;
955  FixedOrder = 0;
956 
957  // Loads and stores can be merged if they have different alignments,
958  // but are otherwise the same.
  // NOTE(review): embedded line 959 is absent; the head of the `if (...)`
  // whose argument list continues on the next line is not visible here.
960  (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0)))
961  return false;
962 
963  Type *IT1, *IT2, *JT1, *JT2;
964  getInstructionTypes(I, IT1, IT2);
965  getInstructionTypes(J, JT1, JT2);
966  unsigned MaxTypeBits = std::max(
  // NOTE(review): embedded lines 967-968 (the arguments to std::max) are
  // absent from this listing.
  // Without target information, MaxTypeBits is compared against the configured
  // vector width.
969  if (!TTI && MaxTypeBits > Config.VectorBits)
970  return false;
971 
972  // FIXME: handle addsub-type operations!
973 
974  if (IsSimpleLoadStore) {
975  Value *IPtr, *JPtr;
976  unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
977  int64_t OffsetInElmts = 0;
  // Only load/store pairs for which getPairPtrInfo succeeds and reports an
  // offset of exactly one element (in either direction) are accepted.
978  if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
979  IAddressSpace, JAddressSpace, OffsetInElmts) &&
980  std::abs(OffsetInElmts) == 1) {
981  FixedOrder = (int) OffsetInElmts;
  // BottomAlignment is I's alignment unless the offset is negative, in which
  // case J's alignment is used instead.
982  unsigned BottomAlignment = IAlignment;
983  if (OffsetInElmts < 0) BottomAlignment = JAlignment;
984 
  // For stores the relevant type is that of the stored value; otherwise it is
  // the instruction's own result type.
985  Type *aTypeI = isa<StoreInst>(I) ?
986  cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
987  Type *aTypeJ = isa<StoreInst>(J) ?
988  cast<StoreInst>(J)->getValueOperand()->getType() : J->getType();
989  Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
990 
991  if (Config.AlignedOnly) {
992  // An aligned load or store is possible only if the instruction
993  // with the lower offset has an alignment suitable for the
994  // vector type.
995  const DataLayout &DL = I->getModule()->getDataLayout();
996  unsigned VecAlignment = DL.getPrefTypeAlignment(VType);
997  if (BottomAlignment < VecAlignment)
998  return false;
999  }
1000 
1001  if (TTI) {
  // Compare the cost of the two scalar memory operations (plus their address
  // computations) against the cost of the single fused vector operation.
1002  unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI,
1003  IAlignment, IAddressSpace);
1004  unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ,
1005  JAlignment, JAddressSpace);
1006  unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType,
1007  BottomAlignment,
1008  IAddressSpace);
1009 
1010  ICost += TTI->getAddressComputationCost(aTypeI);
1011  JCost += TTI->getAddressComputationCost(aTypeJ);
1012  VCost += TTI->getAddressComputationCost(VType);
1013 
1014  if (VCost > ICost + JCost)
1015  return false;
1016 
1017  // We don't want to fuse to a type that will be split, even
1018  // if the two input types will also be split and there is no other
1019  // associated cost.
1020  unsigned VParts = TTI->getNumberOfParts(VType);
1021  if (VParts > 1)
1022  return false;
1023  else if (!VParts && VCost == ICost + JCost)
1024  return false;
1025 
1026  CostSavings = ICost + JCost - VCost;
1027  }
1028  } else {
1029  return false;
1030  }
1031  } else if (TTI) {
  // Not a simple load/store: use the generic instruction cost model.
1032  unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2);
1033  unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
1034  Type *VT1 = getVecTypeForPair(IT1, JT1),
1035  *VT2 = getVecTypeForPair(IT2, JT2);
  // NOTE(review): embedded lines 1036-1039 are absent; Op1VK and Op2VK, which
  // are used below, are presumably declared there.
1040 
1041  // On some targets (example X86) the cost of a vector shift may vary
1042  // depending on whether the second operand is a Uniform or
1043  // NonUniform Constant.
1044  switch (I->getOpcode()) {
1045  default : break;
1046  case Instruction::Shl:
1047  case Instruction::LShr:
1048  case Instruction::AShr:
1049 
1050  // If both I and J are scalar shifts by constant, then the
1051  // merged vector shift count would be either a constant splat value
1052  // or a non-uniform vector of constants.
1053  if (ConstantInt *CII = dyn_cast<ConstantInt>(I->getOperand(1))) {
1054  if (ConstantInt *CIJ = dyn_cast<ConstantInt>(J->getOperand(1)))
1055  Op2VK = CII == CIJ ? TargetTransformInfo::OK_UniformConstantValue :
  // NOTE(review): embedded line 1056 (the other arm of the conditional
  // expression above) is absent from this listing.
1057  } else {
1058  // Check for a splat of a constant or for a non uniform vector
1059  // of constants.
1060  Value *IOp = I->getOperand(1);
1061  Value *JOp = J->getOperand(1);
1062  if ((isa<ConstantVector>(IOp) || isa<ConstantDataVector>(IOp)) &&
1063  (isa<ConstantVector>(JOp) || isa<ConstantDataVector>(JOp))) {
  // NOTE(review): embedded line 1064 is absent from this listing.
1065  Constant *SplatValue = cast<Constant>(IOp)->getSplatValue();
1066  if (SplatValue != nullptr &&
1067  SplatValue == cast<Constant>(JOp)->getSplatValue())
  // NOTE(review): embedded line 1068 (the statement controlled by the `if`
  // above) is absent from this listing.
1069  }
1070  }
1071  }
1072 
1073  // Note that this procedure is incorrect for insert and extract element
1074  // instructions (because combining these often results in a shuffle),
1075  // but this cost is ignored (because insert and extract element
1076  // instructions are assigned a zero depth factor and are not really
1077  // fused in general).
1078  unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK);
1079 
1080  if (VCost > ICost + JCost)
1081  return false;
1082 
1083  // We don't want to fuse to a type that will be split, even
1084  // if the two input types will also be split and there is no other
1085  // associated cost.
1086  unsigned VParts1 = TTI->getNumberOfParts(VT1),
1087  VParts2 = TTI->getNumberOfParts(VT2);
1088  if (VParts1 > 1 || VParts2 > 1)
1089  return false;
1090  else if ((!VParts1 || !VParts2) && VCost == ICost + JCost)
1091  return false;
1092 
1093  CostSavings = ICost + JCost - VCost;
1094  }
1095 
1096  // The powi,ctlz,cttz intrinsics are special because only the first
1097  // argument is vectorized, the second arguments must be equal.
1098  CallInst *CI = dyn_cast<CallInst>(I);
1099  Function *FI;
1100  if (CI && (FI = CI->getCalledFunction())) {
1101  Intrinsic::ID IID = FI->getIntrinsicID();
1102  if (IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
1103  IID == Intrinsic::cttz) {
  // Equality of the second arguments is decided by comparing their SCEV
  // expressions.
  // NOTE(review): this branch returns unconditionally, so for these three
  // intrinsics the TTI-based intrinsic cost check below is never reached and
  // its `i == 1` special case for the same intrinsic IDs cannot trigger.
1104  Value *A1I = CI->getArgOperand(1),
1105  *A1J = cast<CallInst>(J)->getArgOperand(1);
1106  const SCEV *A1ISCEV = SE->getSCEV(A1I),
1107  *A1JSCEV = SE->getSCEV(A1J);
1108  return (A1ISCEV == A1JSCEV);
1109  }
1110 
1111  if (IID && TTI) {
  // NOTE(review): embedded line 1112 is absent; Tys, the list of argument
  // types filled in below, is presumably declared there.
  // Cost of the scalar intrinsic call I, using I's own argument types.
1113  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
1114  Tys.push_back(CI->getArgOperand(i)->getType());
1115  unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys);
1116 
  // Cost of the scalar intrinsic call J, using J's own argument types.
1117  Tys.clear();
1118  CallInst *CJ = cast<CallInst>(J);
1119  for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i)
1120  Tys.push_back(CJ->getArgOperand(i)->getType());
1121  unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys);
1122 
  // Cost of the fused call: every argument type becomes the vector type for
  // the corresponding pair of arguments.
1123  Tys.clear();
1124  assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
1125  "Intrinsic argument counts differ");
1126  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
1127  if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
1128  IID == Intrinsic::cttz) && i == 1)
1129  Tys.push_back(CI->getArgOperand(i)->getType());
1130  else
1131  Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(),
1132  CJ->getArgOperand(i)->getType()));
1133  }
1134 
1135  Type *RetTy = getVecTypeForPair(IT1, JT1);
1136  unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys);
1137 
1138  if (VCost > ICost + JCost)
1139  return false;
1140 
1141  // We don't want to fuse to a type that will be split, even
1142  // if the two input types will also be split and there is no other
1143  // associated cost.
1144  unsigned RetParts = TTI->getNumberOfParts(RetTy);
1145  if (RetParts > 1)
1146  return false;
1147  else if (!RetParts && VCost == ICost + JCost)
1148  return false;
1149 
  // Apply the same "no split types" rule to every vector-typed argument.
1150  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
1151  if (!Tys[i]->isVectorTy())
1152  continue;
1153 
1154  unsigned NumParts = TTI->getNumberOfParts(Tys[i]);
1155  if (NumParts > 1)
1156  return false;
1157  else if (!NumParts && VCost == ICost + JCost)
1158  return false;
1159  }
1160 
  // Note that this overwrites any CostSavings value computed earlier in this
  // function.
1161  CostSavings = ICost + JCost - VCost;
1162  }
1163  }
1164 
1165  return true;
1166  }
1167 
1168  // Figure out whether or not J uses I and update the users and write-set
1169  // structures associated with I. Specifically, Users represents the set of
1170  // instructions that depend on I. WriteSet represents the set
1171  // of memory locations that are dependent on I. If UpdateUsers is true,
1172  // and J uses I, then Users is updated to contain J and WriteSet is updated
1173  // to contain any memory locations to which J writes. The function returns
1174  // true if J uses I. By default, alias analysis is used to determine
1175  // whether J reads from memory that overlaps with a location in WriteSet.
1176  // If LoadMoveSet is not null, then it is a previously-computed map
1177  // where the key is the memory-based user instruction and the value is
1178  // the instruction to be compared with I. So, if LoadMoveSet is provided,
1179  // then the alias analysis is not used. This is necessary because this
1180  // function is called during the process of moving instructions during
1181  // vectorization and the results of the alias analysis are not stable during
1182  // that process.
1183  bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users,
1184  AliasSetTracker &WriteSet, Instruction *I,
1185  Instruction *J, bool UpdateUsers,
1186  DenseSet<ValuePair> *LoadMoveSetPairs) {
1187  bool UsesI = false;
1188 
1189  // This instruction may already be marked as a user due, for example, to
1190  // being a member of a selected pair.
1191  if (Users.count(J))
1192  UsesI = true;
1193 
1194  if (!UsesI)
1195  for (User::op_iterator JU = J->op_begin(), JE = J->op_end();
1196  JU != JE; ++JU) {
1197  Value *V = *JU;
1198  if (I == V || Users.count(V)) {
1199  UsesI = true;
1200  break;
1201  }
1202  }
1203  if (!UsesI && J->mayReadFromMemory()) {
1204  if (LoadMoveSetPairs) {
1205  UsesI = LoadMoveSetPairs->count(ValuePair(J, I));
1206  } else {
1207  for (AliasSetTracker::iterator W = WriteSet.begin(),
1208  WE = WriteSet.end(); W != WE; ++W) {
1209  if (W->aliasesUnknownInst(J, *AA)) {
1210  UsesI = true;
1211  break;
1212  }
1213  }
1214  }
1215  }
1216 
1217  if (UsesI && UpdateUsers) {
1218  if (J->mayWriteToMemory()) WriteSet.add(J);
1219  Users.insert(J);
1220  }
1221 
1222  return UsesI;
1223  }
1224 
1225  // This function iterates over all instruction pairs in the provided
1226  // basic block and collects all candidate pairs for vectorization.
1227  bool BBVectorize::getCandidatePairs(BasicBlock &BB,
1228  BasicBlock::iterator &Start,
1229  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1230  DenseSet<ValuePair> &FixedOrderPairs,
1231  DenseMap<ValuePair, int> &CandidatePairCostSavings,
1232  std::vector<Value *> &PairableInsts, bool NonPow2Len) {
1233  size_t TotalPairs = 0;
1234  BasicBlock::iterator E = BB.end();
1235  if (Start == E) return false;
1236 
1237  bool ShouldContinue = false, IAfterStart = false;
1238  for (BasicBlock::iterator I = Start++; I != E; ++I) {
1239  if (I == Start) IAfterStart = true;
1240 
1241  bool IsSimpleLoadStore;
1242  if (!isInstVectorizable(I, IsSimpleLoadStore)) continue;
1243 
1244  // Look for an instruction with which to pair instruction *I...
1246  AliasSetTracker WriteSet(*AA);
1247  if (I->mayWriteToMemory()) WriteSet.add(I);
1248 
1249  bool JAfterStart = IAfterStart;
1250  BasicBlock::iterator J = std::next(I);
1251  for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) {
1252  if (J == Start) JAfterStart = true;
1253 
1254  // Determine if J uses I, if so, exit the loop.
1255  bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep);
1256  if (Config.FastDep) {
1257  // Note: For this heuristic to be effective, independent operations
1258  // must tend to be intermixed. This is likely to be true from some
1259  // kinds of grouped loop unrolling (but not the generic LLVM pass),
1260  // but otherwise may require some kind of reordering pass.
1261 
1262  // When using fast dependency analysis,
1263  // stop searching after first use:
1264  if (UsesI) break;
1265  } else {
1266  if (UsesI) continue;
1267  }
1268 
1269  // J does not use I, and comes before the first use of I, so it can be
1270  // merged with I if the instructions are compatible.
1271  int CostSavings, FixedOrder;
1272  if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len,
1273  CostSavings, FixedOrder)) continue;
1274 
1275  // J is a candidate for merging with I.
1276  if (PairableInsts.empty() ||
1277  PairableInsts[PairableInsts.size()-1] != I) {
1278  PairableInsts.push_back(I);
1279  }
1280 
1281  CandidatePairs[I].push_back(J);
1282  ++TotalPairs;
1283  if (TTI)
1284  CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J),
1285  CostSavings));
1286 
1287  if (FixedOrder == 1)
1288  FixedOrderPairs.insert(ValuePair(I, J));
1289  else if (FixedOrder == -1)
1290  FixedOrderPairs.insert(ValuePair(J, I));
1291 
1292  // The next call to this function must start after the last instruction
1293  // selected during this invocation.
1294  if (JAfterStart) {
1295  Start = std::next(J);
1296  IAfterStart = JAfterStart = false;
1297  }
1298 
1299  DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "
1300  << *I << " <-> " << *J << " (cost savings: " <<
1301  CostSavings << ")\n");
1302 
1303  // If we have already found too many pairs, break here and this function
1304  // will be called again starting after the last instruction selected
1305  // during this invocation.
1306  if (PairableInsts.size() >= Config.MaxInsts ||
1307  TotalPairs >= Config.MaxPairs) {
1308  ShouldContinue = true;
1309  break;
1310  }
1311  }
1312 
1313  if (ShouldContinue)
1314  break;
1315  }
1316 
1317  DEBUG(dbgs() << "BBV: found " << PairableInsts.size()
1318  << " instructions with candidate pairs\n");
1319 
1320  return ShouldContinue;
1321  }
1322 
1323  // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that
1324  // it looks for pairs such that both members have an input which is an
1325  // output of PI or PJ.
1326  void BBVectorize::computePairsConnectedTo(
1327  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1328  DenseSet<ValuePair> &CandidatePairsSet,
1329  std::vector<Value *> &PairableInsts,
1330  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1331  DenseMap<VPPair, unsigned> &PairConnectionTypes,
1332  ValuePair P) {
1333  StoreInst *SI, *SJ;
1334 
1335  // For each possible pairing for this variable, look at the uses of
1336  // the first value...
1337  for (Value::user_iterator I = P.first->user_begin(),
1338  E = P.first->user_end();
1339  I != E; ++I) {
1340  User *UI = *I;
1341  if (isa<LoadInst>(UI)) {
1342  // A pair cannot be connected to a load because the load only takes one
1343  // operand (the address) and it is a scalar even after vectorization.
1344  continue;
1345  } else if ((SI = dyn_cast<StoreInst>(UI)) &&
1346  P.first == SI->getPointerOperand()) {
1347  // Similarly, a pair cannot be connected to a store through its
1348  // pointer operand.
1349  continue;
1350  }
1351 
1352  // For each use of the first variable, look for uses of the second
1353  // variable...
1354  for (User *UJ : P.second->users()) {
1355  if ((SJ = dyn_cast<StoreInst>(UJ)) &&
1356  P.second == SJ->getPointerOperand())
1357  continue;
1358 
1359  // Look for <I, J>:
1360  if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
1361  VPPair VP(P, ValuePair(UI, UJ));
1362  ConnectedPairs[VP.first].push_back(VP.second);
1363  PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect));
1364  }
1365 
1366  // Look for <J, I>:
1367  if (CandidatePairsSet.count(ValuePair(UJ, UI))) {
1368  VPPair VP(P, ValuePair(UJ, UI));
1369  ConnectedPairs[VP.first].push_back(VP.second);
1370  PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap));
1371  }
1372  }
1373 
1374  if (Config.SplatBreaksChain) continue;
1375  // Look for cases where just the first value in the pair is used by
1376  // both members of another pair (splatting).
1377  for (Value::user_iterator J = P.first->user_begin(); J != E; ++J) {
1378  User *UJ = *J;
1379  if ((SJ = dyn_cast<StoreInst>(UJ)) &&
1380  P.first == SJ->getPointerOperand())
1381  continue;
1382 
1383  if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
1384  VPPair VP(P, ValuePair(UI, UJ));
1385  ConnectedPairs[VP.first].push_back(VP.second);
1386  PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
1387  }
1388  }
1389  }
1390 
1391  if (Config.SplatBreaksChain) return;
1392  // Look for cases where just the second value in the pair is used by
1393  // both members of another pair (splatting).
1394  for (Value::user_iterator I = P.second->user_begin(),
1395  E = P.second->user_end();
1396  I != E; ++I) {
1397  User *UI = *I;
1398  if (isa<LoadInst>(UI))
1399  continue;
1400  else if ((SI = dyn_cast<StoreInst>(UI)) &&
1401  P.second == SI->getPointerOperand())
1402  continue;
1403 
1404  for (Value::user_iterator J = P.second->user_begin(); J != E; ++J) {
1405  User *UJ = *J;
1406  if ((SJ = dyn_cast<StoreInst>(UJ)) &&
1407  P.second == SJ->getPointerOperand())
1408  continue;
1409 
1410  if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
1411  VPPair VP(P, ValuePair(UI, UJ));
1412  ConnectedPairs[VP.first].push_back(VP.second);
1413  PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
1414  }
1415  }
1416  }
1417  }
1418 
1419  // This function figures out which pairs are connected. Two pairs are
1420  // connected if some output of the first pair forms an input to both members
1421  // of the second pair.
1422  void BBVectorize::computeConnectedPairs(
1423  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1424  DenseSet<ValuePair> &CandidatePairsSet,
1425  std::vector<Value *> &PairableInsts,
1426  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1427  DenseMap<VPPair, unsigned> &PairConnectionTypes) {
1428  for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
1429  PE = PairableInsts.end(); PI != PE; ++PI) {
1431  CandidatePairs.find(*PI);
1432  if (PP == CandidatePairs.end())
1433  continue;
1434 
1435  for (std::vector<Value *>::iterator P = PP->second.begin(),
1436  E = PP->second.end(); P != E; ++P)
1437  computePairsConnectedTo(CandidatePairs, CandidatePairsSet,
1438  PairableInsts, ConnectedPairs,
1439  PairConnectionTypes, ValuePair(*PI, *P));
1440  }
1441 
1442  DEBUG(size_t TotalPairs = 0;
1443  for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I =
1444  ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I)
1445  TotalPairs += I->second.size();
1446  dbgs() << "BBV: found " << TotalPairs
1447  << " pair connections.\n");
1448  }
1449 
1450  // This function builds a set of use tuples such that <A, B> is in the set
1451  // if B is in the use dag of A. If B is in the use dag of A, then B
1452  // depends on the output of A.
1453  void BBVectorize::buildDepMap(
1454  BasicBlock &BB,
1455  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1456  std::vector<Value *> &PairableInsts,
1457  DenseSet<ValuePair> &PairableInstUsers) {
1458  DenseSet<Value *> IsInPair;
1459  for (DenseMap<Value *, std::vector<Value *> >::iterator C =
1460  CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) {
1461  IsInPair.insert(C->first);
1462  IsInPair.insert(C->second.begin(), C->second.end());
1463  }
1464 
1465  // Iterate through the basic block, recording all users of each
1466  // pairable instruction.
1467 
1468  BasicBlock::iterator E = BB.end(), EL =
1469  BasicBlock::iterator(cast<Instruction>(PairableInsts.back()));
1470  for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
1471  if (IsInPair.find(I) == IsInPair.end()) continue;
1472 
1474  AliasSetTracker WriteSet(*AA);
1475  if (I->mayWriteToMemory()) WriteSet.add(I);
1476 
1477  for (BasicBlock::iterator J = std::next(I); J != E; ++J) {
1478  (void) trackUsesOfI(Users, WriteSet, I, J);
1479 
1480  if (J == EL)
1481  break;
1482  }
1483 
1484  for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end();
1485  U != E; ++U) {
1486  if (IsInPair.find(*U) == IsInPair.end()) continue;
1487  PairableInstUsers.insert(ValuePair(I, *U));
1488  }
1489 
1490  if (I == EL)
1491  break;
1492  }
1493  }
1494 
1495  // Returns true if an input to pair P is an output of pair Q and also an
1496  // input of pair Q is an output of pair P. If this is the case, then these
1497  // two pairs cannot be simultaneously fused.
1498  bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q,
1499  DenseSet<ValuePair> &PairableInstUsers,
1500  DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap,
1501  DenseSet<VPPair> *PairableInstUserPairSet) {
1502  // Two pairs are in conflict if they are mutual Users of eachother.
1503  bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) ||
1504  PairableInstUsers.count(ValuePair(P.first, Q.second)) ||
1505  PairableInstUsers.count(ValuePair(P.second, Q.first)) ||
1506  PairableInstUsers.count(ValuePair(P.second, Q.second));
1507  bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) ||
1508  PairableInstUsers.count(ValuePair(Q.first, P.second)) ||
1509  PairableInstUsers.count(ValuePair(Q.second, P.first)) ||
1510  PairableInstUsers.count(ValuePair(Q.second, P.second));
1511  if (PairableInstUserMap) {
1512  // FIXME: The expensive part of the cycle check is not so much the cycle
1513  // check itself but this edge insertion procedure. This needs some
1514  // profiling and probably a different data structure.
1515  if (PUsesQ) {
1516  if (PairableInstUserPairSet->insert(VPPair(Q, P)).second)
1517  (*PairableInstUserMap)[Q].push_back(P);
1518  }
1519  if (QUsesP) {
1520  if (PairableInstUserPairSet->insert(VPPair(P, Q)).second)
1521  (*PairableInstUserMap)[P].push_back(Q);
1522  }
1523  }
1524 
1525  return (QUsesP && PUsesQ);
1526  }
1527 
1528  // This function walks the use graph of current pairs to see if, starting
1529  // from P, the walk returns to P.
1530  bool BBVectorize::pairWillFormCycle(ValuePair P,
1531  DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
1532  DenseSet<ValuePair> &CurrentPairs) {
1533  DEBUG(if (DebugCycleCheck)
1534  dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> "
1535  << *P.second << "\n");
1536  // A lookup table of visisted pairs is kept because the PairableInstUserMap
1537  // contains non-direct associations.
1538  DenseSet<ValuePair> Visited;
1540  // General depth-first post-order traversal:
1541  Q.push_back(P);
1542  do {
1543  ValuePair QTop = Q.pop_back_val();
1544  Visited.insert(QTop);
1545 
1546  DEBUG(if (DebugCycleCheck)
1547  dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> "
1548  << *QTop.second << "\n");
1550  PairableInstUserMap.find(QTop);
1551  if (QQ == PairableInstUserMap.end())
1552  continue;
1553 
1554  for (std::vector<ValuePair>::iterator C = QQ->second.begin(),
1555  CE = QQ->second.end(); C != CE; ++C) {
1556  if (*C == P) {
1557  DEBUG(dbgs()
1558  << "BBV: rejected to prevent non-trivial cycle formation: "
1559  << QTop.first << " <-> " << C->second << "\n");
1560  return true;
1561  }
1562 
1563  if (CurrentPairs.count(*C) && !Visited.count(*C))
1564  Q.push_back(*C);
1565  }
1566  } while (!Q.empty());
1567 
1568  return false;
1569  }
1570 
1571  // This function builds the initial dag of connected pairs with the
1572  // pair J at the root.
1573  void BBVectorize::buildInitialDAGFor(
1574  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1575  DenseSet<ValuePair> &CandidatePairsSet,
1576  std::vector<Value *> &PairableInsts,
1577  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1578  DenseSet<ValuePair> &PairableInstUsers,
1579  DenseMap<Value *, Value *> &ChosenPairs,
1580  DenseMap<ValuePair, size_t> &DAG, ValuePair J) {
1581  // Each of these pairs is viewed as the root node of a DAG. The DAG
1582  // is then walked (depth-first). As this happens, we keep track of
1583  // the pairs that compose the DAG and the maximum depth of the DAG.
1585  // General depth-first post-order traversal:
1586  Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
1587  do {
1588  ValuePairWithDepth QTop = Q.back();
1589 
1590  // Push each child onto the queue:
1591  bool MoreChildren = false;
1592  size_t MaxChildDepth = QTop.second;
1594  ConnectedPairs.find(QTop.first);
1595  if (QQ != ConnectedPairs.end())
1596  for (std::vector<ValuePair>::iterator k = QQ->second.begin(),
1597  ke = QQ->second.end(); k != ke; ++k) {
1598  // Make sure that this child pair is still a candidate:
1599  if (CandidatePairsSet.count(*k)) {
1601  if (C == DAG.end()) {
1602  size_t d = getDepthFactor(k->first);
1603  Q.push_back(ValuePairWithDepth(*k, QTop.second+d));
1604  MoreChildren = true;
1605  } else {
1606  MaxChildDepth = std::max(MaxChildDepth, C->second);
1607  }
1608  }
1609  }
1610 
1611  if (!MoreChildren) {
1612  // Record the current pair as part of the DAG:
1613  DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth));
1614  Q.pop_back();
1615  }
1616  } while (!Q.empty());
1617  }
1618 
1619  // Given some initial dag, prune it by removing conflicting pairs (pairs
1620  // that cannot be simultaneously chosen for vectorization).
1621  void BBVectorize::pruneDAGFor(
1622  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1623  std::vector<Value *> &PairableInsts,
1624  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1625  DenseSet<ValuePair> &PairableInstUsers,
1626  DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
1627  DenseSet<VPPair> &PairableInstUserPairSet,
1628  DenseMap<Value *, Value *> &ChosenPairs,
1630  DenseSet<ValuePair> &PrunedDAG, ValuePair J,
1631  bool UseCycleCheck) {
1633  // General depth-first post-order traversal:
1634  Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
1635  do {
1636  ValuePairWithDepth QTop = Q.pop_back_val();
1637  PrunedDAG.insert(QTop.first);
1638 
1639  // Visit each child, pruning as necessary...
1642  ConnectedPairs.find(QTop.first);
1643  if (QQ == ConnectedPairs.end())
1644  continue;
1645 
1646  for (std::vector<ValuePair>::iterator K = QQ->second.begin(),
1647  KE = QQ->second.end(); K != KE; ++K) {
1649  if (C == DAG.end()) continue;
1650 
1651  // This child is in the DAG, now we need to make sure it is the
1652  // best of any conflicting children. There could be multiple
1653  // conflicting children, so first, determine if we're keeping
1654  // this child, then delete conflicting children as necessary.
1655 
1656  // It is also necessary to guard against pairing-induced
1657  // dependencies. Consider instructions a .. x .. y .. b
1658  // such that (a,b) are to be fused and (x,y) are to be fused
1659  // but a is an input to x and b is an output from y. This
1660  // means that y cannot be moved after b but x must be moved
1661  // after b for (a,b) to be fused. In other words, after
1662  // fusing (a,b) we have y .. a/b .. x where y is an input
1663  // to a/b and x is an output to a/b: x and y can no longer
1664  // be legally fused. To prevent this condition, we must
1665  // make sure that a child pair added to the DAG is not
1666  // both an input and output of an already-selected pair.
1667 
1668  // Pairing-induced dependencies can also form from more complicated
1669  // cycles. The pair vs. pair conflicts are easy to check, and so
1670  // that is done explicitly for "fast rejection", and because for
1671  // child vs. child conflicts, we may prefer to keep the current
1672  // pair in preference to the already-selected child.
1673  DenseSet<ValuePair> CurrentPairs;
1674 
1675  bool CanAdd = true;
1677  = BestChildren.begin(), E2 = BestChildren.end();
1678  C2 != E2; ++C2) {
1679  if (C2->first.first == C->first.first ||
1680  C2->first.first == C->first.second ||
1681  C2->first.second == C->first.first ||
1682  C2->first.second == C->first.second ||
1683  pairsConflict(C2->first, C->first, PairableInstUsers,
1684  UseCycleCheck ? &PairableInstUserMap : nullptr,
1685  UseCycleCheck ? &PairableInstUserPairSet
1686  : nullptr)) {
1687  if (C2->second >= C->second) {
1688  CanAdd = false;
1689  break;
1690  }
1691 
1692  CurrentPairs.insert(C2->first);
1693  }
1694  }
1695  if (!CanAdd) continue;
1696 
1697  // Even worse, this child could conflict with another node already
1698  // selected for the DAG. If that is the case, ignore this child.
1699  for (DenseSet<ValuePair>::iterator T = PrunedDAG.begin(),
1700  E2 = PrunedDAG.end(); T != E2; ++T) {
1701  if (T->first == C->first.first ||
1702  T->first == C->first.second ||
1703  T->second == C->first.first ||
1704  T->second == C->first.second ||
1705  pairsConflict(*T, C->first, PairableInstUsers,
1706  UseCycleCheck ? &PairableInstUserMap : nullptr,
1707  UseCycleCheck ? &PairableInstUserPairSet
1708  : nullptr)) {
1709  CanAdd = false;
1710  break;
1711  }
1712 
1713  CurrentPairs.insert(*T);
1714  }
1715  if (!CanAdd) continue;
1716 
1717  // And check the queue too...
1719  E2 = Q.end(); C2 != E2; ++C2) {
1720  if (C2->first.first == C->first.first ||
1721  C2->first.first == C->first.second ||
1722  C2->first.second == C->first.first ||
1723  C2->first.second == C->first.second ||
1724  pairsConflict(C2->first, C->first, PairableInstUsers,
1725  UseCycleCheck ? &PairableInstUserMap : nullptr,
1726  UseCycleCheck ? &PairableInstUserPairSet
1727  : nullptr)) {
1728  CanAdd = false;
1729  break;
1730  }
1731 
1732  CurrentPairs.insert(C2->first);
1733  }
1734  if (!CanAdd) continue;
1735 
1736  // Last but not least, check for a conflict with any of the
1737  // already-chosen pairs.
1739  ChosenPairs.begin(), E2 = ChosenPairs.end();
1740  C2 != E2; ++C2) {
1741  if (pairsConflict(*C2, C->first, PairableInstUsers,
1742  UseCycleCheck ? &PairableInstUserMap : nullptr,
1743  UseCycleCheck ? &PairableInstUserPairSet
1744  : nullptr)) {
1745  CanAdd = false;
1746  break;
1747  }
1748 
1749  CurrentPairs.insert(*C2);
1750  }
1751  if (!CanAdd) continue;
1752 
1753  // To check for non-trivial cycles formed by the addition of the
1754  // current pair we've formed a list of all relevant pairs, now use a
1755  // graph walk to check for a cycle. We start from the current pair and
1756  // walk the use dag to see if we again reach the current pair. If we
1757  // do, then the current pair is rejected.
1758 
1759  // FIXME: It may be more efficient to use a topological-ordering
1760  // algorithm to improve the cycle check. This should be investigated.
1761  if (UseCycleCheck &&
1762  pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs))
1763  continue;
1764 
1765  // This child can be added, but we may have chosen it in preference
1766  // to an already-selected child. Check for this here, and if a
1767  // conflict is found, then remove the previously-selected child
1768  // before adding this one in its place.
1770  = BestChildren.begin(); C2 != BestChildren.end();) {
1771  if (C2->first.first == C->first.first ||
1772  C2->first.first == C->first.second ||
1773  C2->first.second == C->first.first ||
1774  C2->first.second == C->first.second ||
1775  pairsConflict(C2->first, C->first, PairableInstUsers))
1776  C2 = BestChildren.erase(C2);
1777  else
1778  ++C2;
1779  }
1780 
1781  BestChildren.push_back(ValuePairWithDepth(C->first, C->second));
1782  }
1783 
1785  = BestChildren.begin(), E2 = BestChildren.end();
1786  C != E2; ++C) {
1787  size_t DepthF = getDepthFactor(C->first.first);
1788  Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF));
1789  }
1790  } while (!Q.empty());
1791  }
1792 
1793  // This function finds the best dag of mutually-compatible connected
1794  // pairs, given a root instruction II and its list of candidate partners JJ.
      //
      // For every J in JJ for which (II, J) is still in CandidatePairsSet:
      //   1. reject the pair if it conflicts with (or, when UseCycleCheck is
      //      set, would form a cycle with) the pairs already in ChosenPairs;
      //   2. build the initial DAG rooted at (II, J) and prune it;
      //   3. compute an "effective size" EffSize for the pruned DAG - a
      //      TTI-based cost/benefit estimate when TTI is available, otherwise
      //      the sum of the depth factors of the DAG's pairs;
      //   4. if the DAG passes the acceptance test at the bottom of the loop
      //      and EffSize is strictly larger than BestEffSize, record it.
      //
      // Outputs (by reference): BestDAG, BestMaxDepth and BestEffSize are only
      // written when a better DAG is found; otherwise they keep the values the
      // caller passed in.
      //
      // NOTE(review): this listing appears to have lost several source lines
      // (mostly declarations that start a statement). Each suspected gap is
      // flagged below; confirm against the upstream file before editing.
1795  void BBVectorize::findBestDAGFor(
1796  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1797  DenseSet<ValuePair> &CandidatePairsSet,
1798  DenseMap<ValuePair, int> &CandidatePairCostSavings,
1799  std::vector<Value *> &PairableInsts,
1800  DenseSet<ValuePair> &FixedOrderPairs,
1801  DenseMap<VPPair, unsigned> &PairConnectionTypes,
1802  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1803  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
1804  DenseSet<ValuePair> &PairableInstUsers,
1805  DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
1806  DenseSet<VPPair> &PairableInstUserPairSet,
1807  DenseMap<Value *, Value *> &ChosenPairs,
1808  DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
1809  int &BestEffSize, Value *II, std::vector<Value *>&JJ,
1810  bool UseCycleCheck) {
1811  for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end();
1812  J != JE; ++J) {
1813  ValuePair IJ(II, *J);
      // Pairs may have been removed from the set since JJ was built.
1814  if (!CandidatePairsSet.count(IJ))
1815  continue;
1816 
1817  // Before going any further, make sure that this pair does not
1818  // conflict with any already-selected pairs (see comment below
1819  // near the DAG pruning for more details).
      // ChosenPairSet collects the chosen pairs in set form so that it can be
      // handed to pairWillFormCycle() below.
1820  DenseSet<ValuePair> ChosenPairSet;
1821  bool DoesConflict = false;
1822  for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(),
1823  E = ChosenPairs.end(); C != E; ++C) {
1824  if (pairsConflict(*C, IJ, PairableInstUsers,
1825  UseCycleCheck ? &PairableInstUserMap : nullptr,
1826  UseCycleCheck ? &PairableInstUserPairSet : nullptr)) {
1827  DoesConflict = true;
1828  break;
1829  }
1830 
1831  ChosenPairSet.insert(*C);
1832  }
1833  if (DoesConflict) continue;
1834 
1835  if (UseCycleCheck &&
1836  pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet))
1837  continue;
1838 
      // NOTE(review): `DAG` is used from here on but its declaration is not
      // visible in this listing (a line seems to be missing). It is looked up
      // by ValuePair and yields a value stored in a size_t - presumably a
      // pair -> depth map; confirm.
1840  buildInitialDAGFor(CandidatePairs, CandidatePairsSet,
1841  PairableInsts, ConnectedPairs,
1842  PairableInstUsers, ChosenPairs, DAG, IJ);
1843 
1844  // Because we'll keep the child with the largest depth, the largest
1845  // depth is still the same in the unpruned DAG.
1846  size_t MaxDepth = DAG.lookup(IJ);
1847 
1848  DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {"
1849  << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
1850  MaxDepth << " and size " << DAG.size() << "\n");
1851 
1852  // At this point the DAG has been constructed, but, may contain
1853  // contradictory children (meaning that different children of
1854  // some dag node may be attempting to fuse the same instruction).
1855  // So now we walk the dag again, in the case of a conflict,
1856  // keep only the child with the largest depth. To break a tie,
1857  // favor the first child.
1858 
1859  DenseSet<ValuePair> PrunedDAG;
1860  pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs,
1861  PairableInstUsers, PairableInstUserMap,
1862  PairableInstUserPairSet,
1863  ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck);
1864 
      // EffSize is the figure of merit for this pruned DAG: savings are added,
      // estimated shuffle/insert/extract costs are subtracted.
1865  int EffSize = 0;
1866  if (TTI) {
      // Flat set of every instruction that is a member of some pruned pair;
      // used below to tell internal users from external ones.
1867  DenseSet<Value *> PrunedDAGInstrs;
1868  for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
1869  E = PrunedDAG.end(); S != E; ++S) {
1870  PrunedDAGInstrs.insert(S->first);
1871  PrunedDAGInstrs.insert(S->second);
1872  }
1873 
1874  // The set of pairs that have already contributed to the total cost.
1875  DenseSet<ValuePair> IncomingPairs;
1876 
1877  // If the cost model were perfect, this might not be necessary; but we
1878  // need to make sure that we don't get stuck vectorizing our own
1879  // shuffle chains.
1880  bool HasNontrivialInsts = false;
1881 
1882  // The node weights represent the cost savings associated with
1883  // fusing the pair of instructions.
1884  for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
1885  E = PrunedDAG.end(); S != E; ++S) {
      // Anything other than a shuffle/insertelement/extractelement counts as
      // a "non-trivial" instruction.
1886  if (!isa<ShuffleVectorInst>(S->first) &&
1887  !isa<InsertElementInst>(S->first) &&
1888  !isa<ExtractElementInst>(S->first))
1889  HasNontrivialInsts = true;
1890 
1891  bool FlipOrder = false;
1892 
      // Only pairs with a non-zero depth factor contribute their savings.
      // The find() result is dereferenced without an end() check, so every
      // such pair is expected to have an entry in CandidatePairCostSavings.
1893  if (getDepthFactor(S->first)) {
1894  int ESContrib = CandidatePairCostSavings.find(*S)->second;
1895  DEBUG(if (DebugPairSelection) dbgs() << "\tweight {"
1896  << *S->first << " <-> " << *S->second << "} = " <<
1897  ESContrib << "\n");
1898  EffSize += ESContrib;
1899  }
1900 
1901  // The edge weights contribute in a negative sense: they represent
1902  // the cost of shuffles.
      // NOTE(review): the declaration of `SS` (the left-hand side of this
      // find() call) is not visible in this listing; it is used below as an
      // iterator into ConnectedPairDeps.
1904  ConnectedPairDeps.find(*S);
1905  if (SS != ConnectedPairDeps.end()) {
      // First pass: count how the in-DAG dependencies of *S are connected.
1906  unsigned NumDepsDirect = 0, NumDepsSwap = 0;
1907  for (std::vector<ValuePair>::iterator T = SS->second.begin(),
1908  TE = SS->second.end(); T != TE; ++T) {
1909  VPPair Q(*S, *T);
1910  if (!PrunedDAG.count(Q.second))
1911  continue;
      // NOTE(review): the declaration of `R` is not visible in this listing;
      // it is used as an iterator into PairConnectionTypes.
1913  PairConnectionTypes.find(VPPair(Q.second, Q.first));
1914  assert(R != PairConnectionTypes.end() &&
1915  "Cannot find pair connection type");
1916  if (R->second == PairConnectionDirect)
1917  ++NumDepsDirect;
1918  else if (R->second == PairConnectionSwap)
1919  ++NumDepsSwap;
1920  }
1921 
1922  // If there are more swaps than direct connections, then
1923  // the pair order will be flipped during fusion. So the real
1924  // number of swaps is the minimum number.
      // A pair listed in FixedOrderPairs is never flipped; a pair whose
      // reverse is listed there is always flipped.
1925  FlipOrder = !FixedOrderPairs.count(*S) &&
1926  ((NumDepsSwap > NumDepsDirect) ||
1927  FixedOrderPairs.count(ValuePair(S->second, S->first)));
1928 
      // Second pass: charge a shuffle for every in-DAG dependency whose
      // connection type does not match the order chosen above (and for every
      // splat connection).
1929  for (std::vector<ValuePair>::iterator T = SS->second.begin(),
1930  TE = SS->second.end(); T != TE; ++T) {
1931  VPPair Q(*S, *T);
1932  if (!PrunedDAG.count(Q.second))
1933  continue;
      // NOTE(review): declaration of `R` missing from this listing (see above).
1935  PairConnectionTypes.find(VPPair(Q.second, Q.first));
1936  assert(R != PairConnectionTypes.end() &&
1937  "Cannot find pair connection type");
1938  Type *Ty1 = Q.second.first->getType(),
1939  *Ty2 = Q.second.second->getType();
1940  Type *VTy = getVecTypeForPair(Ty1, Ty2);
1941  if ((R->second == PairConnectionDirect && FlipOrder) ||
1942  (R->second == PairConnectionSwap && !FlipOrder) ||
1943  R->second == PairConnectionSplat) {
1944  int ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
1945  VTy, VTy);
1946 
      // For two-element vectors, take the cheaper of the generic cost and a
      // TTI shuffle-kind-specific cost.
      // NOTE(review): the argument lines of both getShuffleCost( calls below
      // are missing from this listing, so the shuffle kinds cannot be read
      // from here.
1947  if (VTy->getVectorNumElements() == 2) {
1948  if (R->second == PairConnectionSplat)
1949  ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
1951  else
1952  ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
1954  }
1955 
1956  DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
1957  *Q.second.first << " <-> " << *Q.second.second <<
1958  "} -> {" <<
1959  *S->first << " <-> " << *S->second << "} = " <<
1960  ESContrib << "\n");
1961  EffSize -= ESContrib;
1962  }
1963  }
1964  }
1965 
1966  // Compute the cost of outgoing edges. We assume that edges outgoing
1967  // to shuffles, inserts or extracts can be merged, and so contribute
1968  // no additional cost.
1969  if (!S->first->getType()->isVoidTy()) {
1970  Type *Ty1 = S->first->getType(),
1971  *Ty2 = S->second->getType();
1972  Type *VTy = getVecTypeForPair(Ty1, Ty2);
1973 
      // The first member needs extracting if it has at least one user that is
      // neither a single-input shuffle, an extractelement, nor a member of
      // the pruned DAG.
1974  bool NeedsExtraction = false;
1975  for (User *U : S->first->users()) {
1976  if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) {
1977  // Shuffle can be folded if it has no other input
1978  if (isa<UndefValue>(SI->getOperand(1)))
1979  continue;
1980  }
1981  if (isa<ExtractElementInst>(U))
1982  continue;
1983  if (PrunedDAGInstrs.count(U))
1984  continue;
1985  NeedsExtraction = true;
1986  break;
1987  }
1988 
1989  if (NeedsExtraction) {
1990  int ESContrib;
1991  if (Ty1->isVectorTy()) {
1992  ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
1993  Ty1, VTy);
      // NOTE(review): the argument line of this getShuffleCost( call is
      // missing from this listing.
1994  ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
1996  } else
1997  ESContrib = (int) TTI->getVectorInstrCost(
1998  Instruction::ExtractElement, VTy, 0);
1999 
2000  DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
2001  *S->first << "} = " << ESContrib << "\n");
2002  EffSize -= ESContrib;
2003  }
2004 
      // Same external-user test for the second member of the pair.
2005  NeedsExtraction = false;
2006  for (User *U : S->second->users()) {
2007  if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) {
2008  // Shuffle can be folded if it has no other input
2009  if (isa<UndefValue>(SI->getOperand(1)))
2010  continue;
2011  }
2012  if (isa<ExtractElementInst>(U))
2013  continue;
2014  if (PrunedDAGInstrs.count(U))
2015  continue;
2016  NeedsExtraction = true;
2017  break;
2018  }
2019 
2020  if (NeedsExtraction) {
2021  int ESContrib;
2022  if (Ty2->isVectorTy()) {
2023  ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
2024  Ty2, VTy);
      // NOTE(review): the first argument line of this getShuffleCost( call is
      // missing from this listing; only the trailing arguments are visible.
2025  ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
2027  Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2));
2028  } else
2029  ESContrib = (int) TTI->getVectorInstrCost(
2030  Instruction::ExtractElement, VTy, 1);
2031  DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
2032  *S->second << "} = " << ESContrib << "\n");
2033  EffSize -= ESContrib;
2034  }
2035  }
2036 
2037  // Compute the cost of incoming edges.
      // Loads and stores are excluded here. Operands are compared
      // position-by-position between the two members of the pair.
2038  if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) {
2039  Instruction *S1 = cast<Instruction>(S->first),
2040  *S2 = cast<Instruction>(S->second);
2041  for (unsigned o = 0; o < S1->getNumOperands(); ++o) {
2042  Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o);
2043 
2044  // Combining constants into vector constants (or small vector
2045  // constants into larger ones are assumed free).
2046  if (isa<Constant>(O1) && isa<Constant>(O2))
2047  continue;
2048 
2049  if (FlipOrder)
2050  std::swap(O1, O2);
2051 
2052  ValuePair VP = ValuePair(O1, O2);
2053  ValuePair VPR = ValuePair(O2, O1);
2054 
2055  // Internal edges are not handled here.
2056  if (PrunedDAG.count(VP) || PrunedDAG.count(VPR))
2057  continue;
2058 
2059  Type *Ty1 = O1->getType(),
2060  *Ty2 = O2->getType();
2061  Type *VTy = getVecTypeForPair(Ty1, Ty2);
2062 
2063  // Combining vector operations of the same type is also assumed
2064  // folded with other operations.
2065  if (Ty1 == Ty2) {
2066  // If both are insert elements, then both can be widened.
      // NOTE(review): the declaration of `IEO1` (first half of this
      // two-variable declaration) is missing from this listing.
2068  *IEO2 = dyn_cast<InsertElementInst>(O2);
2069  if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2))
2070  continue;
2071  // If both are extract elements, and both have the same input
2072  // type, then they can be replaced with a shuffle
      // NOTE(review): the declaration of `EIO1` is missing from this listing.
2074  *EIO2 = dyn_cast<ExtractElementInst>(O2);
2075  if (EIO1 && EIO2 &&
2076  EIO1->getOperand(0)->getType() ==
2077  EIO2->getOperand(0)->getType())
2078  continue;
2079  // If both are a shuffle with equal operand types and only two
2080  // unique operands, then they can be replaced with a single
2081  // shuffle
      // NOTE(review): the declaration of `SIO1` is missing from this listing.
2083  *SIO2 = dyn_cast<ShuffleVectorInst>(O2);
2084  if (SIO1 && SIO2 &&
2085  SIO1->getOperand(0)->getType() ==
2086  SIO2->getOperand(0)->getType()) {
2087  SmallSet<Value *, 4> SIOps;
2088  SIOps.insert(SIO1->getOperand(0));
2089  SIOps.insert(SIO1->getOperand(1));
2090  SIOps.insert(SIO2->getOperand(0));
2091  SIOps.insert(SIO2->getOperand(1));
2092  if (SIOps.size() <= 2)
2093  continue;
2094  }
2095  }
2096 
      // Every branch below either `continue`s or assigns ESContrib.
2097  int ESContrib;
2098  // This pair has already been formed.
2099  if (IncomingPairs.count(VP)) {
2100  continue;
2101  } else if (IncomingPairs.count(VPR)) {
      // The reversed pair was already charged: only a shuffle is charged here.
2102  ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
2103  VTy, VTy);
2104 
      // NOTE(review): the argument line of this getShuffleCost( call is
      // missing from this listing.
2105  if (VTy->getVectorNumElements() == 2)
2106  ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
2108  } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) {
      // Two scalars: one insertelement for each lane.
2109  ESContrib = (int) TTI->getVectorInstrCost(
2110  Instruction::InsertElement, VTy, 0);
2111  ESContrib += (int) TTI->getVectorInstrCost(
2112  Instruction::InsertElement, VTy, 1);
2113  } else if (!Ty1->isVectorTy()) {
2114  // O1 needs to be inserted into a vector of size O2, and then
2115  // both need to be shuffled together.
2116  ESContrib = (int) TTI->getVectorInstrCost(
2117  Instruction::InsertElement, Ty2, 0);
2118  ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
2119  VTy, Ty2);
2120  } else if (!Ty2->isVectorTy()) {
2121  // O2 needs to be inserted into a vector of size O1, and then
2122  // both need to be shuffled together.
2123  ESContrib = (int) TTI->getVectorInstrCost(
2124  Instruction::InsertElement, Ty1, 0);
2125  ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
2126  VTy, Ty1);
2127  } else {
      // Two vectors: one shuffle to combine them, plus a second shuffle when
      // their types differ.
2128  Type *TyBig = Ty1, *TySmall = Ty2;
2129  if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements())
2130  std::swap(TyBig, TySmall);
2131 
2132  ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
2133  VTy, TyBig);
2134  if (TyBig != TySmall)
2135  ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
2136  TyBig, TySmall);
2137  }
2138 
2139  DEBUG(if (DebugPairSelection) dbgs() << "\tcost {"
2140  << *O1 << " <-> " << *O2 << "} = " <<
2141  ESContrib << "\n");
2142  EffSize -= ESContrib;
      // Remember the operand pair so it is charged at most once.
2143  IncomingPairs.insert(VP);
2144  }
2145  }
2146  }
2147 
2148  if (!HasNontrivialInsts) {
2149  DEBUG(if (DebugPairSelection) dbgs() <<
2150  "\tNo non-trivial instructions in DAG;"
2151  " override to zero effective size\n");
2152  EffSize = 0;
2153  }
2154  } else {
      // Without target information, the effective size is the sum of the
      // depth factors of the first member of each pruned pair.
2155  for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
2156  E = PrunedDAG.end(); S != E; ++S)
2157  EffSize += (int) getDepthFactor(S->first);
2158  }
2159 
      // NOTE(review): the line that opens this debug statement is missing
      // from this listing; the closing ");" below belongs to it.
2161  dbgs() << "BBV: found pruned DAG for pair {"
2162  << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
2163  MaxDepth << " and size " << PrunedDAG.size() <<
2164  " (effective size: " << EffSize << ")\n");
      // Accept this DAG only if (a) TTI is in use and UseChainDepthWithTI is
      // off, or the DAG is at least Config.ReqChainDepth deep, and (b) its
      // effective size is positive and strictly better than the best so far
      // (so on a tie the earlier candidate is kept).
2165  if (((TTI && !UseChainDepthWithTI) ||
2166  MaxDepth >= Config.ReqChainDepth) &&
2167  EffSize > 0 && EffSize > BestEffSize) {
2168  BestMaxDepth = MaxDepth;
2169  BestEffSize = EffSize;
2170  BestDAG = PrunedDAG;
2171  }
2172  }
2173  }
2174 
2175  // Given the list of candidate pairs, this function selects those
2176  // that will be fused into vector instructions.
2177  void BBVectorize::choosePairs(
2178  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
2179  DenseSet<ValuePair> &CandidatePairsSet,
2180  DenseMap<ValuePair, int> &CandidatePairCostSavings,
2181  std::vector<Value *> &PairableInsts,
2182  DenseSet<ValuePair> &FixedOrderPairs,
2183  DenseMap<VPPair, unsigned> &PairConnectionTypes,
2184  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
2185  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
2186  DenseSet<ValuePair> &PairableInstUsers,
2187  DenseMap<Value *, Value *>& ChosenPairs) {
2188  bool UseCycleCheck =
2189  CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck;
2190 
2191  DenseMap<Value *, std::vector<Value *> > CandidatePairs2;
2192  for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(),
2193  E = CandidatePairsSet.end(); I != E; ++I) {
2194  std::vector<Value *> &JJ = CandidatePairs2[I->second];
2195  if (JJ.empty()) JJ.reserve(32);
2196  JJ.push_back(I->first);
2197  }
2198 
2199  DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap;
2200  DenseSet<VPPair> PairableInstUserPairSet;
2201  for (std::vector<Value *>::iterator I = PairableInsts.begin(),
2202  E = PairableInsts.end(); I != E; ++I) {
2203  // The number of possible pairings for this variable:
2204  size_t NumChoices = CandidatePairs.lookup(*I).size();
2205  if (!NumChoices) continue;
2206 
2207  std::vector<Value *> &JJ = CandidatePairs[*I];
2208 
2209  // The best pair to choose and its dag:
2210  size_t BestMaxDepth = 0;
2211  int BestEffSize = 0;
2212  DenseSet<ValuePair> BestDAG;
2213  findBestDAGFor(CandidatePairs, CandidatePairsSet,
2214  CandidatePairCostSavings,
2215  PairableInsts, FixedOrderPairs, PairConnectionTypes,
2216  ConnectedPairs, ConnectedPairDeps,
2217  PairableInstUsers, PairableInstUserMap,
2218  PairableInstUserPairSet, ChosenPairs,
2219  BestDAG, BestMaxDepth, BestEffSize, *I, JJ,
2220  UseCycleCheck);
2221 
2222  if (BestDAG.empty())
2223  continue;
2224 
2225  // A dag has been chosen (or not) at this point. If no dag was
2226  // chosen, then this instruction, I, cannot be paired (and is no longer
2227  // considered).
2228 
2229  DEBUG(dbgs() << "BBV: selected pairs in the best DAG for: "
2230  << *cast<Instruction>(*I) << "\n");
2231 
2232  for (DenseSet<ValuePair>::iterator S = BestDAG.begin(),
2233  SE2 = BestDAG.end(); S != SE2; ++S) {
2234  // Insert the members of this dag into the list of chosen pairs.
2235  ChosenPairs.insert(ValuePair(S->first, S->second));
2236  DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " <<
2237  *S->second << "\n");
2238 
2239  // Remove all candidate pairs that have values in the chosen dag.
2240  std::vector<Value *> &KK = CandidatePairs[S->first];
2241  for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end();
2242  K != KE; ++K) {
2243  if (*K == S->second)
2244  continue;
2245 
2246  CandidatePairsSet.erase(ValuePair(S->first, *K));
2247  }
2248 
2249  std::vector<Value *> &LL = CandidatePairs2[S->second];
2250  for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end();
2251  L != LE; ++L) {
2252  if (*L == S->first)
2253  continue;
2254 
2255  CandidatePairsSet.erase(ValuePair(*L, S->second));
2256  }
2257 
2258  std::vector<Value *> &MM = CandidatePairs[S->second];
2259  for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end();
2260  M != ME; ++M) {
2261  assert(*M != S->first && "Flipped pair in candidate list?");
2262  CandidatePairsSet.erase(ValuePair(S->second, *M));
2263  }
2264 
2265  std::vector<Value *> &NN = CandidatePairs2[S->first];
2266  for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end();
2267  N != NE; ++N) {
2268  assert(*N != S->second && "Flipped pair in candidate list?");
2269  CandidatePairsSet.erase(ValuePair(*N, S->first));
2270  }
2271  }
2272  }
2273 
2274  DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n");
2275  }
2276 
2277  std::string getReplacementName(Instruction *I, bool IsInput, unsigned o,
2278  unsigned n = 0) {
2279  if (!I->hasName())
2280  return "";
2281 
2282  return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) +
2283  (n > 0 ? "." + utostr(n) : "")).str();
2284  }
2285 
2286  // Returns the value that is to be used as the pointer input to the vector
2287  // instruction that fuses I with J.
2288  Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context,
2289  Instruction *I, Instruction *J, unsigned o) {
2290  Value *IPtr, *JPtr;
2291  unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
2292  int64_t OffsetInElmts;
2293 
2294  // Note: the analysis might fail here, that is why the pair order has
2295  // been precomputed (OffsetInElmts must be unused here).
2296  (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
2297  IAddressSpace, JAddressSpace,
2298  OffsetInElmts, false);
2299 
2300  // The pointer value is taken to be the one with the lowest offset.
2301  Value *VPtr = IPtr;
2302 
2303  Type *ArgTypeI = IPtr->getType()->getPointerElementType();
2304  Type *ArgTypeJ = JPtr->getType()->getPointerElementType();
2305  Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2306  Type *VArgPtrType
2307  = PointerType::get(VArgType,
2308  IPtr->getType()->getPointerAddressSpace());
2309  return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
2310  /* insert before */ I);
2311  }
2312 
2313  void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
2314  unsigned MaskOffset, unsigned NumInElem,
2315  unsigned NumInElem1, unsigned IdxOffset,
2316  std::vector<Constant*> &Mask) {
2317  unsigned NumElem1 = J->getType()->getVectorNumElements();
2318  for (unsigned v = 0; v < NumElem1; ++v) {
2319  int m = cast<ShuffleVectorInst>(J)->getMaskValue(v);
2320  if (m < 0) {
2321  Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context));
2322  } else {
2323  unsigned mm = m + (int) IdxOffset;
2324  if (m >= (int) NumInElem1)
2325  mm += (int) NumInElem;
2326 
2327  Mask[v+MaskOffset] =
2328  ConstantInt::get(Type::getInt32Ty(Context), mm);
2329  }
2330  }
2331  }
2332 
2333  // Returns the value that is to be used as the vector-shuffle mask to the
2334  // vector instruction that fuses I with J.
2335  Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context,
2336  Instruction *I, Instruction *J) {
2337  // This is the shuffle mask. We need to append the second
2338  // mask to the first, and the numbers need to be adjusted.
2339 
2340  Type *ArgTypeI = I->getType();
2341  Type *ArgTypeJ = J->getType();
2342  Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2343 
2344  unsigned NumElemI = ArgTypeI->getVectorNumElements();
2345 
2346  // Get the total number of elements in the fused vector type.
2347  // By definition, this must equal the number of elements in
2348  // the final mask.
2349  unsigned NumElem = VArgType->getVectorNumElements();
2350  std::vector<Constant*> Mask(NumElem);
2351 
2352  Type *OpTypeI = I->getOperand(0)->getType();
2353  unsigned NumInElemI = OpTypeI->getVectorNumElements();
2354  Type *OpTypeJ = J->getOperand(0)->getType();
2355  unsigned NumInElemJ = OpTypeJ->getVectorNumElements();
2356 
2357  // The fused vector will be:
2358  // -----------------------------------------------------
2359  // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ |
2360  // -----------------------------------------------------
2361  // from which we'll extract NumElem total elements (where the first NumElemI
2362  // of them come from the mask in I and the remainder come from the mask
2363  // in J.
2364 
2365  // For the mask from the first pair...
2366  fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI,
2367  0, Mask);
2368 
2369  // For the mask from the second pair...
2370  fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ,
2371  NumInElemI, Mask);
2372 
2373  return ConstantVector::get(Mask);
2374  }
2375 
2376  bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I,
2377  Instruction *J, unsigned o, Value *&LOp,
2378  unsigned numElemL,
2379  Type *ArgTypeL, Type *ArgTypeH,
2380  bool IBeforeJ, unsigned IdxOff) {
2381  bool ExpandedIEChain = false;
2382  if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) {
2383  // If we have a pure insertelement chain, then this can be rewritten
2384  // into a chain that directly builds the larger type.
2385  if (isPureIEChain(LIE)) {
2386  SmallVector<Value *, 8> VectElemts(numElemL,
2387  UndefValue::get(ArgTypeL->getScalarType()));
2388  InsertElementInst *LIENext = LIE;
2389  do {
2390  unsigned Idx =
2391  cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue();
2392  VectElemts[Idx] = LIENext->getOperand(1);
2393  } while ((LIENext =
2394  dyn_cast<InsertElementInst>(LIENext->getOperand(0))));
2395 
2396  LIENext = nullptr;
2397  Value *LIEPrev = UndefValue::get(ArgTypeH);
2398  for (unsigned i = 0; i < numElemL; ++i) {
2399  if (isa<UndefValue>(VectElemts[i])) continue;
2400  LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i],
2402  i + IdxOff),
2403  getReplacementName(IBeforeJ ? I : J,
2404  true, o, i+1));
2405  LIENext->insertBefore(IBeforeJ ? J : I);
2406  LIEPrev = LIENext;
2407  }
2408 
2409  LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH);
2410  ExpandedIEChain = true;
2411  }
2412  }
2413 
2414  return ExpandedIEChain;
2415  }
2416 
2417  static unsigned getNumScalarElements(Type *Ty) {
2418  if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
2419  return VecTy->getNumElements();
2420  return 1;
2421  }
2422 
2423  // Returns the value to be used as the specified operand of the vector
2424  // instruction that fuses I with J.
2425  Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
2426  Instruction *J, unsigned o, bool IBeforeJ) {
2427  Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
2428  Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
2429 
2430  // Compute the fused vector type for this operand
2431  Type *ArgTypeI = I->getOperand(o)->getType();
2432  Type *ArgTypeJ = J->getOperand(o)->getType();
2433  VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2434 
2435  Instruction *L = I, *H = J;
2436  Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ;
2437 
2438  unsigned numElemL = getNumScalarElements(ArgTypeL);
2439  unsigned numElemH = getNumScalarElements(ArgTypeH);
2440 
2441  Value *LOp = L->getOperand(o);
2442  Value *HOp = H->getOperand(o);
2443  unsigned numElem = VArgType->getNumElements();
2444 
2445  // First, we check if we can reuse the "original" vector outputs (if these
2446  // exist). We might need a shuffle.
2451 
2452  // FIXME: If we're fusing shuffle instructions, then we can't apply this
2453  // optimization. The input vectors to the shuffle might be a different
2454  // length from the shuffle outputs. Unfortunately, the replacement
2455  // shuffle mask has already been formed, and the mask entries are sensitive
2456  // to the sizes of the inputs.
2457  bool IsSizeChangeShuffle =
2458  isa<ShuffleVectorInst>(L) &&
2459  (LOp->getType() != L->getType() || HOp->getType() != H->getType());
2460 
2461  if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) {
2462  // We can have at most two unique vector inputs.
2463  bool CanUseInputs = true;
2464  Value *I1, *I2 = nullptr;
2465  if (LEE) {
2466  I1 = LEE->getOperand(0);
2467  } else {
2468  I1 = LSV->getOperand(0);
2469  I2 = LSV->getOperand(1);
2470  if (I2 == I1 || isa<UndefValue>(I2))
2471  I2 = nullptr;
2472  }
2473 
2474  if (HEE) {
2475  Value *I3 = HEE->getOperand(0);
2476  if (!I2 && I3 != I1)
2477  I2 = I3;
2478  else if (I3 != I1 && I3 != I2)
2479  CanUseInputs = false;
2480  } else {
2481  Value *I3 = HSV->getOperand(0);
2482  if (!I2 && I3 != I1)
2483  I2 = I3;
2484  else if (I3 != I1 && I3 != I2)
2485  CanUseInputs = false;
2486 
2487  if (CanUseInputs) {
2488  Value *I4 = HSV->getOperand(1);
2489  if (!isa<UndefValue>(I4)) {
2490  if (!I2 && I4 != I1)
2491  I2 = I4;
2492  else if (I4 != I1 && I4 != I2)
2493  CanUseInputs = false;
2494  }
2495  }
2496  }
2497 
2498  if (CanUseInputs) {
2499  unsigned LOpElem =
2500  cast<Instruction>(LOp)->getOperand(0)->getType()
2501  ->getVectorNumElements();
2502 
2503  unsigned HOpElem =
2504  cast<Instruction>(HOp)->getOperand(0)->getType()
2505  ->getVectorNumElements();
2506 
2507  // We have one or two input vectors. We need to map each index of the
2508  // operands to the index of the original vector.
2509  SmallVector<std::pair<int, int>, 8> II(numElem);
2510  for (unsigned i = 0; i < numElemL; ++i) {
2511  int Idx, INum;
2512  if (LEE) {
2513  Idx =
2514  cast<ConstantInt>(LEE->getOperand(1))->getSExtValue();
2515  INum = LEE->getOperand(0) == I1 ? 0 : 1;
2516  } else {
2517  Idx = LSV->getMaskValue(i);
2518  if (Idx < (int) LOpElem) {
2519  INum = LSV->getOperand(0) == I1 ? 0 : 1;
2520  } else {
2521  Idx -= LOpElem;
2522  INum = LSV->getOperand(1) == I1 ? 0 : 1;
2523  }
2524  }
2525 
2526  II[i] = std::pair<int, int>(Idx, INum);
2527  }
2528  for (unsigned i = 0; i < numElemH; ++i) {
2529  int Idx, INum;
2530  if (HEE) {
2531  Idx =
2532  cast<ConstantInt>(HEE->getOperand(1))->getSExtValue();
2533  INum = HEE->getOperand(0) == I1 ? 0 : 1;
2534  } else {
2535  Idx = HSV->getMaskValue(i);
2536  if (Idx < (int) HOpElem) {
2537  INum = HSV->getOperand(0) == I1 ? 0 : 1;
2538  } else {
2539  Idx -= HOpElem;
2540  INum = HSV->getOperand(1) == I1 ? 0 : 1;
2541  }
2542  }
2543 
2544  II[i + numElemL] = std::pair<int, int>(Idx, INum);
2545  }
2546 
2547  // We now have an array which tells us from which index of which
2548  // input vector each element of the operand comes.
2549  VectorType *I1T = cast<VectorType>(I1->getType());
2550  unsigned I1Elem = I1T->getNumElements();
2551 
2552  if (!I2) {
2553  // In this case there is only one underlying vector input. Check for
2554  // the trivial case where we can use the input directly.
2555  if (I1Elem == numElem) {
2556  bool ElemInOrder = true;
2557  for (unsigned i = 0; i < numElem; ++i) {
2558  if (II[i].first != (int) i && II[i].first != -1) {
2559  ElemInOrder = false;
2560  break;
2561  }
2562  }
2563 
2564  if (ElemInOrder)
2565  return I1;
2566  }
2567 
2568  // A shuffle is needed.
2569  std::vector<Constant *> Mask(numElem);
2570  for (unsigned i = 0; i < numElem; ++i) {
2571  int Idx = II[i].first;
2572  if (Idx == -1)
2573  Mask[i] = UndefValue::get(Type::getInt32Ty(Context));
2574  else
2575  Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
2576  }
2577 
2578  Instruction *S =
2579  new ShuffleVectorInst(I1, UndefValue::get(I1T),
2580  ConstantVector::get(Mask),
2581  getReplacementName(IBeforeJ ? I : J,
2582  true, o));
2583  S->insertBefore(IBeforeJ ? J : I);
2584  return S;
2585  }
2586 
2587  VectorType *I2T = cast<VectorType>(I2->getType());
2588  unsigned I2Elem = I2T->getNumElements();
2589 
2590  // This input comes from two distinct vectors. The first step is to
2591  // make sure that both vectors are the same length. If not, the
2592  // smaller one will need to grow before they can be shuffled together.
2593  if (I1Elem < I2Elem) {
2594  std::vector<Constant *> Mask(I2Elem);
2595  unsigned v = 0;
2596  for (; v < I1Elem; ++v)
2597  Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2598  for (; v < I2Elem; ++v)
2599  Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2600 
2601  Instruction *NewI1 =
2602  new ShuffleVectorInst(I1, UndefValue::get(I1T),
2603  ConstantVector::get(Mask),
2604  getReplacementName(IBeforeJ ? I : J,
2605  true, o, 1));
2606  NewI1->insertBefore(IBeforeJ ? J : I);
2607  I1 = NewI1;
2608  I1Elem = I2Elem;
2609  } else if (I1Elem > I2Elem) {
2610  std::vector<Constant *> Mask(I1Elem);
2611  unsigned v = 0;
2612  for (; v < I2Elem; ++v)
2613  Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2614  for (; v < I1Elem; ++v)
2615  Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2616 
2617  Instruction *NewI2 =
2618  new ShuffleVectorInst(I2, UndefValue::get(I2T),
2619  ConstantVector::get(Mask),
2620  getReplacementName(IBeforeJ ? I : J,
2621  true, o, 1));
2622  NewI2->insertBefore(IBeforeJ ? J : I);
2623  I2 = NewI2;
2624  }
2625 
2626  // Now that both I1 and I2 are the same length we can shuffle them
2627  // together (and use the result).
2628  std::vector<Constant *> Mask(numElem);
2629  for (unsigned v = 0; v < numElem; ++v) {
2630  if (II[v].first == -1) {
2631  Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2632  } else {
2633  int Idx = II[v].first + II[v].second * I1Elem;
2634  Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
2635  }
2636  }
2637 
2638  Instruction *NewOp =
2639  new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask),
2640  getReplacementName(IBeforeJ ? I : J, true, o));
2641  NewOp->insertBefore(IBeforeJ ? J : I);
2642  return NewOp;
2643  }
2644  }
2645 
2646  Type *ArgType = ArgTypeL;
2647  if (numElemL < numElemH) {
2648  if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH,
2649  ArgTypeL, VArgType, IBeforeJ, 1)) {
2650  // This is another short-circuit case: we're combining a scalar into
2651  // a vector that is formed by an IE chain. We've just expanded the IE
2652  // chain, now insert the scalar and we're done.
2653 
2654  Instruction *S = InsertElementInst::Create(HOp, LOp, CV0,
2655  getReplacementName(IBeforeJ ? I : J, true, o));
2656  S->insertBefore(IBeforeJ ? J : I);
2657  return S;
2658  } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL,
2659  ArgTypeH, IBeforeJ)) {
2660  // The two vector inputs to the shuffle must be the same length,
2661  // so extend the smaller vector to be the same length as the larger one.
2662  Instruction *NLOp;
2663  if (numElemL > 1) {
2664 
2665  std::vector<Constant *> Mask(numElemH);
2666  unsigned v = 0;
2667  for (; v < numElemL; ++v)
2668  Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2669  for (; v < numElemH; ++v)
2670  Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2671 
2672  NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
2673  ConstantVector::get(Mask),
2674  getReplacementName(IBeforeJ ? I : J,
2675  true, o, 1));
2676  } else {
2677  NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0,
2678  getReplacementName(IBeforeJ ? I : J,
2679  true, o, 1));
2680  }
2681 
2682  NLOp->insertBefore(IBeforeJ ? J : I);
2683  LOp = NLOp;
2684  }
2685 
2686  ArgType = ArgTypeH;
2687  } else if (numElemL > numElemH) {
2688  if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL,
2689  ArgTypeH, VArgType, IBeforeJ)) {
2690  Instruction *S =
2691  InsertElementInst::Create(LOp, HOp,
2693  numElemL),
2694  getReplacementName(IBeforeJ ? I : J,
2695  true, o));
2696  S->insertBefore(IBeforeJ ? J : I);
2697  return S;
2698  } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH,
2699  ArgTypeL, IBeforeJ)) {
2700  Instruction *NHOp;
2701  if (numElemH > 1) {
2702  std::vector<Constant *> Mask(numElemL);
2703  unsigned v = 0;
2704  for (; v < numElemH; ++v)
2705  Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2706  for (; v < numElemL; ++v)
2707  Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2708 
2709  NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
2710  ConstantVector::get(Mask),
2711  getReplacementName(IBeforeJ ? I : J,
2712  true, o, 1));
2713  } else {
2714  NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0,
2715  getReplacementName(IBeforeJ ? I : J,
2716  true, o, 1));
2717  }
2718 
2719  NHOp->insertBefore(IBeforeJ ? J : I);
2720  HOp = NHOp;
2721  }
2722  }
2723 
2724  if (ArgType->isVectorTy()) {
2725  unsigned numElem = VArgType->getVectorNumElements();
2726  std::vector<Constant*> Mask(numElem);
2727  for (unsigned v = 0; v < numElem; ++v) {
2728  unsigned Idx = v;
2729  // If the low vector was expanded, we need to skip the extra
2730  // undefined entries.
2731  if (v >= numElemL && numElemH > numElemL)
2732  Idx += (numElemH - numElemL);
2733  Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
2734  }
2735 
2736  Instruction *BV = new ShuffleVectorInst(LOp, HOp,
2737  ConstantVector::get(Mask),
2738  getReplacementName(IBeforeJ ? I : J, true, o));
2739  BV->insertBefore(IBeforeJ ? J : I);
2740  return BV;
2741  }
2742 
2744  UndefValue::get(VArgType), LOp, CV0,
2745  getReplacementName(IBeforeJ ? I : J,
2746  true, o, 1));
2747  BV1->insertBefore(IBeforeJ ? J : I);
2748  Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1,
2749  getReplacementName(IBeforeJ ? I : J,
2750  true, o, 2));
2751  BV2->insertBefore(IBeforeJ ? J : I);
2752  return BV2;
2753  }
2754 
2755  // This function creates an array of values that will be used as the inputs
2756  // to the vector instruction that fuses I with J.
      //
      // ReplacedOperands is written by operand index, so the caller must size it
      // to at least I->getNumOperands() entries before the call. IBeforeJ is
      // forwarded unchanged to getReplacementInput.
      //
      // Each operand index o is handled by the first matching case:
      //  - any operand of a load, or operand 1 of a store: the result of
      //    getReplacementPointerInput;
      //  - the last operand of a call: the declaration of the same intrinsic ID,
      //    requested for the vector type that getVecTypeForPair derives from the
      //    result types of I and J;
      //  - operand 1 of powi/ctlz/cttz: I's own operand, reused as is;
      //  - the last operand of a shufflevector: the result of
      //    getReplacementShuffleMask;
      //  - anything else: the result of getReplacementInput.
2757  void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
2758  Instruction *I, Instruction *J,
2759  SmallVectorImpl<Value *> &ReplacedOperands,
2760  bool IBeforeJ) {
2761  unsigned NumOperands = I->getNumOperands();
2762 
      // p only counts iterations; o is the operand index and runs from
      // NumOperands-1 down to 0.
2763  for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) {
2764  // Iterate backward so that we look at the store pointer
2765  // first and know whether or not we need to flip the inputs.
2766 
2767  if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) {
2768  // This is the pointer for a load/store instruction.
2769  ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o);
2770  continue;
2771  } else if (isa<CallInst>(I)) {
2772  Function *F = cast<CallInst>(I)->getCalledFunction();
      // NOTE(review): F is dereferenced without a null check; presumably only
      // direct calls to intrinsics are ever paired -- confirm against the
      // pairing checks.
2773  Intrinsic::ID IID = F->getIntrinsicID();
2774  if (o == NumOperands-1) {
      // The last operand of the call is replaced by a function declaration
      // obtained for the fused (vector) type.
2775  BasicBlock &BB = *I->getParent();
2776 
2777  Module *M = BB.getParent()->getParent();
2778  Type *ArgTypeI = I->getType();
2779  Type *ArgTypeJ = J->getType();
2780  Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2781 
2782  ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType);
2783  continue;
2784  } else if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
2785  IID == Intrinsic::cttz) && o == 1) {
2786  // The second argument of powi/ctlz/cttz is a single integer/constant
2787  // and we've already checked that both arguments are equal.
2788  // As a result, we just keep I's second argument.
2789  ReplacedOperands[o] = I->getOperand(o);
2790  continue;
2791  }
2792  } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) {
      // The last operand of a shufflevector is its mask.
2793  ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J);
2794  continue;
2795  }
2796 
      // Every operand not handled above (including the remaining call
      // arguments) is combined by the generic helper.
2797  ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ);
2798  }
2799  }
2800 
2801  // This function creates two values that represent the outputs of the
2802  // original I and J instructions. These are generally vector shuffles
2803  // or extracts. In many cases, these will end up being unused and, thus,
2804  // eliminated by later passes.
      //
      // K is the fused instruction. For a store pair nothing is extracted:
      // AA->replaceWithNewValue is invoked for both I and J with K, and K1, K2
      // and InsertionPt are left untouched. Otherwise K1 receives the part of K
      // that stands in for I and K2 the part that stands in for J; K1 is inserted
      // directly after K, K2 directly after K1, and InsertionPt is set to K2.
2805  void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
2806  Instruction *J, Instruction *K,
2807  Instruction *&InsertionPt,
2808  Instruction *&K1, Instruction *&K2) {
2809  if (isa<StoreInst>(I)) {
2810  AA->replaceWithNewValue(I, K);
2811  AA->replaceWithNewValue(J, K);
2812  } else {
2813  Type *IType = I->getType();
2814  Type *JType = J->getType();
2815 
2816  VectorType *VType = getVecTypeForPair(IType, JType);
2817  unsigned numElem = VType->getNumElements();
2818 
2819  unsigned numElemI = getNumScalarElements(IType);
2820  unsigned numElemJ = getNumScalarElements(JType);
2821 
2822  if (IType->isVectorTy()) {
      // I produced a vector: select elements 0 .. numElemI-1 of K.
      // NOTE(review): Mask2 is filled in this branch but never read here.
2823  std::vector<Constant*> Mask1(numElemI), Mask2(numElemI);
2824  for (unsigned v = 0; v < numElemI; ++v) {
2825  Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2826  Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v);
2827  }
2828 
2829  K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
2830  ConstantVector::get( Mask1),
2831  getReplacementName(K, false, 1));
2832  } else {
      // I produced a scalar: it is element 0 of K.
2833  Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
2834  K1 = ExtractElementInst::Create(K, CV0,
2835  getReplacementName(K, false, 1));
2836  }
2837 
2838  if (JType->isVectorTy()) {
      // J produced a vector: select elements numElemI .. numElemI+numElemJ-1
      // of K.
      // NOTE(review): Mask1 is filled in this branch but never read here.
2839  std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ);
2840  for (unsigned v = 0; v < numElemJ; ++v) {
2841  Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2842  Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v);
2843  }
2844 
2845  K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
2846  ConstantVector::get( Mask2),
2847  getReplacementName(K, false, 2));
2848  } else {
      // J produced a scalar: it is the last element of K.
2849  Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1);
2850  K2 = ExtractElementInst::Create(K, CV1,
2851  getReplacementName(K, false, 2));
2852  }
2853 
2854  K1->insertAfter(K);
2855  K2->insertAfter(K1);
2856  InsertionPt = K2;
2857  }
2858  }
2859 
2860  // Returns true if all uses of the instruction I (including pairing-induced
      // uses) can be moved after J, i.e. if J itself is not found to be among
      // those uses. Nothing is moved here; the block is only scanned.
2861  bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB,
2862  DenseSet<ValuePair> &LoadMoveSetPairs,
2863  Instruction *I, Instruction *J) {
2864  // Skip to the first instruction past I.
2865  BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
2866 
      // NOTE(review): the declaration of Users (listing line 2867) is absent
      // from this copy of the file.
2868  AliasSetTracker WriteSet(*AA);
2869  if (I->mayWriteToMemory()) WriteSet.add(I);
2870 
      // Feed every instruction strictly between I and J to trackUsesOfI; the
      // individual results are discarded, only the accumulated Users/WriteSet
      // state matters for the final query on J.
2871  for (; cast<Instruction>(L) != J; ++L)
2872  (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs);
2873 
2874  assert(cast<Instruction>(L) == J &&
2875  "Tracking has not proceeded far enough to check for dependencies");
2876  // If J is now in the use set of I, then trackUsesOfI will return true
2877  // and we have a dependency cycle (and the fusing operation must abort).
2878  return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs);
2879  }
2880 
2881  // Move all uses of the instruction I (including pairing-induced uses) after
      // J. Every instruction between I and J for which trackUsesOfI returns true
      // is unlinked and re-inserted after InsertionPt; InsertionPt is then
      // advanced to the moved instruction, so moved instructions keep their
      // relative order.
2882  void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB,
2883  DenseSet<ValuePair> &LoadMoveSetPairs,
2884  Instruction *&InsertionPt,
2885  Instruction *I, Instruction *J) {
2886  // Skip to the first instruction past I.
2887  BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
2888 
      // NOTE(review): the declaration of Users (listing line 2889) is absent
      // from this copy of the file.
2890  AliasSetTracker WriteSet(*AA);
2891  if (I->mayWriteToMemory()) WriteSet.add(I);
2892 
2893  for (; cast<Instruction>(L) != J;) {
2894  if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) {
2895  // Move this instruction
      // The iterator is stepped past the instruction before it is unlinked.
2896  Instruction *InstToMove = L; ++L;
2897 
2898  DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<
2899  " to after " << *InsertionPt << "\n");
2900  InstToMove->removeFromParent();
2901  InstToMove->insertAfter(InsertionPt);
2902  InsertionPt = InstToMove;
2903  } else {
2904  ++L;
2905  }
2906  }
2907  }
2908 
2909  // Collect all load instructions that are in the move set of a given first
2910  // pair member. These loads depend on the first instruction, I, and so need
2911  // to be moved after J (the second instruction) when the pair is fused.
      //
      // Every instruction after I in BB for which trackUsesOfI returns true and
      // which may read from memory is recorded twice: I is appended to
      // LoadMoveSet[L], and the pair (L, I) is added to LoadMoveSetPairs.
      // ChosenPairs is not referenced in the body shown here.
2912  void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB,
2913  DenseMap<Value *, Value *> &ChosenPairs,
2914  DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
2915  DenseSet<ValuePair> &LoadMoveSetPairs,
2916  Instruction *I) {
2917  // Skip to the first instruction past I.
2918  BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
2919 
      // NOTE(review): the declaration of Users (listing line 2920) is absent
      // from this copy of the file.
2921  AliasSetTracker WriteSet(*AA);
2922  if (I->mayWriteToMemory()) WriteSet.add(I);
2923 
2924  // Note: We cannot end the loop when we reach J because J could be moved
2925  // farther down the use chain by another instruction pairing. Also, J
2926  // could be before I if this is an inverted input.
2927  for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) {
2928  if (trackUsesOfI(Users, WriteSet, I, L)) {
2929  if (L->mayReadFromMemory()) {
2930  LoadMoveSet[L].push_back(I);
2931  LoadMoveSetPairs.insert(ValuePair(L, I));
2932  }
2933  }
2934  }
2935  }
2936 
2937  // In cases where both load/stores and the computation of their pointers
2938  // are chosen for vectorization, we can end up in a situation where the
2939  // aliasing analysis starts returning different query results as the
2940  // process of fusing instruction pairs continues. Because the algorithm
2941  // relies on finding the same use dags here as were found earlier, we'll
2942  // need to precompute the necessary aliasing information here and then
2943  // manually update it during the fusion process.
      //
      // PairableInsts is walked in order; for every entry that is a key of
      // ChosenPairs, collectPairLoadMoveSet is run on that key, filling
      // LoadMoveSet and LoadMoveSetPairs.
2944  void BBVectorize::collectLoadMoveSet(BasicBlock &BB,
2945  std::vector<Value *> &PairableInsts,
2946  DenseMap<Value *, Value *> &ChosenPairs,
2947  DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
2948  DenseSet<ValuePair> &LoadMoveSetPairs) {
2949  for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
2950  PIE = PairableInsts.end(); PI != PIE; ++PI) {
2951  DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI);
      // Instructions that are not the key of a chosen pair contribute nothing.
2952  if (P == ChosenPairs.end()) continue;
2953 
2954  Instruction *I = cast<Instruction>(P->first);
2955  collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet,
2956  LoadMoveSetPairs, I);
2957  }
2958  }
2959 
2960  // This function fuses the chosen instruction pairs into vector instructions,
2961  // taking care to preserve any needed scalar outputs and, then, it reorders the
2962  // remaining instructions as needed (users of the first member of the pair
2963  // need to be moved to after the location of the second member of the pair
2964  // because the vector instruction is inserted in the location of the pair's
2965  // second member).
      //
      // ChosenPairs is modified: every pair is first added in flipped order as
      // well, and both orders are erased once the pair has been picked up for
      // fusion. Entries of PairConnectionTypes may have their Direct/Swap value
      // exchanged when a pair is fused in the opposite order.
2966  void BBVectorize::fuseChosenPairs(BasicBlock &BB,
2967  std::vector<Value *> &PairableInsts,
2968  DenseMap<Value *, Value *> &ChosenPairs,
2969  DenseSet<ValuePair> &FixedOrderPairs,
2970  DenseMap<VPPair, unsigned> &PairConnectionTypes,
2971  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
2972  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) {
2973  LLVMContext& Context = BB.getContext();
2974 
2975  // During the vectorization process, the order of the pairs to be fused
2976  // could be flipped. So we'll add each pair, flipped, into the ChosenPairs
2977  // list. After a pair is fused, the flipped pair is removed from the list.
2978  DenseSet<ValuePair> FlippedPairs;
2979  for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(),
2980  E = ChosenPairs.end(); P != E; ++P)
2981  FlippedPairs.insert(ValuePair(P->second, P->first));
2982  for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(),
2983  E = FlippedPairs.end(); P != E; ++P)
2984  ChosenPairs.insert(*P);
2985 
      // NOTE(review): the declaration of LoadMoveSet (listing line 2986) is
      // absent from this copy of the file.
2987  DenseSet<ValuePair> LoadMoveSetPairs;
2988  collectLoadMoveSet(BB, PairableInsts, ChosenPairs,
2989  LoadMoveSet, LoadMoveSetPairs);
2990 
2991  DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
2992 
      // Scan the block in order. Because ChosenPairs holds both orders of every
      // pair, the member reached first by this scan becomes I and its partner J.
2993  for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
2994  DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI);
2995  if (P == ChosenPairs.end()) {
2996  ++PI;
2997  continue;
2998  }
2999 
3000  if (getDepthFactor(P->first) == 0) {
3001  // These instructions are not really fused, but are tracked as though
3002  // they are. Any case in which it would be interesting to fuse them
3003  // will be taken care of by InstCombine.
3004  --NumFusedOps;
3005  ++PI;
3006  continue;
3007  }
3008 
3009  Instruction *I = cast<Instruction>(P->first),
3010  *J = cast<Instruction>(P->second);
3011 
3012  DEBUG(dbgs() << "BBV: fusing: " << *I <<
3013  " <-> " << *J << "\n");
3014 
3015  // Remove the pair and flipped pair from the list.
3016  DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second);
3017  assert(FP != ChosenPairs.end() && "Flipped pair not found in list");
3018  ChosenPairs.erase(FP);
3019  ChosenPairs.erase(P);
3020 
3021  if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) {
3022  DEBUG(dbgs() << "BBV: fusion of: " << *I <<
3023  " <-> " << *J <<
3024  " aborted because of non-trivial dependency cycle\n");
3025  --NumFusedOps;
3026  ++PI;
3027  continue;
3028  }
3029 
3030  // If the pair must have the other order, then flip it.
3031  bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I));
3032  if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) {
3033  // This pair does not have a fixed order, and so we might want to
3034  // flip it if that will yield fewer shuffles. We count the number
3035  // of dependencies connected via swaps, and those directly connected,
3036  // and flip the order if the number of swaps is greater.
3037  bool OrigOrder = true;
      // NOTE(review): the first line of the declaration of IJ (listing line
      // 3038) is absent from this copy of the file.
3039  ConnectedPairDeps.find(ValuePair(I, J));
3040  if (IJ == ConnectedPairDeps.end()) {
3041  IJ = ConnectedPairDeps.find(ValuePair(J, I));
3042  OrigOrder = false;
3043  }
3044 
3045  if (IJ != ConnectedPairDeps.end()) {
3046  unsigned NumDepsDirect = 0, NumDepsSwap = 0;
3047  for (std::vector<ValuePair>::iterator T = IJ->second.begin(),
3048  TE = IJ->second.end(); T != TE; ++T) {
3049  VPPair Q(IJ->first, *T);
      // NOTE(review): the first line of the declaration of R (listing line
      // 3050) is absent from this copy of the file.
3051  PairConnectionTypes.find(VPPair(Q.second, Q.first));
3052  assert(R != PairConnectionTypes.end() &&
3053  "Cannot find pair connection type");
3054  if (R->second == PairConnectionDirect)
3055  ++NumDepsDirect;
3056  else if (R->second == PairConnectionSwap)
3057  ++NumDepsSwap;
3058  }
3059 
      // The counts were gathered for the (J, I) entry, so their roles are
      // exchanged before comparing.
3060  if (!OrigOrder)
3061  std::swap(NumDepsDirect, NumDepsSwap);
3062 
3063  if (NumDepsSwap > NumDepsDirect) {
3064  FlipPairOrder = true;
3065  DEBUG(dbgs() << "BBV: reordering pair: " << *I <<
3066  " <-> " << *J << "\n");
3067  }
3068  }
3069  }
3070 
      // L and H are the pair members passed as first and second argument to
      // the replacement helpers; they equal I and J unless the pair is flipped.
3071  Instruction *L = I, *H = J;
3072  if (FlipPairOrder)
3073  std::swap(H, L);
3074 
3075  // If the pair being fused uses the opposite order from that in the pair
3076  // connection map, then we need to flip the types.
      // NOTE(review): the first line of the declaration of HL (listing line
      // 3077) is absent from this copy of the file.
3078  ConnectedPairs.find(ValuePair(H, L));
3079  if (HL != ConnectedPairs.end())
3080  for (std::vector<ValuePair>::iterator T = HL->second.begin(),
3081  TE = HL->second.end(); T != TE; ++T) {
3082  VPPair Q(HL->first, *T);
3083  DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q);
3084  assert(R != PairConnectionTypes.end() &&
3085  "Cannot find pair connection type");
3086  if (R->second == PairConnectionDirect)
3087  R->second = PairConnectionSwap;
3088  else if (R->second == PairConnectionSwap)
3089  R->second = PairConnectionDirect;
3090  }
3091 
3092  bool LBeforeH = !FlipPairOrder;
3093  unsigned NumOperands = I->getNumOperands();
      // Sized up front because getReplacementInputsForPair writes by index.
3094  SmallVector<Value *, 3> ReplacedOperands(NumOperands);
3095  getReplacementInputsForPair(Context, L, H, ReplacedOperands,
3096  LBeforeH);
3097 
3098  // Make a copy of the original operation, change its type to the vector
3099  // type and replace its operands with the vector operands.
3100  Instruction *K = L->clone();
3101  if (L->hasName())
3102  K->takeName(L);
3103  else if (H->hasName())
3104  K->takeName(H);
3105 
3106  if (auto CS = CallSite(K)) {
      // NOTE(review): the declaration of Tys (listing line 3107) is absent
      // from this copy of the file.
3108  FunctionType *Old = CS.getFunctionType();
3109  unsigned NumOld = Old->getNumParams();
3110  assert(NumOld <= ReplacedOperands.size());
3111  for (unsigned i = 0; i != NumOld; ++i)
3112  Tys.push_back(ReplacedOperands[i]->getType());
3113  CS.mutateFunctionType(
3114  FunctionType::get(getVecTypeForPair(L->getType(), H->getType()),
3115  Tys, Old->isVarArg()));
3116  } else if (!isa<StoreInst>(K))
3117  K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
3118 
      // NOTE(review): the initializer entries of KnownIDs (listing lines
      // 3120-3123) are absent from this copy of the file.
3119  unsigned KnownIDs[] = {
3124  };
3125  combineMetadata(K, H, KnownIDs);
3127 
3128  for (unsigned o = 0; o < NumOperands; ++o)
3129  K->setOperand(o, ReplacedOperands[o]);
3130 
      // The fused instruction is placed directly after J, whatever the L/H
      // order.
3131  K->insertAfter(J);
3132 
3133  // Instruction insertion point:
3134  Instruction *InsertionPt = K;
3135  Instruction *K1 = nullptr, *K2 = nullptr;
3136  replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2);
3137 
3138  // The use dag of the first original instruction must be moved to after
3139  // the location of the second instruction. The entire use dag of the
3140  // first instruction is disjoint from the input dag of the second
3141  // (by definition), and so commutes with it.
3142 
3143  moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J);
3144 
      // For stores replaceOutputsOfPair leaves K1 and K2 null, so the uses are
      // only rewritten for non-store pairs.
3145  if (!isa<StoreInst>(I)) {
3146  L->replaceAllUsesWith(K1);
3147  H->replaceAllUsesWith(K2);
3148  AA->replaceWithNewValue(L, K1);
3149  AA->replaceWithNewValue(H, K2);
3150  }
3151 
3152  // Instructions that may read from memory may be in the load move set.
3153  // Once an instruction is fused, we no longer need its move set, and so
3154  // the values of the map never need to be updated. However, when a load
3155  // is fused, we need to merge the entries from both instructions in the
3156  // pair in case those instructions were in the move set of some other
3157  // yet-to-be-fused pair. The loads in question are the keys of the map.
3158  if (I->mayReadFromMemory()) {
3159  std::vector<ValuePair> NewSetMembers;
      // NOTE(review): the first lines of the declarations of II and JJ
      // (listing lines 3160 and 3166) are absent from this copy of the file.
3161  LoadMoveSet.find(I);
3162  if (II != LoadMoveSet.end())
3163  for (std::vector<Value *>::iterator N = II->second.begin(),
3164  NE = II->second.end(); N != NE; ++N)
3165  NewSetMembers.push_back(ValuePair(K, *N));
3167  LoadMoveSet.find(J);
3168  if (JJ != LoadMoveSet.end())
3169  for (std::vector<Value *>::iterator N = JJ->second.begin(),
3170  NE = JJ->second.end(); N != NE; ++N)
3171  NewSetMembers.push_back(ValuePair(K, *N));
      // The new entries are collected first and inserted afterwards, so
      // LoadMoveSet is not modified while II and JJ are being read.
3172  for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(),
3173  AE = NewSetMembers.end(); A != AE; ++A) {
3174  LoadMoveSet[A->first].push_back(A->second);
3175  LoadMoveSetPairs.insert(*A);
3176  }
3177  }
3178 
3179  // Before removing I, set the iterator to the next instruction.
3180  PI = std::next(BasicBlock::iterator(I));
      // J is erased below as well, so the iterator must not be left on it.
3181  if (cast<Instruction>(PI) == J)
3182  ++PI;
3183 
3184  SE->forgetValue(I);
3185  SE->forgetValue(J);
3186  I->eraseFromParent();
3187  J->eraseFromParent();
3188 
3189  DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" <<
3190  BB << "\n");
3191  }
3192 
3193  DEBUG(dbgs() << "BBV: final: \n" << BB << "\n");
3194  }
3195 }
3196 
3197 char BBVectorize::ID = 0;
3198 static const char bb_vectorize_name[] = "Basic-Block Vectorization";
3199 INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
3204 INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
3205 
3207  return new BBVectorize(C);
3208 }
3209 
3210 bool
3212  BBVectorize BBVectorizer(P, *BB.getParent(), C);
3213  return BBVectorizer.vectorizeBB(BB);
3214 }
3215 
3216 //===----------------------------------------------------------------------===//
3226  VectorizeFMA = !::NoFMA;
3228  VectorizeCmp = !::NoCmp;
3229  VectorizeGEP = !::NoGEP;
3236  MaxInsts = ::MaxInsts;
3237  MaxPairs = ::MaxPairs;
3238  MaxIter = ::MaxIter;
3241  FastDep = ::FastDep;
3242 }
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:82
bool VectorizeFMA
Vectorize the fused-multiply-add intrinsic.
iplist< Instruction >::iterator eraseFromParent()
eraseFromParent - This method unlinks 'this' from the containing basic block and deletes it...
Definition: Instruction.cpp:70
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:104
static cl::opt< unsigned > MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden, cl::desc("The maximum number of candidate instruction pairs per group"))
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:679
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
bool hasName() const
Definition: Value.h:228
STATISTIC(NumFunctions,"Total number of functions")
ValueT lookup(const KeyT &Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:159
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:114
unsigned getNumParams() const
getNumParams - Return the number of fixed parameters this function type requires. ...
Definition: DerivedTypes.h:136
size_type size() const
Definition: DenseSet.h:53
static cl::opt< bool > DebugPairSelection("bb-vectorize-debug-pair-selection", cl::init(false), cl::Hidden, cl::desc("When debugging is enabled, output information on the"" pair-selection process"))
DenseSet - This implements a dense probed hash-table based set.
Definition: DenseSet.h:39
unsigned getNumOperands() const
Definition: User.h:138
unsigned getPrefTypeAlignment(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:684
ScalarEvolution - This class is the main scalar evolution driver.
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
Definition: Pass.cpp:78
CallInst - This class represents a function call, abstracting a target machine's calling convention...
void initializeBBVectorizePass(PassRegistry &)
This file contains the declarations for metadata subclasses.
static cl::opt< bool > NoMemOpBoost("bb-vectorize-no-mem-op-boost", cl::init(false), cl::Hidden, cl::desc("Don't boost the chain-depth contribution of loads and stores"))
static PointerType * get(Type *ElementType, unsigned AddressSpace)
PointerType::get - This constructs a pointer to an object of the specified type in a numbered address...
Definition: Type.cpp:738
static cl::opt< bool > IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false), cl::Hidden, cl::desc("Ignore target information"))
ShuffleVectorInst - This instruction constructs a fixed permutation of two input vectors.
bool VectorizeMath
Vectorize floating-point math intrinsics.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:111
bool SimplifyInstructionsInBlock(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr)
SimplifyInstructionsInBlock - Scan the specified basic block and try to simplify any instructions in ...
Definition: Local.cpp:422
F(f)
LoadInst - an instruction for reading from memory.
Definition: Instructions.h:177
iv Induction Variable Users
Definition: IVUsers.cpp:43
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: Type.cpp:216
bool add(Value *Ptr, uint64_t Size, const AAMDNodes &AAInfo)
add methods - These methods are used to add different types of instructions to the alias sets...
static cl::opt< HelpPrinterWrapper, true, parser< bool > > HOp("help", cl::desc("Display available options (-help-hidden for more)"), cl::location(WrappedNormalPrinter), cl::ValueDisallowed, cl::cat(GenericCategory))
op_iterator op_begin()
Definition: User.h:183
static cl::opt< unsigned > VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden, cl::desc("The size of the native vector registers"))
Type * getPointerElementType() const
Definition: Type.h:366
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:188
bool isSingleValueType() const
isSingleValueType - Return true if the type is a valid type for a register in codegen.
Definition: Type.h:250
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:169
AnalysisUsage & addRequired()
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:70
static cl::opt< bool > AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden, cl::desc("Only generate aligned loads and stores"))
SelectInst - This class represents the LLVM 'select' instruction.
unsigned MaxCandPairsForCycleCheck
The maximum number of candidate pairs with which to use a full cycle check.
bool erase(const ValueT &V)
Definition: DenseSet.h:69
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:389
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:106
const_iterator end() const
T LLVM_ATTRIBUTE_UNUSED_RESULT pop_back_val()
Definition: SmallVector.h:406
A Use represents the edge between a Value definition and its users.
Definition: Use.h:69
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:75
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APInt.h:33
unsigned getNumArgOperands() const
getNumArgOperands - Return the number of call arguments.
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1057
Number of individual test Apply this number of consecutive mutations to each input exit after the first new interesting input is found the minimized corpus is saved into the first input directory Number of jobs to run If min(jobs, NumberOfCpuCores()/2)\" is used.") FUZZER_FLAG_INT(reload
Windows NT (Windows on ARM)
size_type size() const
Definition: SmallSet.h:48
Check for equivalence ignoring load/store alignment.
Definition: Instruction.h:412
static ConstantInt * ExtractElement(Constant *V, Constant *Idx)
bool empty() const
Definition: DenseSet.h:52
Instruction * clone() const
clone() - Create a copy of 'this' instruction that is identical in all ways except the following: ...
#define false
Definition: ConvertUTF.c:65
static cl::opt< unsigned > MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, cl::desc("The maximum number of pairable instructions per group"))
static const char bb_vectorize_name[]
#define G(x, y, z)
Definition: MD5.cpp:52
bool Pow2LenOnly
Don't try to form odd-length vectors.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:414
const_iterator begin() const
user_iterator_impl< User > user_iterator
Definition: Value.h:292
FunctionType - Class to represent function types.
Definition: DerivedTypes.h:96
Check for equivalence treating a type and a vector of that type as equivalent.
Definition: Instruction.h:415
static cl::opt< bool > NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize floating-point math intrinsics"))
bool mayReadFromMemory() const
mayReadFromMemory - Return true if this instruction may read memory.
static cl::opt< bool > UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false), cl::Hidden, cl::desc("Use the chain depth requirement with"" target information"))
bool LLVM_ATTRIBUTE_UNUSED_RESULT empty() const
Definition: SmallVector.h:57
#define T
static cl::opt< bool > NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"))
static bool isValidElementType(Type *ElemTy)
isValidElementType - Return true if the specified type is valid as an element type.
Definition: Type.cpp:729
This class represents a no-op cast from one type to another.
static FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
FunctionType::get - This static method is the primary way of constructing a FunctionType.
Definition: Type.cpp:361
static std::string utostr(uint64_t X, bool isNeg=false)
Definition: StringExtras.h:93
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:866
StoreInst - an instruction for storing to memory.
Definition: Instructions.h:316
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:351
unsigned getNumElements() const
Return the number of elements in the Vector type.
Definition: DerivedTypes.h:432
Reverse the order of the vector.
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:256
bool VectorizePointers
Vectorize pointer values.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:67
bool isPPC_FP128Ty() const
isPPC_FP128Ty - Return true if this is powerpc long double.
Definition: Type.h:155
bool VectorizeCmp
Vectorize comparison instructions.
unsigned MaxIter
The maximum number of pairing iterations.
bool VectorizeMemOps
Vectorize loads and stores.
static cl::opt< bool > FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden, cl::desc("Use a fast instruction dependency analysis"))
ExtractSubvector Index indicates start offset.
static cl::opt< bool > NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize comparison instructions"))
GetElementPtrInst - an instruction for type-safe pointer arithmetic to access elements of arrays and ...
Definition: Instructions.h:830
bool isX86_MMXTy() const
isX86_MMXTy - Return true if this is X86 MMX.
Definition: Type.h:179
bool isIntOrIntVectorTy() const
isIntOrIntVectorTy - Return true if this is an integer type or a vector of integer types...
Definition: Type.h:201
#define P(N)
void intersectOptionalDataWith(const Value *V)
Clear any optional flags not set in the given Value.
Definition: Value.h:384
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:325
InsertElementInst - This instruction inserts a single (scalar) element into a VectorType value...
unsigned getAlignment() const
getAlignment - Return the alignment of the access that is being performed
Definition: Instructions.h:365
Wrapper pass for TargetTransformInfo.
iterator find(const ValueT &V)
Definition: DenseSet.h:128
bool AlignedOnly
Only generate aligned loads and stores.
static cl::opt< bool > NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize floating-point values"))
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction...
Definition: Instruction.cpp:76
LLVM Basic Block Representation.
Definition: BasicBlock.h:65
static cl::opt< unsigned > SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, cl::desc("The maximum search distance for instruction pairs"))
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:45
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:41
size_type count(const ValueT &V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:65
bool isVectorTy() const
isVectorTy - True if this is an instance of VectorType.
Definition: Type.h:226
This is an important base class in LLVM.
Definition: Constant.h:41
bool SplatBreaksChain
Replicating one element to a pair breaks the chain.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:32
This file contains the declarations for the subclasses of Constant, which represent the different fla...
#define H(x, y, z)
Definition: MD5.cpp:53
static cl::opt< bool > DebugInstructionExamination("bb-vectorize-debug-instruction-examination", cl::init(false), cl::Hidden, cl::desc("When debugging is enabled, output information on the"" instruction-examination process"))
APInt Or(const APInt &LHS, const APInt &RHS)
Bitwise OR function for APInt.
Definition: APInt.h:1895
APInt Xor(const APInt &LHS, const APInt &RHS)
Bitwise XOR function for APInt.
Definition: APInt.h:1900
static cl::opt< bool > NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize BitManipulation intrinsics"))
Represent the analysis usage information of a pass.
op_iterator op_end()
Definition: User.h:185
Value * getOperand(unsigned i) const
Definition: User.h:118
Value * getPointerOperand()
Definition: Instructions.h:284
unsigned SearchLimit
The maximum search distance for instruction pairs.
static cl::opt< bool > NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize casting (conversion) operations"))
std::pair< NoneType, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:69
static cl::opt< unsigned > MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"" a full cycle check"))
#define INITIALIZE_AG_DEPENDENCY(depName)
Definition: PassSupport.h:72
bool isPointerTy() const
isPointerTy - True if this is an instance of PointerType.
Definition: Type.h:217
static UndefValue * get(Type *T)
get() - Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1473
iterator erase(iterator I)
Definition: SmallVector.h:455
static cl::opt< unsigned > MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, cl::desc("The maximum number of pairing iterations"))
bool isFPOrFPVectorTy() const
isFPOrFPVectorTy - Return true if this is a FP type or a vector of FP.
Definition: Type.h:183
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE maxNum semantics.
Definition: APFloat.h:670
bool mayWriteToMemory() const
mayWriteToMemory - Return true if this instruction may modify memory.
BasicBlockPass class - This class is used to implement most local optimizations.
Definition: Pass.h:330
unsigned MaxPairs
The maximum number of candidate instruction pairs per group.
const unsigned MaxDepth
iterator begin()
Definition: DenseSet.h:122
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:147
This is the shared class of boolean and integer constants.
Definition: Constants.h:47
bool VectorizeFloats
Vectorize floating-point values.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
unsigned getVectorNumElements() const
Definition: Type.cpp:212
iterator end()
Definition: BasicBlock.h:233
unsigned getScalarSizeInBits() const LLVM_READONLY
getScalarSizeInBits - If this is a vector type, return the getPrimitiveSizeInBits value for the eleme...
Definition: Type.cpp:139
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
Definition: Instruction.cpp:57
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:861
Module.h This file contains the declarations for the Module class.
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:222
static cl::opt< bool > SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden, cl::desc("Replicating one element to a pair breaks the chain"))
bool vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C=VectorizeConfig())
Vectorize the BasicBlock.
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:67
static cl::opt< bool > NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize loads and stores"))
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:582
Function * getCalledFunction() const
getCalledFunction - Return the function called, or null if this is an indirect function invocation...
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:263
static cl::opt< bool > NoPointers("bb-vectorize-no-pointers", cl::init(true), cl::Hidden, cl::desc("Don't try to vectorize pointer values"))
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:289
bool FastDep
Use a fast instruction dependency analysis.
Intrinsic::ID getIntrinsicID() const LLVM_READONLY
getIntrinsicID - This method returns the ID number of the specified function, or Intrinsic::not_intri...
Definition: Function.h:159
Vectorize configuration.
void setOperand(unsigned i, Value *Val)
Definition: User.h:122
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:123
size_type count(const KeyT &Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:119
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:576
Value * getArgOperand(unsigned i) const
getArgOperand/setArgOperand - Return/set the i-th call argument.
VectorType - Class to represent vector types.
Definition: DerivedTypes.h:362
static cl::opt< bool > NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize getelementptr instructions"))
iterator_range< user_iterator > users()
Definition: Value.h:300
static cl::opt< bool > NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize boolean (i1) values"))
bool VectorizeCasts
Vectorize casting (conversion) operations.
LLVM_ATTRIBUTE_UNUSED_RESULT std::enable_if< !is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:285
#define BBV_NAME
Definition: BBVectorize.cpp:17
BasicBlockPass * createBBVectorizePass(const VectorizeConfig &C=VectorizeConfig())
const Type * getScalarType() const LLVM_READONLY
getScalarType - If this is a vector type, return the element type, otherwise return 'this'...
Definition: Type.cpp:51
bool VectorizeBools
Vectorize boolean values.
APInt And(const APInt &LHS, const APInt &RHS)
Bitwise AND function for APInt.
Definition: APInt.h:1890
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing basic block, but does not delete it...
Definition: Instruction.cpp:66
static cl::opt< unsigned > ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, cl::desc("The required chain depth for vectorization"))
unsigned ReqChainDepth
The required chain depth for vectorization.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:372
bool isX86_FP80Ty() const
isX86_FP80Ty - Return true if this is x86 long double.
Definition: Type.h:149
unsigned MaxInsts
The maximum number of pairable instructions per group.
SCEV - This class represents an analyzed expression in the program.
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:239
static cl::opt< bool > NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize integer values"))
unsigned getAlignment() const
getAlignment - Return the alignment of the access that is being performed
Definition: Instructions.h:243
bool isBinaryOp() const
Definition: Instruction.h:116
void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction...
Definition: Instruction.cpp:82
bool NoMemOpBoost
Don't boost the chain-depth contribution of loads and stores.
#define I(x, y, z)
Definition: MD5.cpp:54
#define N
bool VectorizeGEP
Vectorize getelementptr instructions.
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:651
static Function * getCalledFunction(const Value *V, bool LookThroughBitCast)
ExtractElementInst - This instruction extracts a single (scalar) element from a VectorType value...
static cl::opt< bool > PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair", cl::init(false), cl::Hidden, cl::desc("When debugging is enabled, dump the basic block after"" every pair is fused"))
static cl::opt< bool > DebugCycleCheck("bb-vectorize-debug-cycle-check", cl::init(false), cl::Hidden, cl::desc("When debugging is enabled, output information on the"" cycle-checking process"))
static cl::opt< bool > DebugCandidateSelection("bb-vectorize-debug-candidate-selection", cl::init(false), cl::Hidden, cl::desc("When debugging is enabled, output information on the"" candidate-selection process"))
uint64_t getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type...
Definition: DataLayout.h:371
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Definition: Value.h:471
VectorizeConfig()
Initialize the VectorizeConfig from command line options.
bool isVarArg() const
Definition: DerivedTypes.h:120
user_iterator user_begin()
Definition: Value.h:294
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:32
aarch64 promote const
unsigned getPrimitiveSizeInBits() const LLVM_READONLY
getPrimitiveSizeInBits - Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:121
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:365
LLVM Value Representation.
Definition: Value.h:69
unsigned getOpcode() const
getOpcode() returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:112
static VectorType * get(Type *ElementType, unsigned NumElements)
VectorType::get - This static method is the primary way to construct a VectorType.
Definition: Type.cpp:713
bool VectorizeInts
Vectorize integer values.
Broadcast element 0 to all other elements.
static cl::opt< bool > Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden, cl::desc("Don't try to form non-2^n-length vectors"))
#define DEBUG(X)
Definition: Debug.h:92
bool VectorizeBitManipulations
Vectorize bit intrinsics.
OperandValueKind
Additional information about an operand's possible values.
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:203
This pass exposes codegen information to IR-level passes.
bool isSameOperationAs(const Instruction *I, unsigned flags=0) const
This function determines if the specified instruction executes the same operation as the current one...
iterator end()
Definition: DenseSet.h:123
iterator getFirstInsertionPt()
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:194
bool VectorizeSelect
Vectorize select instructions.
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:125
Value * getPointerOperand()
Definition: Instructions.h:409
void combineMetadata(Instruction *K, const Instruction *J, ArrayRef< unsigned > KnownIDs)
Combine the metadata of two instructions so that K can replace J.
Definition: Local.cpp:1286
const BasicBlock * getParent() const
Definition: Instruction.h:72
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:93
LLVM_READONLY APFloat minnum(const APFloat &A, const APFloat &B)
Implements IEEE minNum semantics.
Definition: APFloat.h:659
static cl::opt< bool > NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize select instructions"))
#define T1
unsigned VectorBits
The size of the native vector registers.
SCEVConstant - This class represents a constant integer value.