LLVM  4.0.0
BBVectorize.cpp
Go to the documentation of this file.
1 //===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements a basic-block vectorization pass. The algorithm was
11 // inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral,
12 // et al. It works by looking for chains of pairable operations and then
13 // pairing them.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #define BBV_NAME "bb-vectorize"
19 #include "llvm/ADT/DenseMap.h"
20 #include "llvm/ADT/DenseSet.h"
21 #include "llvm/ADT/STLExtras.h"
22 #include "llvm/ADT/SmallSet.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/ADT/Statistic.h"
25 #include "llvm/ADT/StringExtras.h"
35 #include "llvm/IR/Constants.h"
36 #include "llvm/IR/DataLayout.h"
37 #include "llvm/IR/DerivedTypes.h"
38 #include "llvm/IR/Dominators.h"
39 #include "llvm/IR/Function.h"
40 #include "llvm/IR/Instructions.h"
41 #include "llvm/IR/IntrinsicInst.h"
42 #include "llvm/IR/Intrinsics.h"
43 #include "llvm/IR/LLVMContext.h"
44 #include "llvm/IR/Metadata.h"
45 #include "llvm/IR/Module.h"
46 #include "llvm/IR/Type.h"
47 #include "llvm/IR/ValueHandle.h"
48 #include "llvm/Pass.h"
50 #include "llvm/Support/Debug.h"
53 #include <algorithm>
54 using namespace llvm;
55 
56 #define DEBUG_TYPE BBV_NAME
57 
58 static cl::opt<bool>
59 IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false),
60  cl::Hidden, cl::desc("Ignore target information"));
61 
62 static cl::opt<unsigned>
63 ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
64  cl::desc("The required chain depth for vectorization"));
65 
66 static cl::opt<bool>
67 UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false),
68  cl::Hidden, cl::desc("Use the chain depth requirement with"
69  " target information"));
70 
71 static cl::opt<unsigned>
72 SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
73  cl::desc("The maximum search distance for instruction pairs"));
74 
75 static cl::opt<bool>
76 SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden,
77  cl::desc("Replicating one element to a pair breaks the chain"));
78 
79 static cl::opt<unsigned>
80 VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden,
81  cl::desc("The size of the native vector registers"));
82 
83 static cl::opt<unsigned>
84 MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
85  cl::desc("The maximum number of pairing iterations"));
86 
87 static cl::opt<bool>
88 Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden,
89  cl::desc("Don't try to form non-2^n-length vectors"));
90 
91 static cl::opt<unsigned>
92 MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden,
93  cl::desc("The maximum number of pairable instructions per group"));
94 
95 static cl::opt<unsigned>
96 MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden,
97  cl::desc("The maximum number of candidate instruction pairs per group"));
98 
99 static cl::opt<unsigned>
100 MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200),
101  cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"
102  " a full cycle check"));
103 
104 static cl::opt<bool>
105 NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden,
106  cl::desc("Don't try to vectorize boolean (i1) values"));
107 
108 static cl::opt<bool>
109 NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden,
110  cl::desc("Don't try to vectorize integer values"));
111 
112 static cl::opt<bool>
113 NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden,
114  cl::desc("Don't try to vectorize floating-point values"));
115 
116 // FIXME: This should default to false once pointer vector support works.
117 static cl::opt<bool>
118 NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden,
119  cl::desc("Don't try to vectorize pointer values"));
120 
121 static cl::opt<bool>
122 NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden,
123  cl::desc("Don't try to vectorize casting (conversion) operations"));
124 
125 static cl::opt<bool>
126 NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden,
127  cl::desc("Don't try to vectorize floating-point math intrinsics"));
128 
129 static cl::opt<bool>
130  NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden,
131  cl::desc("Don't try to vectorize BitManipulation intrinsics"));
132 
133 static cl::opt<bool>
134 NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden,
135  cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
136 
137 static cl::opt<bool>
138 NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden,
139  cl::desc("Don't try to vectorize select instructions"));
140 
141 static cl::opt<bool>
142 NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden,
143  cl::desc("Don't try to vectorize comparison instructions"));
144 
145 static cl::opt<bool>
146 NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden,
147  cl::desc("Don't try to vectorize getelementptr instructions"));
148 
149 static cl::opt<bool>
150 NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden,
151  cl::desc("Don't try to vectorize loads and stores"));
152 
153 static cl::opt<bool>
154 AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden,
155  cl::desc("Only generate aligned loads and stores"));
156 
157 static cl::opt<bool>
158 NoMemOpBoost("bb-vectorize-no-mem-op-boost",
159  cl::init(false), cl::Hidden,
160  cl::desc("Don't boost the chain-depth contribution of loads and stores"));
161 
162 static cl::opt<bool>
163 FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden,
164  cl::desc("Use a fast instruction dependency analysis"));
165 
166 #ifndef NDEBUG
167 static cl::opt<bool>
168 DebugInstructionExamination("bb-vectorize-debug-instruction-examination",
169  cl::init(false), cl::Hidden,
170  cl::desc("When debugging is enabled, output information on the"
171  " instruction-examination process"));
172 static cl::opt<bool>
173 DebugCandidateSelection("bb-vectorize-debug-candidate-selection",
174  cl::init(false), cl::Hidden,
175  cl::desc("When debugging is enabled, output information on the"
176  " candidate-selection process"));
177 static cl::opt<bool>
178 DebugPairSelection("bb-vectorize-debug-pair-selection",
179  cl::init(false), cl::Hidden,
180  cl::desc("When debugging is enabled, output information on the"
181  " pair-selection process"));
182 static cl::opt<bool>
183 DebugCycleCheck("bb-vectorize-debug-cycle-check",
184  cl::init(false), cl::Hidden,
185  cl::desc("When debugging is enabled, output information on the"
186  " cycle-checking process"));
187 
188 static cl::opt<bool>
189 PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair",
190  cl::init(false), cl::Hidden,
191  cl::desc("When debugging is enabled, dump the basic block after"
192  " every pair is fused"));
193 #endif
194 
195 STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
196 
197 namespace {
198  struct BBVectorize : public BasicBlockPass {
199  static char ID; // Pass identification, replacement for typeid
200 
201  const VectorizeConfig Config;
202 
203  BBVectorize(const VectorizeConfig &C = VectorizeConfig())
204  : BasicBlockPass(ID), Config(C) {
206  }
207 
208  BBVectorize(Pass *P, Function &F, const VectorizeConfig &C)
209  : BasicBlockPass(ID), Config(C) {
210  AA = &P->getAnalysis<AAResultsWrapperPass>().getAAResults();
211  DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
212  SE = &P->getAnalysis<ScalarEvolutionWrapperPass>().getSE();
213  TLI = &P->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
214  TTI = IgnoreTargetInfo
215  ? nullptr
216  : &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
217  }
218 
219  typedef std::pair<Value *, Value *> ValuePair;
220  typedef std::pair<ValuePair, int> ValuePairWithCost;
221  typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
222  typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
223  typedef std::pair<VPPair, unsigned> VPPairWithType;
224 
225  AliasAnalysis *AA;
226  DominatorTree *DT;
227  ScalarEvolution *SE;
228  const TargetLibraryInfo *TLI;
229  const TargetTransformInfo *TTI;
230 
231  // FIXME: const correct?
232 
233  bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false);
234 
235  bool getCandidatePairs(BasicBlock &BB,
236  BasicBlock::iterator &Start,
237  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
238  DenseSet<ValuePair> &FixedOrderPairs,
239  DenseMap<ValuePair, int> &CandidatePairCostSavings,
240  std::vector<Value *> &PairableInsts, bool NonPow2Len);
241 
242  // FIXME: The current implementation does not account for pairs that
243  // are connected in multiple ways. For example:
244  // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap)
245  enum PairConnectionType {
246  PairConnectionDirect,
247  PairConnectionSwap,
248  PairConnectionSplat
249  };
250 
251  void computeConnectedPairs(
252  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
253  DenseSet<ValuePair> &CandidatePairsSet,
254  std::vector<Value *> &PairableInsts,
255  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
256  DenseMap<VPPair, unsigned> &PairConnectionTypes);
257 
258  void buildDepMap(BasicBlock &BB,
259  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
260  std::vector<Value *> &PairableInsts,
261  DenseSet<ValuePair> &PairableInstUsers);
262 
263  void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
264  DenseSet<ValuePair> &CandidatePairsSet,
265  DenseMap<ValuePair, int> &CandidatePairCostSavings,
266  std::vector<Value *> &PairableInsts,
267  DenseSet<ValuePair> &FixedOrderPairs,
268  DenseMap<VPPair, unsigned> &PairConnectionTypes,
269  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
270  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
271  DenseSet<ValuePair> &PairableInstUsers,
272  DenseMap<Value *, Value *>& ChosenPairs);
273 
274  void fuseChosenPairs(BasicBlock &BB,
275  std::vector<Value *> &PairableInsts,
276  DenseMap<Value *, Value *>& ChosenPairs,
277  DenseSet<ValuePair> &FixedOrderPairs,
278  DenseMap<VPPair, unsigned> &PairConnectionTypes,
279  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
280  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps);
281 
282 
283  bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
284 
285  bool areInstsCompatible(Instruction *I, Instruction *J,
286  bool IsSimpleLoadStore, bool NonPow2Len,
287  int &CostSavings, int &FixedOrder);
288 
289  bool trackUsesOfI(DenseSet<Value *> &Users,
290  AliasSetTracker &WriteSet, Instruction *I,
291  Instruction *J, bool UpdateUsers = true,
292  DenseSet<ValuePair> *LoadMoveSetPairs = nullptr);
293 
294  void computePairsConnectedTo(
295  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
296  DenseSet<ValuePair> &CandidatePairsSet,
297  std::vector<Value *> &PairableInsts,
298  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
299  DenseMap<VPPair, unsigned> &PairConnectionTypes,
300  ValuePair P);
301 
302  bool pairsConflict(ValuePair P, ValuePair Q,
303  DenseSet<ValuePair> &PairableInstUsers,
304  DenseMap<ValuePair, std::vector<ValuePair> >
305  *PairableInstUserMap = nullptr,
306  DenseSet<VPPair> *PairableInstUserPairSet = nullptr);
307 
308  bool pairWillFormCycle(ValuePair P,
309  DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers,
310  DenseSet<ValuePair> &CurrentPairs);
311 
312  void pruneDAGFor(
313  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
314  std::vector<Value *> &PairableInsts,
315  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
316  DenseSet<ValuePair> &PairableInstUsers,
317  DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
318  DenseSet<VPPair> &PairableInstUserPairSet,
319  DenseMap<Value *, Value *> &ChosenPairs,
321  DenseSet<ValuePair> &PrunedDAG, ValuePair J,
322  bool UseCycleCheck);
323 
324  void buildInitialDAGFor(
325  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
326  DenseSet<ValuePair> &CandidatePairsSet,
327  std::vector<Value *> &PairableInsts,
328  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
329  DenseSet<ValuePair> &PairableInstUsers,
330  DenseMap<Value *, Value *> &ChosenPairs,
331  DenseMap<ValuePair, size_t> &DAG, ValuePair J);
332 
333  void findBestDAGFor(
334  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
335  DenseSet<ValuePair> &CandidatePairsSet,
336  DenseMap<ValuePair, int> &CandidatePairCostSavings,
337  std::vector<Value *> &PairableInsts,
338  DenseSet<ValuePair> &FixedOrderPairs,
339  DenseMap<VPPair, unsigned> &PairConnectionTypes,
340  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
341  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
342  DenseSet<ValuePair> &PairableInstUsers,
343  DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
344  DenseSet<VPPair> &PairableInstUserPairSet,
345  DenseMap<Value *, Value *> &ChosenPairs,
346  DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
347  int &BestEffSize, Value *II, std::vector<Value *>&JJ,
348  bool UseCycleCheck);
349 
350  Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
351  Instruction *J, unsigned o);
352 
353  void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
354  unsigned MaskOffset, unsigned NumInElem,
355  unsigned NumInElem1, unsigned IdxOffset,
356  std::vector<Constant*> &Mask);
357 
358  Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I,
359  Instruction *J);
360 
361  bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J,
362  unsigned o, Value *&LOp, unsigned numElemL,
363  Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ,
364  unsigned IdxOff = 0);
365 
366  Value *getReplacementInput(LLVMContext& Context, Instruction *I,
367  Instruction *J, unsigned o, bool IBeforeJ);
368 
369  void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
370  Instruction *J, SmallVectorImpl<Value *> &ReplacedOperands,
371  bool IBeforeJ);
372 
373  void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
374  Instruction *J, Instruction *K,
375  Instruction *&InsertionPt, Instruction *&K1,
376  Instruction *&K2);
377 
378  void collectPairLoadMoveSet(BasicBlock &BB,
379  DenseMap<Value *, Value *> &ChosenPairs,
380  DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
381  DenseSet<ValuePair> &LoadMoveSetPairs,
382  Instruction *I);
383 
384  void collectLoadMoveSet(BasicBlock &BB,
385  std::vector<Value *> &PairableInsts,
386  DenseMap<Value *, Value *> &ChosenPairs,
387  DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
388  DenseSet<ValuePair> &LoadMoveSetPairs);
389 
390  bool canMoveUsesOfIAfterJ(BasicBlock &BB,
391  DenseSet<ValuePair> &LoadMoveSetPairs,
392  Instruction *I, Instruction *J);
393 
394  void moveUsesOfIAfterJ(BasicBlock &BB,
395  DenseSet<ValuePair> &LoadMoveSetPairs,
396  Instruction *&InsertionPt,
397  Instruction *I, Instruction *J);
398 
399  bool vectorizeBB(BasicBlock &BB) {
400  if (skipBasicBlock(BB))
401  return false;
402  if (!DT->isReachableFromEntry(&BB)) {
403  DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() <<
404  " in " << BB.getParent()->getName() << "\n");
405  return false;
406  }
407 
408  DEBUG(if (TTI) dbgs() << "BBV: using target information\n");
409 
410  bool changed = false;
411  // Iterate a sufficient number of times to merge types of size 1 bit,
412  // then 2 bits, then 4, etc. up to half of the target vector width of the
413  // target vector register.
414  unsigned n = 1;
415  for (unsigned v = 2;
416  (TTI || v <= Config.VectorBits) &&
417  (!Config.MaxIter || n <= Config.MaxIter);
418  v *= 2, ++n) {
419  DEBUG(dbgs() << "BBV: fusing loop #" << n <<
420  " for " << BB.getName() << " in " <<
421  BB.getParent()->getName() << "...\n");
422  if (vectorizePairs(BB))
423  changed = true;
424  else
425  break;
426  }
427 
428  if (changed && !Pow2LenOnly) {
429  ++n;
430  for (; !Config.MaxIter || n <= Config.MaxIter; ++n) {
431  DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " <<
432  n << " for " << BB.getName() << " in " <<
433  BB.getParent()->getName() << "...\n");
434  if (!vectorizePairs(BB, true)) break;
435  }
436  }
437 
438  DEBUG(dbgs() << "BBV: done!\n");
439  return changed;
440  }
441 
442  bool runOnBasicBlock(BasicBlock &BB) override {
443  // OptimizeNone check deferred to vectorizeBB().
444 
445  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
446  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
447  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
448  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
449  TTI = IgnoreTargetInfo
450  ? nullptr
451  : &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
452  *BB.getParent());
453 
454  return vectorizeBB(BB);
455  }
456 
457  void getAnalysisUsage(AnalysisUsage &AU) const override {
468  AU.setPreservesCFG();
469  }
470 
471  static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) {
472  assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() &&
473  "Cannot form vector from incompatible scalar types");
474  Type *STy = ElemTy->getScalarType();
475 
476  unsigned numElem;
477  if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) {
478  numElem = VTy->getNumElements();
479  } else {
480  numElem = 1;
481  }
482 
483  if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) {
484  numElem += VTy->getNumElements();
485  } else {
486  numElem += 1;
487  }
488 
489  return VectorType::get(STy, numElem);
490  }
491 
492  static inline void getInstructionTypes(Instruction *I,
493  Type *&T1, Type *&T2) {
494  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
495  // For stores, it is the value type, not the pointer type that matters
496  // because the value is what will come from a vector register.
497 
498  Value *IVal = SI->getValueOperand();
499  T1 = IVal->getType();
500  } else {
501  T1 = I->getType();
502  }
503 
504  if (CastInst *CI = dyn_cast<CastInst>(I))
505  T2 = CI->getSrcTy();
506  else
507  T2 = T1;
508 
509  if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
510  T2 = SI->getCondition()->getType();
511  } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
512  T2 = SI->getOperand(0)->getType();
513  } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) {
514  T2 = CI->getOperand(0)->getType();
515  }
516  }
517 
518  // Returns the weight associated with the provided value. A chain of
519  // candidate pairs has a length given by the sum of the weights of its
520  // members (one weight per pair; the weight of each member of the pair
521  // is assumed to be the same). This length is then compared to the
522  // chain-length threshold to determine if a given chain is significant
523  // enough to be vectorized. The length is also used in comparing
524  // candidate chains where longer chains are considered to be better.
525  // Note: when this function returns 0, the resulting instructions are
526  // not actually fused.
527  inline size_t getDepthFactor(Value *V) {
528  // InsertElement and ExtractElement have a depth factor of zero. This is
529  // for two reasons: First, they cannot be usefully fused. Second, because
530  // the pass generates a lot of these, they can confuse the simple metric
531  // used to compare the dags in the next iteration. Thus, giving them a
532  // weight of zero allows the pass to essentially ignore them in
533  // subsequent iterations when looking for vectorization opportunities
534  // while still tracking dependency chains that flow through those
535  // instructions.
536  if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V))
537  return 0;
538 
539  // Give a load or store half of the required depth so that load/store
540  // pairs will vectorize.
541  if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V)))
542  return Config.ReqChainDepth/2;
543 
544  return 1;
545  }
546 
547  // Returns the cost of the provided instruction using TTI.
548  // This does not handle loads and stores.
549  unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2,
554  switch (Opcode) {
555  default: break;
556  case Instruction::GetElementPtr:
557  // We mark this instruction as zero-cost because scalar GEPs are usually
558  // lowered to the instruction addressing mode. At the moment we don't
559  // generate vector GEPs.
560  return 0;
561  case Instruction::Br:
562  return TTI->getCFInstrCost(Opcode);
563  case Instruction::PHI:
564  return 0;
565  case Instruction::Add:
566  case Instruction::FAdd:
567  case Instruction::Sub:
568  case Instruction::FSub:
569  case Instruction::Mul:
570  case Instruction::FMul:
571  case Instruction::UDiv:
572  case Instruction::SDiv:
573  case Instruction::FDiv:
574  case Instruction::URem:
575  case Instruction::SRem:
576  case Instruction::FRem:
577  case Instruction::Shl:
578  case Instruction::LShr:
579  case Instruction::AShr:
580  case Instruction::And:
581  case Instruction::Or:
582  case Instruction::Xor:
583  return TTI->getArithmeticInstrCost(Opcode, T1, Op1VK, Op2VK);
584  case Instruction::Select:
585  case Instruction::ICmp:
586  case Instruction::FCmp:
587  return TTI->getCmpSelInstrCost(Opcode, T1, T2);
588  case Instruction::ZExt:
589  case Instruction::SExt:
590  case Instruction::FPToUI:
591  case Instruction::FPToSI:
592  case Instruction::FPExt:
593  case Instruction::PtrToInt:
594  case Instruction::IntToPtr:
595  case Instruction::SIToFP:
596  case Instruction::UIToFP:
597  case Instruction::Trunc:
598  case Instruction::FPTrunc:
599  case Instruction::BitCast:
600  case Instruction::ShuffleVector:
601  return TTI->getCastInstrCost(Opcode, T1, T2);
602  }
603 
604  return 1;
605  }
606 
607  // This determines the relative offset of two loads or stores, returning
608  // true if the offset could be determined to be some constant value.
609  // For example, if OffsetInElmts == 1, then J accesses the memory directly
610  // after I; if OffsetInElmts == -1 then I accesses the memory
611  // directly after J.
612  bool getPairPtrInfo(Instruction *I, Instruction *J,
613  Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
614  unsigned &IAddressSpace, unsigned &JAddressSpace,
615  int64_t &OffsetInElmts, bool ComputeOffset = true) {
616  OffsetInElmts = 0;
617  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
618  LoadInst *LJ = cast<LoadInst>(J);
619  IPtr = LI->getPointerOperand();
620  JPtr = LJ->getPointerOperand();
621  IAlignment = LI->getAlignment();
622  JAlignment = LJ->getAlignment();
623  IAddressSpace = LI->getPointerAddressSpace();
624  JAddressSpace = LJ->getPointerAddressSpace();
625  } else {
626  StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J);
627  IPtr = SI->getPointerOperand();
628  JPtr = SJ->getPointerOperand();
629  IAlignment = SI->getAlignment();
630  JAlignment = SJ->getAlignment();
631  IAddressSpace = SI->getPointerAddressSpace();
632  JAddressSpace = SJ->getPointerAddressSpace();
633  }
634 
635  if (!ComputeOffset)
636  return true;
637 
638  const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
639  const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
640 
641  // If this is a trivial offset, then we'll get something like
642  // 1*sizeof(type). With target data, which we need anyway, this will get
643  // constant folded into a number.
644  const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV);
645  if (const SCEVConstant *ConstOffSCEV =
646  dyn_cast<SCEVConstant>(OffsetSCEV)) {
647  ConstantInt *IntOff = ConstOffSCEV->getValue();
648  int64_t Offset = IntOff->getSExtValue();
649  const DataLayout &DL = I->getModule()->getDataLayout();
650  Type *VTy = IPtr->getType()->getPointerElementType();
651  int64_t VTyTSS = (int64_t)DL.getTypeStoreSize(VTy);
652 
653  Type *VTy2 = JPtr->getType()->getPointerElementType();
654  if (VTy != VTy2 && Offset < 0) {
655  int64_t VTy2TSS = (int64_t)DL.getTypeStoreSize(VTy2);
656  OffsetInElmts = Offset/VTy2TSS;
657  return (std::abs(Offset) % VTy2TSS) == 0;
658  }
659 
660  OffsetInElmts = Offset/VTyTSS;
661  return (std::abs(Offset) % VTyTSS) == 0;
662  }
663 
664  return false;
665  }
666 
667  // Returns true if the provided CallInst represents an intrinsic that can
668  // be vectorized.
669  bool isVectorizableIntrinsic(CallInst* I) {
670  Function *F = I->getCalledFunction();
671  if (!F) return false;
672 
673  Intrinsic::ID IID = F->getIntrinsicID();
674  if (!IID) return false;
675 
676  switch(IID) {
677  default:
678  return false;
679  case Intrinsic::sqrt:
680  case Intrinsic::powi:
681  case Intrinsic::sin:
682  case Intrinsic::cos:
683  case Intrinsic::log:
684  case Intrinsic::log2:
685  case Intrinsic::log10:
686  case Intrinsic::exp:
687  case Intrinsic::exp2:
688  case Intrinsic::pow:
689  case Intrinsic::round:
690  case Intrinsic::copysign:
691  case Intrinsic::ceil:
692  case Intrinsic::nearbyint:
693  case Intrinsic::rint:
694  case Intrinsic::trunc:
695  case Intrinsic::floor:
696  case Intrinsic::fabs:
697  case Intrinsic::minnum:
698  case Intrinsic::maxnum:
699  return Config.VectorizeMath;
700  case Intrinsic::bswap:
701  case Intrinsic::ctpop:
702  case Intrinsic::ctlz:
703  case Intrinsic::cttz:
704  return Config.VectorizeBitManipulations;
705  case Intrinsic::fma:
706  case Intrinsic::fmuladd:
707  return Config.VectorizeFMA;
708  }
709  }
710 
711  bool isPureIEChain(InsertElementInst *IE) {
712  InsertElementInst *IENext = IE;
713  do {
714  if (!isa<UndefValue>(IENext->getOperand(0)) &&
715  !isa<InsertElementInst>(IENext->getOperand(0))) {
716  return false;
717  }
718  } while ((IENext =
719  dyn_cast<InsertElementInst>(IENext->getOperand(0))));
720 
721  return true;
722  }
723  };
724 
725  // This function implements one vectorization iteration on the provided
726  // basic block. It returns true if the block is changed.
727  bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) {
728  bool ShouldContinue;
730 
731  std::vector<Value *> AllPairableInsts;
732  DenseMap<Value *, Value *> AllChosenPairs;
733  DenseSet<ValuePair> AllFixedOrderPairs;
734  DenseMap<VPPair, unsigned> AllPairConnectionTypes;
735  DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs,
736  AllConnectedPairDeps;
737 
738  do {
739  std::vector<Value *> PairableInsts;
741  DenseSet<ValuePair> FixedOrderPairs;
742  DenseMap<ValuePair, int> CandidatePairCostSavings;
743  ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
744  FixedOrderPairs,
745  CandidatePairCostSavings,
746  PairableInsts, NonPow2Len);
747  if (PairableInsts.empty()) continue;
748 
749  // Build the candidate pair set for faster lookups.
750  DenseSet<ValuePair> CandidatePairsSet;
751  for (DenseMap<Value *, std::vector<Value *> >::iterator I =
752  CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I)
753  for (std::vector<Value *>::iterator J = I->second.begin(),
754  JE = I->second.end(); J != JE; ++J)
755  CandidatePairsSet.insert(ValuePair(I->first, *J));
756 
757  // Now we have a map of all of the pairable instructions and we need to
758  // select the best possible pairing. A good pairing is one such that the
759  // users of the pair are also paired. This defines a (directed) forest
760  // over the pairs such that two pairs are connected iff the second pair
761  // uses the first.
762 
763  // Note that it only matters that both members of the second pair use some
764  // element of the first pair (to allow for splatting).
765 
767  ConnectedPairDeps;
768  DenseMap<VPPair, unsigned> PairConnectionTypes;
769  computeConnectedPairs(CandidatePairs, CandidatePairsSet,
770  PairableInsts, ConnectedPairs, PairConnectionTypes);
771  if (ConnectedPairs.empty()) continue;
772 
773  for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
774  I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
775  I != IE; ++I)
776  for (std::vector<ValuePair>::iterator J = I->second.begin(),
777  JE = I->second.end(); J != JE; ++J)
778  ConnectedPairDeps[*J].push_back(I->first);
779 
780  // Build the pairable-instruction dependency map
781  DenseSet<ValuePair> PairableInstUsers;
782  buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
783 
784  // There is now a graph of the connected pairs. For each variable, pick
785  // the pairing with the largest dag meeting the depth requirement on at
786  // least one branch. Then select all pairings that are part of that dag
787  // and remove them from the list of available pairings and pairable
788  // variables.
789 
790  DenseMap<Value *, Value *> ChosenPairs;
791  choosePairs(CandidatePairs, CandidatePairsSet,
792  CandidatePairCostSavings,
793  PairableInsts, FixedOrderPairs, PairConnectionTypes,
794  ConnectedPairs, ConnectedPairDeps,
795  PairableInstUsers, ChosenPairs);
796 
797  if (ChosenPairs.empty()) continue;
798  AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(),
799  PairableInsts.end());
800  AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
801 
802  // Only for the chosen pairs, propagate information on fixed-order pairs,
803  // pair connections, and their types to the data structures used by the
804  // pair fusion procedures.
805  for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(),
806  IE = ChosenPairs.end(); I != IE; ++I) {
807  if (FixedOrderPairs.count(*I))
808  AllFixedOrderPairs.insert(*I);
809  else if (FixedOrderPairs.count(ValuePair(I->second, I->first)))
810  AllFixedOrderPairs.insert(ValuePair(I->second, I->first));
811 
812  for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin();
813  J != IE; ++J) {
815  PairConnectionTypes.find(VPPair(*I, *J));
816  if (K != PairConnectionTypes.end()) {
817  AllPairConnectionTypes.insert(*K);
818  } else {
819  K = PairConnectionTypes.find(VPPair(*J, *I));
820  if (K != PairConnectionTypes.end())
821  AllPairConnectionTypes.insert(*K);
822  }
823  }
824  }
825 
826  for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
827  I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
828  I != IE; ++I)
829  for (std::vector<ValuePair>::iterator J = I->second.begin(),
830  JE = I->second.end(); J != JE; ++J)
831  if (AllPairConnectionTypes.count(VPPair(I->first, *J))) {
832  AllConnectedPairs[I->first].push_back(*J);
833  AllConnectedPairDeps[*J].push_back(I->first);
834  }
835  } while (ShouldContinue);
836 
837  if (AllChosenPairs.empty()) return false;
838  NumFusedOps += AllChosenPairs.size();
839 
840  // A set of pairs has now been selected. It is now necessary to replace the
841  // paired instructions with vector instructions. For this procedure each
842  // operand must be replaced with a vector operand. This vector is formed
843  // by using build_vector on the old operands. The replaced values are then
844  // replaced with a vector_extract on the result. Subsequent optimization
845  // passes should coalesce the build/extract combinations.
846 
847  fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs,
848  AllPairConnectionTypes,
849  AllConnectedPairs, AllConnectedPairDeps);
850 
851  // It is important to cleanup here so that future iterations of this
852  // function have less work to do.
853  (void)SimplifyInstructionsInBlock(&BB, TLI);
854  return true;
855  }
856 
857  // This function returns true if the provided instruction is capable of being
858  // fused into a vector instruction. This determination is based only on the
859  // type and other attributes of the instruction.
860  bool BBVectorize::isInstVectorizable(Instruction *I,
861  bool &IsSimpleLoadStore) {
862  IsSimpleLoadStore = false;
863 
864  if (CallInst *C = dyn_cast<CallInst>(I)) {
865  if (!isVectorizableIntrinsic(C))
866  return false;
867  } else if (LoadInst *L = dyn_cast<LoadInst>(I)) {
868  // Vectorize simple loads if possbile:
869  IsSimpleLoadStore = L->isSimple();
870  if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
871  return false;
872  } else if (StoreInst *S = dyn_cast<StoreInst>(I)) {
873  // Vectorize simple stores if possbile:
874  IsSimpleLoadStore = S->isSimple();
875  if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
876  return false;
877  } else if (CastInst *C = dyn_cast<CastInst>(I)) {
878  // We can vectorize casts, but not casts of pointer types, etc.
879  if (!Config.VectorizeCasts)
880  return false;
881 
882  Type *SrcTy = C->getSrcTy();
883  if (!SrcTy->isSingleValueType())
884  return false;
885 
886  Type *DestTy = C->getDestTy();
887  if (!DestTy->isSingleValueType())
888  return false;
889  } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
890  if (!Config.VectorizeSelect)
891  return false;
892  // We can vectorize a select if either all operands are scalars,
893  // or all operands are vectors. Trying to "widen" a select between
894  // vectors that has a scalar condition results in a malformed select.
895  // FIXME: We could probably be smarter about this by rewriting the select
896  // with different types instead.
897  return (SI->getCondition()->getType()->isVectorTy() ==
898  SI->getTrueValue()->getType()->isVectorTy());
899  } else if (isa<CmpInst>(I)) {
900  if (!Config.VectorizeCmp)
901  return false;
902  } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) {
903  if (!Config.VectorizeGEP)
904  return false;
905 
906  // Currently, vector GEPs exist only with one index.
907  if (G->getNumIndices() != 1)
908  return false;
909  } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) ||
910  isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) {
911  return false;
912  }
913 
914  Type *T1, *T2;
915  getInstructionTypes(I, T1, T2);
916 
917  // Not every type can be vectorized...
918  if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) ||
920  return false;
921 
922  if (T1->getScalarSizeInBits() == 1) {
923  if (!Config.VectorizeBools)
924  return false;
925  } else {
926  if (!Config.VectorizeInts && T1->isIntOrIntVectorTy())
927  return false;
928  }
929 
930  if (T2->getScalarSizeInBits() == 1) {
931  if (!Config.VectorizeBools)
932  return false;
933  } else {
934  if (!Config.VectorizeInts && T2->isIntOrIntVectorTy())
935  return false;
936  }
937 
938  if (!Config.VectorizeFloats
939  && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
940  return false;
941 
942  // Don't vectorize target-specific types.
943  if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy())
944  return false;
945  if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy())
946  return false;
947 
948  if (!Config.VectorizePointers && (T1->getScalarType()->isPointerTy() ||
949  T2->getScalarType()->isPointerTy()))
950  return false;
951 
952  if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
953  T2->getPrimitiveSizeInBits() >= Config.VectorBits))
954  return false;
955 
956  return true;
957  }
958 
959  // This function returns true if the two provided instructions are compatible
960  // (meaning that they can be fused into a vector instruction). This assumes
961  // that I has already been determined to be vectorizable and that J is not
962  // in the use dag of I.
963  bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
964  bool IsSimpleLoadStore, bool NonPow2Len,
965  int &CostSavings, int &FixedOrder) {
966  DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
967  " <-> " << *J << "\n");
968 
969  CostSavings = 0;
970  FixedOrder = 0;
971 
972  // Loads and stores can be merged if they have different alignments,
973  // but are otherwise the same.
975  (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0)))
976  return false;
977 
978  Type *IT1, *IT2, *JT1, *JT2;
979  getInstructionTypes(I, IT1, IT2);
980  getInstructionTypes(J, JT1, JT2);
981  unsigned MaxTypeBits = std::max(
984  if (!TTI && MaxTypeBits > Config.VectorBits)
985  return false;
986 
987  // FIXME: handle addsub-type operations!
988 
989  if (IsSimpleLoadStore) {
990  Value *IPtr, *JPtr;
991  unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
992  int64_t OffsetInElmts = 0;
993  if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
994  IAddressSpace, JAddressSpace, OffsetInElmts) &&
995  std::abs(OffsetInElmts) == 1) {
996  FixedOrder = (int) OffsetInElmts;
997  unsigned BottomAlignment = IAlignment;
998  if (OffsetInElmts < 0) BottomAlignment = JAlignment;
999 
1000  Type *aTypeI = isa<StoreInst>(I) ?
1001  cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
1002  Type *aTypeJ = isa<StoreInst>(J) ?
1003  cast<StoreInst>(J)->getValueOperand()->getType() : J->getType();
1004  Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
1005 
1006  if (Config.AlignedOnly) {
1007  // An aligned load or store is possible only if the instruction
1008  // with the lower offset has an alignment suitable for the
1009  // vector type.
1010  const DataLayout &DL = I->getModule()->getDataLayout();
1011  unsigned VecAlignment = DL.getPrefTypeAlignment(VType);
1012  if (BottomAlignment < VecAlignment)
1013  return false;
1014  }
1015 
1016  if (TTI) {
1017  unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI,
1018  IAlignment, IAddressSpace);
1019  unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ,
1020  JAlignment, JAddressSpace);
1021  unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType,
1022  BottomAlignment,
1023  IAddressSpace);
1024 
1025  ICost += TTI->getAddressComputationCost(aTypeI);
1026  JCost += TTI->getAddressComputationCost(aTypeJ);
1027  VCost += TTI->getAddressComputationCost(VType);
1028 
1029  if (VCost > ICost + JCost)
1030  return false;
1031 
1032  // We don't want to fuse to a type that will be split, even
1033  // if the two input types will also be split and there is no other
1034  // associated cost.
1035  unsigned VParts = TTI->getNumberOfParts(VType);
1036  if (VParts > 1)
1037  return false;
1038  else if (!VParts && VCost == ICost + JCost)
1039  return false;
1040 
1041  CostSavings = ICost + JCost - VCost;
1042  }
1043  } else {
1044  return false;
1045  }
1046  } else if (TTI) {
1047  unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2);
1048  unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
1049  Type *VT1 = getVecTypeForPair(IT1, JT1),
1050  *VT2 = getVecTypeForPair(IT2, JT2);
1055 
1056  // On some targets (example X86) the cost of a vector shift may vary
1057  // depending on whether the second operand is a Uniform or
1058  // NonUniform Constant.
1059  switch (I->getOpcode()) {
1060  default : break;
1061  case Instruction::Shl:
1062  case Instruction::LShr:
1063  case Instruction::AShr:
1064 
1065  // If both I and J are scalar shifts by constant, then the
1066  // merged vector shift count would be either a constant splat value
1067  // or a non-uniform vector of constants.
1068  if (ConstantInt *CII = dyn_cast<ConstantInt>(I->getOperand(1))) {
1069  if (ConstantInt *CIJ = dyn_cast<ConstantInt>(J->getOperand(1)))
1070  Op2VK = CII == CIJ ? TargetTransformInfo::OK_UniformConstantValue :
1072  } else {
1073  // Check for a splat of a constant or for a non uniform vector
1074  // of constants.
1075  Value *IOp = I->getOperand(1);
1076  Value *JOp = J->getOperand(1);
1077  if ((isa<ConstantVector>(IOp) || isa<ConstantDataVector>(IOp)) &&
1078  (isa<ConstantVector>(JOp) || isa<ConstantDataVector>(JOp))) {
1080  Constant *SplatValue = cast<Constant>(IOp)->getSplatValue();
1081  if (SplatValue != nullptr &&
1082  SplatValue == cast<Constant>(JOp)->getSplatValue())
1084  }
1085  }
1086  }
1087 
1088  // Note that this procedure is incorrect for insert and extract element
1089  // instructions (because combining these often results in a shuffle),
1090  // but this cost is ignored (because insert and extract element
1091  // instructions are assigned a zero depth factor and are not really
1092  // fused in general).
1093  unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK);
1094 
1095  if (VCost > ICost + JCost)
1096  return false;
1097 
1098  // We don't want to fuse to a type that will be split, even
1099  // if the two input types will also be split and there is no other
1100  // associated cost.
1101  unsigned VParts1 = TTI->getNumberOfParts(VT1),
1102  VParts2 = TTI->getNumberOfParts(VT2);
1103  if (VParts1 > 1 || VParts2 > 1)
1104  return false;
1105  else if ((!VParts1 || !VParts2) && VCost == ICost + JCost)
1106  return false;
1107 
1108  CostSavings = ICost + JCost - VCost;
1109  }
1110 
1111  // The powi,ctlz,cttz intrinsics are special because only the first
1112  // argument is vectorized, the second arguments must be equal.
1113  CallInst *CI = dyn_cast<CallInst>(I);
1114  Function *FI;
1115  if (CI && (FI = CI->getCalledFunction())) {
1116  Intrinsic::ID IID = FI->getIntrinsicID();
1117  if (IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
1118  IID == Intrinsic::cttz) {
1119  Value *A1I = CI->getArgOperand(1),
1120  *A1J = cast<CallInst>(J)->getArgOperand(1);
1121  const SCEV *A1ISCEV = SE->getSCEV(A1I),
1122  *A1JSCEV = SE->getSCEV(A1J);
1123  return (A1ISCEV == A1JSCEV);
1124  }
1125 
1126  if (IID && TTI) {
1127  FastMathFlags FMFCI;
1128  if (auto *FPMOCI = dyn_cast<FPMathOperator>(CI))
1129  FMFCI = FPMOCI->getFastMathFlags();
1130 
1132  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
1133  Tys.push_back(CI->getArgOperand(i)->getType());
1134  unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys, FMFCI);
1135 
1136  Tys.clear();
1137  CallInst *CJ = cast<CallInst>(J);
1138 
1139  FastMathFlags FMFCJ;
1140  if (auto *FPMOCJ = dyn_cast<FPMathOperator>(CJ))
1141  FMFCJ = FPMOCJ->getFastMathFlags();
1142 
1143  for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i)
1144  Tys.push_back(CJ->getArgOperand(i)->getType());
1145  unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys, FMFCJ);
1146 
1147  Tys.clear();
1148  assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
1149  "Intrinsic argument counts differ");
1150  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
1151  if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
1152  IID == Intrinsic::cttz) && i == 1)
1153  Tys.push_back(CI->getArgOperand(i)->getType());
1154  else
1155  Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(),
1156  CJ->getArgOperand(i)->getType()));
1157  }
1158 
1159  FastMathFlags FMFV = FMFCI;
1160  FMFV &= FMFCJ;
1161  Type *RetTy = getVecTypeForPair(IT1, JT1);
1162  unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV);
1163 
1164  if (VCost > ICost + JCost)
1165  return false;
1166 
1167  // We don't want to fuse to a type that will be split, even
1168  // if the two input types will also be split and there is no other
1169  // associated cost.
1170  unsigned RetParts = TTI->getNumberOfParts(RetTy);
1171  if (RetParts > 1)
1172  return false;
1173  else if (!RetParts && VCost == ICost + JCost)
1174  return false;
1175 
1176  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
1177  if (!Tys[i]->isVectorTy())
1178  continue;
1179 
1180  unsigned NumParts = TTI->getNumberOfParts(Tys[i]);
1181  if (NumParts > 1)
1182  return false;
1183  else if (!NumParts && VCost == ICost + JCost)
1184  return false;
1185  }
1186 
1187  CostSavings = ICost + JCost - VCost;
1188  }
1189  }
1190 
1191  return true;
1192  }
1193 
1194  // Figure out whether or not J uses I and update the users and write-set
1195  // structures associated with I. Specifically, Users represents the set of
1196  // instructions that depend on I. WriteSet represents the set
1197  // of memory locations that are dependent on I. If UpdateUsers is true,
1198  // and J uses I, then Users is updated to contain J and WriteSet is updated
1199  // to contain any memory locations to which J writes. The function returns
1200  // true if J uses I. By default, alias analysis is used to determine
1201  // whether J reads from memory that overlaps with a location in WriteSet.
1202  // If LoadMoveSet is not null, then it is a previously-computed map
1203  // where the key is the memory-based user instruction and the value is
1204  // the instruction to be compared with I. So, if LoadMoveSet is provided,
1205  // then the alias analysis is not used. This is necessary because this
1206  // function is called during the process of moving instructions during
1207  // vectorization and the results of the alias analysis are not stable during
1208  // that process.
1209  bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users,
1210  AliasSetTracker &WriteSet, Instruction *I,
1211  Instruction *J, bool UpdateUsers,
1212  DenseSet<ValuePair> *LoadMoveSetPairs) {
1213  bool UsesI = false;
1214 
1215  // This instruction may already be marked as a user due, for example, to
1216  // being a member of a selected pair.
1217  if (Users.count(J))
1218  UsesI = true;
1219 
1220  if (!UsesI)
1221  for (User::op_iterator JU = J->op_begin(), JE = J->op_end();
1222  JU != JE; ++JU) {
1223  Value *V = *JU;
1224  if (I == V || Users.count(V)) {
1225  UsesI = true;
1226  break;
1227  }
1228  }
1229  if (!UsesI && J->mayReadFromMemory()) {
1230  if (LoadMoveSetPairs) {
1231  UsesI = LoadMoveSetPairs->count(ValuePair(J, I));
1232  } else {
1233  for (AliasSetTracker::iterator W = WriteSet.begin(),
1234  WE = WriteSet.end(); W != WE; ++W) {
1235  if (W->aliasesUnknownInst(J, *AA)) {
1236  UsesI = true;
1237  break;
1238  }
1239  }
1240  }
1241  }
1242 
1243  if (UsesI && UpdateUsers) {
1244  if (J->mayWriteToMemory()) WriteSet.add(J);
1245  Users.insert(J);
1246  }
1247 
1248  return UsesI;
1249  }
1250 
1251  // This function iterates over all instruction pairs in the provided
1252  // basic block and collects all candidate pairs for vectorization.
1253  bool BBVectorize::getCandidatePairs(BasicBlock &BB,
1254  BasicBlock::iterator &Start,
1255  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1256  DenseSet<ValuePair> &FixedOrderPairs,
1257  DenseMap<ValuePair, int> &CandidatePairCostSavings,
1258  std::vector<Value *> &PairableInsts, bool NonPow2Len) {
1259  size_t TotalPairs = 0;
1260  BasicBlock::iterator E = BB.end();
1261  if (Start == E) return false;
1262 
1263  bool ShouldContinue = false, IAfterStart = false;
1264  for (BasicBlock::iterator I = Start++; I != E; ++I) {
1265  if (I == Start) IAfterStart = true;
1266 
1267  bool IsSimpleLoadStore;
1268  if (!isInstVectorizable(&*I, IsSimpleLoadStore))
1269  continue;
1270 
1271  // Look for an instruction with which to pair instruction *I...
1273  AliasSetTracker WriteSet(*AA);
1274  if (I->mayWriteToMemory())
1275  WriteSet.add(&*I);
1276 
1277  bool JAfterStart = IAfterStart;
1278  BasicBlock::iterator J = std::next(I);
1279  for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) {
1280  if (J == Start)
1281  JAfterStart = true;
1282 
1283  // Determine if J uses I, if so, exit the loop.
1284  bool UsesI = trackUsesOfI(Users, WriteSet, &*I, &*J, !Config.FastDep);
1285  if (Config.FastDep) {
1286  // Note: For this heuristic to be effective, independent operations
1287  // must tend to be intermixed. This is likely to be true from some
1288  // kinds of grouped loop unrolling (but not the generic LLVM pass),
1289  // but otherwise may require some kind of reordering pass.
1290 
1291  // When using fast dependency analysis,
1292  // stop searching after first use:
1293  if (UsesI) break;
1294  } else {
1295  if (UsesI) continue;
1296  }
1297 
1298  // J does not use I, and comes before the first use of I, so it can be
1299  // merged with I if the instructions are compatible.
1300  int CostSavings, FixedOrder;
1301  if (!areInstsCompatible(&*I, &*J, IsSimpleLoadStore, NonPow2Len,
1302  CostSavings, FixedOrder))
1303  continue;
1304 
1305  // J is a candidate for merging with I.
1306  if (PairableInsts.empty() ||
1307  PairableInsts[PairableInsts.size() - 1] != &*I) {
1308  PairableInsts.push_back(&*I);
1309  }
1310 
1311  CandidatePairs[&*I].push_back(&*J);
1312  ++TotalPairs;
1313  if (TTI)
1314  CandidatePairCostSavings.insert(
1315  ValuePairWithCost(ValuePair(&*I, &*J), CostSavings));
1316 
1317  if (FixedOrder == 1)
1318  FixedOrderPairs.insert(ValuePair(&*I, &*J));
1319  else if (FixedOrder == -1)
1320  FixedOrderPairs.insert(ValuePair(&*J, &*I));
1321 
1322  // The next call to this function must start after the last instruction
1323  // selected during this invocation.
1324  if (JAfterStart) {
1325  Start = std::next(J);
1326  IAfterStart = JAfterStart = false;
1327  }
1328 
1329  DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "
1330  << *I << " <-> " << *J << " (cost savings: " <<
1331  CostSavings << ")\n");
1332 
1333  // If we have already found too many pairs, break here and this function
1334  // will be called again starting after the last instruction selected
1335  // during this invocation.
1336  if (PairableInsts.size() >= Config.MaxInsts ||
1337  TotalPairs >= Config.MaxPairs) {
1338  ShouldContinue = true;
1339  break;
1340  }
1341  }
1342 
1343  if (ShouldContinue)
1344  break;
1345  }
1346 
1347  DEBUG(dbgs() << "BBV: found " << PairableInsts.size()
1348  << " instructions with candidate pairs\n");
1349 
1350  return ShouldContinue;
1351  }
1352 
1353  // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that
1354  // it looks for pairs such that both members have an input which is an
1355  // output of PI or PJ.
1356  void BBVectorize::computePairsConnectedTo(
1357  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1358  DenseSet<ValuePair> &CandidatePairsSet,
1359  std::vector<Value *> &PairableInsts,
1360  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1361  DenseMap<VPPair, unsigned> &PairConnectionTypes,
1362  ValuePair P) {
1363  StoreInst *SI, *SJ;
1364 
1365  // For each possible pairing for this variable, look at the uses of
1366  // the first value...
1367  for (Value::user_iterator I = P.first->user_begin(),
1368  E = P.first->user_end();
1369  I != E; ++I) {
1370  User *UI = *I;
1371  if (isa<LoadInst>(UI)) {
1372  // A pair cannot be connected to a load because the load only takes one
1373  // operand (the address) and it is a scalar even after vectorization.
1374  continue;
1375  } else if ((SI = dyn_cast<StoreInst>(UI)) &&
1376  P.first == SI->getPointerOperand()) {
1377  // Similarly, a pair cannot be connected to a store through its
1378  // pointer operand.
1379  continue;
1380  }
1381 
1382  // For each use of the first variable, look for uses of the second
1383  // variable...
1384  for (User *UJ : P.second->users()) {
1385  if ((SJ = dyn_cast<StoreInst>(UJ)) &&
1386  P.second == SJ->getPointerOperand())
1387  continue;
1388 
1389  // Look for <I, J>:
1390  if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
1391  VPPair VP(P, ValuePair(UI, UJ));
1392  ConnectedPairs[VP.first].push_back(VP.second);
1393  PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect));
1394  }
1395 
1396  // Look for <J, I>:
1397  if (CandidatePairsSet.count(ValuePair(UJ, UI))) {
1398  VPPair VP(P, ValuePair(UJ, UI));
1399  ConnectedPairs[VP.first].push_back(VP.second);
1400  PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap));
1401  }
1402  }
1403 
1404  if (Config.SplatBreaksChain) continue;
1405  // Look for cases where just the first value in the pair is used by
1406  // both members of another pair (splatting).
1407  for (Value::user_iterator J = P.first->user_begin(); J != E; ++J) {
1408  User *UJ = *J;
1409  if ((SJ = dyn_cast<StoreInst>(UJ)) &&
1410  P.first == SJ->getPointerOperand())
1411  continue;
1412 
1413  if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
1414  VPPair VP(P, ValuePair(UI, UJ));
1415  ConnectedPairs[VP.first].push_back(VP.second);
1416  PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
1417  }
1418  }
1419  }
1420 
1421  if (Config.SplatBreaksChain) return;
1422  // Look for cases where just the second value in the pair is used by
1423  // both members of another pair (splatting).
1424  for (Value::user_iterator I = P.second->user_begin(),
1425  E = P.second->user_end();
1426  I != E; ++I) {
1427  User *UI = *I;
1428  if (isa<LoadInst>(UI))
1429  continue;
1430  else if ((SI = dyn_cast<StoreInst>(UI)) &&
1431  P.second == SI->getPointerOperand())
1432  continue;
1433 
1434  for (Value::user_iterator J = P.second->user_begin(); J != E; ++J) {
1435  User *UJ = *J;
1436  if ((SJ = dyn_cast<StoreInst>(UJ)) &&
1437  P.second == SJ->getPointerOperand())
1438  continue;
1439 
1440  if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
1441  VPPair VP(P, ValuePair(UI, UJ));
1442  ConnectedPairs[VP.first].push_back(VP.second);
1443  PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
1444  }
1445  }
1446  }
1447  }
1448 
1449  // This function figures out which pairs are connected. Two pairs are
1450  // connected if some output of the first pair forms an input to both members
1451  // of the second pair.
1452  void BBVectorize::computeConnectedPairs(
1453  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1454  DenseSet<ValuePair> &CandidatePairsSet,
1455  std::vector<Value *> &PairableInsts,
1456  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1457  DenseMap<VPPair, unsigned> &PairConnectionTypes) {
1458  for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
1459  PE = PairableInsts.end(); PI != PE; ++PI) {
1461  CandidatePairs.find(*PI);
1462  if (PP == CandidatePairs.end())
1463  continue;
1464 
1465  for (std::vector<Value *>::iterator P = PP->second.begin(),
1466  E = PP->second.end(); P != E; ++P)
1467  computePairsConnectedTo(CandidatePairs, CandidatePairsSet,
1468  PairableInsts, ConnectedPairs,
1469  PairConnectionTypes, ValuePair(*PI, *P));
1470  }
1471 
1472  DEBUG(size_t TotalPairs = 0;
1473  for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I =
1474  ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I)
1475  TotalPairs += I->second.size();
1476  dbgs() << "BBV: found " << TotalPairs
1477  << " pair connections.\n");
1478  }
1479 
1480  // This function builds a set of use tuples such that <A, B> is in the set
1481  // if B is in the use dag of A. If B is in the use dag of A, then B
1482  // depends on the output of A.
1483  void BBVectorize::buildDepMap(
1484  BasicBlock &BB,
1485  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1486  std::vector<Value *> &PairableInsts,
1487  DenseSet<ValuePair> &PairableInstUsers) {
1488  DenseSet<Value *> IsInPair;
1489  for (DenseMap<Value *, std::vector<Value *> >::iterator C =
1490  CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) {
1491  IsInPair.insert(C->first);
1492  IsInPair.insert(C->second.begin(), C->second.end());
1493  }
1494 
1495  // Iterate through the basic block, recording all users of each
1496  // pairable instruction.
1497 
1498  BasicBlock::iterator E = BB.end(), EL =
1499  BasicBlock::iterator(cast<Instruction>(PairableInsts.back()));
1500  for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
1501  if (IsInPair.find(&*I) == IsInPair.end())
1502  continue;
1503 
1505  AliasSetTracker WriteSet(*AA);
1506  if (I->mayWriteToMemory())
1507  WriteSet.add(&*I);
1508 
1509  for (BasicBlock::iterator J = std::next(I); J != E; ++J) {
1510  (void)trackUsesOfI(Users, WriteSet, &*I, &*J);
1511 
1512  if (J == EL)
1513  break;
1514  }
1515 
1516  for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end();
1517  U != E; ++U) {
1518  if (IsInPair.find(*U) == IsInPair.end()) continue;
1519  PairableInstUsers.insert(ValuePair(&*I, *U));
1520  }
1521 
1522  if (I == EL)
1523  break;
1524  }
1525  }
1526 
1527  // Returns true if an input to pair P is an output of pair Q and also an
1528  // input of pair Q is an output of pair P. If this is the case, then these
1529  // two pairs cannot be simultaneously fused.
1530  bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q,
1531  DenseSet<ValuePair> &PairableInstUsers,
1532  DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap,
1533  DenseSet<VPPair> *PairableInstUserPairSet) {
1534  // Two pairs are in conflict if they are mutual Users of eachother.
1535  bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) ||
1536  PairableInstUsers.count(ValuePair(P.first, Q.second)) ||
1537  PairableInstUsers.count(ValuePair(P.second, Q.first)) ||
1538  PairableInstUsers.count(ValuePair(P.second, Q.second));
1539  bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) ||
1540  PairableInstUsers.count(ValuePair(Q.first, P.second)) ||
1541  PairableInstUsers.count(ValuePair(Q.second, P.first)) ||
1542  PairableInstUsers.count(ValuePair(Q.second, P.second));
1543  if (PairableInstUserMap) {
1544  // FIXME: The expensive part of the cycle check is not so much the cycle
1545  // check itself but this edge insertion procedure. This needs some
1546  // profiling and probably a different data structure.
1547  if (PUsesQ) {
1548  if (PairableInstUserPairSet->insert(VPPair(Q, P)).second)
1549  (*PairableInstUserMap)[Q].push_back(P);
1550  }
1551  if (QUsesP) {
1552  if (PairableInstUserPairSet->insert(VPPair(P, Q)).second)
1553  (*PairableInstUserMap)[P].push_back(Q);
1554  }
1555  }
1556 
1557  return (QUsesP && PUsesQ);
1558  }
1559 
1560  // This function walks the use graph of current pairs to see if, starting
1561  // from P, the walk returns to P.
1562  bool BBVectorize::pairWillFormCycle(ValuePair P,
1563  DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
1564  DenseSet<ValuePair> &CurrentPairs) {
1565  DEBUG(if (DebugCycleCheck)
1566  dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> "
1567  << *P.second << "\n");
1568  // A lookup table of visisted pairs is kept because the PairableInstUserMap
1569  // contains non-direct associations.
1570  DenseSet<ValuePair> Visited;
1572  // General depth-first post-order traversal:
1573  Q.push_back(P);
1574  do {
1575  ValuePair QTop = Q.pop_back_val();
1576  Visited.insert(QTop);
1577 
1578  DEBUG(if (DebugCycleCheck)
1579  dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> "
1580  << *QTop.second << "\n");
1582  PairableInstUserMap.find(QTop);
1583  if (QQ == PairableInstUserMap.end())
1584  continue;
1585 
1586  for (std::vector<ValuePair>::iterator C = QQ->second.begin(),
1587  CE = QQ->second.end(); C != CE; ++C) {
1588  if (*C == P) {
1589  DEBUG(dbgs()
1590  << "BBV: rejected to prevent non-trivial cycle formation: "
1591  << QTop.first << " <-> " << C->second << "\n");
1592  return true;
1593  }
1594 
1595  if (CurrentPairs.count(*C) && !Visited.count(*C))
1596  Q.push_back(*C);
1597  }
1598  } while (!Q.empty());
1599 
1600  return false;
1601  }
1602 
1603  // This function builds the initial dag of connected pairs with the
1604  // pair J at the root.
1605  void BBVectorize::buildInitialDAGFor(
1606  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1607  DenseSet<ValuePair> &CandidatePairsSet,
1608  std::vector<Value *> &PairableInsts,
1609  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1610  DenseSet<ValuePair> &PairableInstUsers,
1611  DenseMap<Value *, Value *> &ChosenPairs,
1612  DenseMap<ValuePair, size_t> &DAG, ValuePair J) {
1613  // Each of these pairs is viewed as the root node of a DAG. The DAG
1614  // is then walked (depth-first). As this happens, we keep track of
1615  // the pairs that compose the DAG and the maximum depth of the DAG.
1617  // General depth-first post-order traversal:
1618  Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
1619  do {
1620  ValuePairWithDepth QTop = Q.back();
1621 
1622  // Push each child onto the queue:
1623  bool MoreChildren = false;
1624  size_t MaxChildDepth = QTop.second;
1626  ConnectedPairs.find(QTop.first);
1627  if (QQ != ConnectedPairs.end())
1628  for (std::vector<ValuePair>::iterator k = QQ->second.begin(),
1629  ke = QQ->second.end(); k != ke; ++k) {
1630  // Make sure that this child pair is still a candidate:
1631  if (CandidatePairsSet.count(*k)) {
1633  if (C == DAG.end()) {
1634  size_t d = getDepthFactor(k->first);
1635  Q.push_back(ValuePairWithDepth(*k, QTop.second+d));
1636  MoreChildren = true;
1637  } else {
1638  MaxChildDepth = std::max(MaxChildDepth, C->second);
1639  }
1640  }
1641  }
1642 
1643  if (!MoreChildren) {
1644  // Record the current pair as part of the DAG:
1645  DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth));
1646  Q.pop_back();
1647  }
1648  } while (!Q.empty());
1649  }
1650 
1651  // Given some initial dag, prune it by removing conflicting pairs (pairs
1652  // that cannot be simultaneously chosen for vectorization).
1653  void BBVectorize::pruneDAGFor(
1654  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1655  std::vector<Value *> &PairableInsts,
1656  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1657  DenseSet<ValuePair> &PairableInstUsers,
1658  DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
1659  DenseSet<VPPair> &PairableInstUserPairSet,
1660  DenseMap<Value *, Value *> &ChosenPairs,
1662  DenseSet<ValuePair> &PrunedDAG, ValuePair J,
1663  bool UseCycleCheck) {
1665  // General depth-first post-order traversal:
1666  Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
1667  do {
1668  ValuePairWithDepth QTop = Q.pop_back_val();
1669  PrunedDAG.insert(QTop.first);
1670 
1671  // Visit each child, pruning as necessary...
1674  ConnectedPairs.find(QTop.first);
1675  if (QQ == ConnectedPairs.end())
1676  continue;
1677 
1678  for (std::vector<ValuePair>::iterator K = QQ->second.begin(),
1679  KE = QQ->second.end(); K != KE; ++K) {
1681  if (C == DAG.end()) continue;
1682 
1683  // This child is in the DAG, now we need to make sure it is the
1684  // best of any conflicting children. There could be multiple
1685  // conflicting children, so first, determine if we're keeping
1686  // this child, then delete conflicting children as necessary.
1687 
1688  // It is also necessary to guard against pairing-induced
1689  // dependencies. Consider instructions a .. x .. y .. b
1690  // such that (a,b) are to be fused and (x,y) are to be fused
1691  // but a is an input to x and b is an output from y. This
1692  // means that y cannot be moved after b but x must be moved
1693  // after b for (a,b) to be fused. In other words, after
1694  // fusing (a,b) we have y .. a/b .. x where y is an input
1695  // to a/b and x is an output to a/b: x and y can no longer
1696  // be legally fused. To prevent this condition, we must
1697  // make sure that a child pair added to the DAG is not
1698  // both an input and output of an already-selected pair.
1699 
1700  // Pairing-induced dependencies can also form from more complicated
1701  // cycles. The pair vs. pair conflicts are easy to check, and so
1702  // that is done explicitly for "fast rejection", and because for
1703  // child vs. child conflicts, we may prefer to keep the current
1704  // pair in preference to the already-selected child.
1705  DenseSet<ValuePair> CurrentPairs;
1706 
1707  bool CanAdd = true;
1709  = BestChildren.begin(), E2 = BestChildren.end();
1710  C2 != E2; ++C2) {
1711  if (C2->first.first == C->first.first ||
1712  C2->first.first == C->first.second ||
1713  C2->first.second == C->first.first ||
1714  C2->first.second == C->first.second ||
1715  pairsConflict(C2->first, C->first, PairableInstUsers,
1716  UseCycleCheck ? &PairableInstUserMap : nullptr,
1717  UseCycleCheck ? &PairableInstUserPairSet
1718  : nullptr)) {
1719  if (C2->second >= C->second) {
1720  CanAdd = false;
1721  break;
1722  }
1723 
1724  CurrentPairs.insert(C2->first);
1725  }
1726  }
1727  if (!CanAdd) continue;
1728 
1729  // Even worse, this child could conflict with another node already
1730  // selected for the DAG. If that is the case, ignore this child.
1731  for (DenseSet<ValuePair>::iterator T = PrunedDAG.begin(),
1732  E2 = PrunedDAG.end(); T != E2; ++T) {
1733  if (T->first == C->first.first ||
1734  T->first == C->first.second ||
1735  T->second == C->first.first ||
1736  T->second == C->first.second ||
1737  pairsConflict(*T, C->first, PairableInstUsers,
1738  UseCycleCheck ? &PairableInstUserMap : nullptr,
1739  UseCycleCheck ? &PairableInstUserPairSet
1740  : nullptr)) {
1741  CanAdd = false;
1742  break;
1743  }
1744 
1745  CurrentPairs.insert(*T);
1746  }
1747  if (!CanAdd) continue;
1748 
1749  // And check the queue too...
1751  E2 = Q.end(); C2 != E2; ++C2) {
1752  if (C2->first.first == C->first.first ||
1753  C2->first.first == C->first.second ||
1754  C2->first.second == C->first.first ||
1755  C2->first.second == C->first.second ||
1756  pairsConflict(C2->first, C->first, PairableInstUsers,
1757  UseCycleCheck ? &PairableInstUserMap : nullptr,
1758  UseCycleCheck ? &PairableInstUserPairSet
1759  : nullptr)) {
1760  CanAdd = false;
1761  break;
1762  }
1763 
1764  CurrentPairs.insert(C2->first);
1765  }
1766  if (!CanAdd) continue;
1767 
1768  // Last but not least, check for a conflict with any of the
1769  // already-chosen pairs.
1771  ChosenPairs.begin(), E2 = ChosenPairs.end();
1772  C2 != E2; ++C2) {
1773  if (pairsConflict(*C2, C->first, PairableInstUsers,
1774  UseCycleCheck ? &PairableInstUserMap : nullptr,
1775  UseCycleCheck ? &PairableInstUserPairSet
1776  : nullptr)) {
1777  CanAdd = false;
1778  break;
1779  }
1780 
1781  CurrentPairs.insert(*C2);
1782  }
1783  if (!CanAdd) continue;
1784 
1785  // To check for non-trivial cycles formed by the addition of the
1786  // current pair we've formed a list of all relevant pairs, now use a
1787  // graph walk to check for a cycle. We start from the current pair and
1788  // walk the use dag to see if we again reach the current pair. If we
1789  // do, then the current pair is rejected.
1790 
1791  // FIXME: It may be more efficient to use a topological-ordering
1792  // algorithm to improve the cycle check. This should be investigated.
1793  if (UseCycleCheck &&
1794  pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs))
1795  continue;
1796 
1797  // This child can be added, but we may have chosen it in preference
1798  // to an already-selected child. Check for this here, and if a
1799  // conflict is found, then remove the previously-selected child
1800  // before adding this one in its place.
1802  = BestChildren.begin(); C2 != BestChildren.end();) {
1803  if (C2->first.first == C->first.first ||
1804  C2->first.first == C->first.second ||
1805  C2->first.second == C->first.first ||
1806  C2->first.second == C->first.second ||
1807  pairsConflict(C2->first, C->first, PairableInstUsers))
1808  C2 = BestChildren.erase(C2);
1809  else
1810  ++C2;
1811  }
1812 
1813  BestChildren.push_back(ValuePairWithDepth(C->first, C->second));
1814  }
1815 
1817  = BestChildren.begin(), E2 = BestChildren.end();
1818  C != E2; ++C) {
1819  size_t DepthF = getDepthFactor(C->first.first);
1820  Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF));
1821  }
1822  } while (!Q.empty());
1823  }
1824 
1825  // This function finds the best dag of mututally-compatible connected
1826  // pairs, given the choice of root pairs as an iterator range.
1827  void BBVectorize::findBestDAGFor(
1828  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1829  DenseSet<ValuePair> &CandidatePairsSet,
1830  DenseMap<ValuePair, int> &CandidatePairCostSavings,
1831  std::vector<Value *> &PairableInsts,
1832  DenseSet<ValuePair> &FixedOrderPairs,
1833  DenseMap<VPPair, unsigned> &PairConnectionTypes,
1834  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1835  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
1836  DenseSet<ValuePair> &PairableInstUsers,
1837  DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
1838  DenseSet<VPPair> &PairableInstUserPairSet,
1839  DenseMap<Value *, Value *> &ChosenPairs,
1840  DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
1841  int &BestEffSize, Value *II, std::vector<Value *>&JJ,
1842  bool UseCycleCheck) {
1843  for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end();
1844  J != JE; ++J) {
1845  ValuePair IJ(II, *J);
1846  if (!CandidatePairsSet.count(IJ))
1847  continue;
1848 
1849  // Before going any further, make sure that this pair does not
1850  // conflict with any already-selected pairs (see comment below
1851  // near the DAG pruning for more details).
1852  DenseSet<ValuePair> ChosenPairSet;
1853  bool DoesConflict = false;
1854  for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(),
1855  E = ChosenPairs.end(); C != E; ++C) {
1856  if (pairsConflict(*C, IJ, PairableInstUsers,
1857  UseCycleCheck ? &PairableInstUserMap : nullptr,
1858  UseCycleCheck ? &PairableInstUserPairSet : nullptr)) {
1859  DoesConflict = true;
1860  break;
1861  }
1862 
1863  ChosenPairSet.insert(*C);
1864  }
1865  if (DoesConflict) continue;
1866 
1867  if (UseCycleCheck &&
1868  pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet))
1869  continue;
1870 
1872  buildInitialDAGFor(CandidatePairs, CandidatePairsSet,
1873  PairableInsts, ConnectedPairs,
1874  PairableInstUsers, ChosenPairs, DAG, IJ);
1875 
1876  // Because we'll keep the child with the largest depth, the largest
1877  // depth is still the same in the unpruned DAG.
1878  size_t MaxDepth = DAG.lookup(IJ);
1879 
1880  DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {"
1881  << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
1882  MaxDepth << " and size " << DAG.size() << "\n");
1883 
1884  // At this point the DAG has been constructed, but, may contain
1885  // contradictory children (meaning that different children of
1886  // some dag node may be attempting to fuse the same instruction).
1887  // So now we walk the dag again, in the case of a conflict,
1888  // keep only the child with the largest depth. To break a tie,
1889  // favor the first child.
1890 
1891  DenseSet<ValuePair> PrunedDAG;
1892  pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs,
1893  PairableInstUsers, PairableInstUserMap,
1894  PairableInstUserPairSet,
1895  ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck);
1896 
1897  int EffSize = 0;
1898  if (TTI) {
1899  DenseSet<Value *> PrunedDAGInstrs;
1900  for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
1901  E = PrunedDAG.end(); S != E; ++S) {
1902  PrunedDAGInstrs.insert(S->first);
1903  PrunedDAGInstrs.insert(S->second);
1904  }
1905 
1906  // The set of pairs that have already contributed to the total cost.
1907  DenseSet<ValuePair> IncomingPairs;
1908 
1909  // If the cost model were perfect, this might not be necessary; but we
1910  // need to make sure that we don't get stuck vectorizing our own
1911  // shuffle chains.
1912  bool HasNontrivialInsts = false;
1913 
1914  // The node weights represent the cost savings associated with
1915  // fusing the pair of instructions.
1916  for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
1917  E = PrunedDAG.end(); S != E; ++S) {
1918  if (!isa<ShuffleVectorInst>(S->first) &&
1919  !isa<InsertElementInst>(S->first) &&
1920  !isa<ExtractElementInst>(S->first))
1921  HasNontrivialInsts = true;
1922 
1923  bool FlipOrder = false;
1924 
1925  if (getDepthFactor(S->first)) {
1926  int ESContrib = CandidatePairCostSavings.find(*S)->second;
1927  DEBUG(if (DebugPairSelection) dbgs() << "\tweight {"
1928  << *S->first << " <-> " << *S->second << "} = " <<
1929  ESContrib << "\n");
1930  EffSize += ESContrib;
1931  }
1932 
1933  // The edge weights contribute in a negative sense: they represent
1934  // the cost of shuffles.
1936  ConnectedPairDeps.find(*S);
1937  if (SS != ConnectedPairDeps.end()) {
1938  unsigned NumDepsDirect = 0, NumDepsSwap = 0;
1939  for (std::vector<ValuePair>::iterator T = SS->second.begin(),
1940  TE = SS->second.end(); T != TE; ++T) {
1941  VPPair Q(*S, *T);
1942  if (!PrunedDAG.count(Q.second))
1943  continue;
1945  PairConnectionTypes.find(VPPair(Q.second, Q.first));
1946  assert(R != PairConnectionTypes.end() &&
1947  "Cannot find pair connection type");
1948  if (R->second == PairConnectionDirect)
1949  ++NumDepsDirect;
1950  else if (R->second == PairConnectionSwap)
1951  ++NumDepsSwap;
1952  }
1953 
1954  // If there are more swaps than direct connections, then
1955  // the pair order will be flipped during fusion. So the real
1956  // number of swaps is the minimum number.
1957  FlipOrder = !FixedOrderPairs.count(*S) &&
1958  ((NumDepsSwap > NumDepsDirect) ||
1959  FixedOrderPairs.count(ValuePair(S->second, S->first)));
1960 
1961  for (std::vector<ValuePair>::iterator T = SS->second.begin(),
1962  TE = SS->second.end(); T != TE; ++T) {
1963  VPPair Q(*S, *T);
1964  if (!PrunedDAG.count(Q.second))
1965  continue;
1967  PairConnectionTypes.find(VPPair(Q.second, Q.first));
1968  assert(R != PairConnectionTypes.end() &&
1969  "Cannot find pair connection type");
1970  Type *Ty1 = Q.second.first->getType(),
1971  *Ty2 = Q.second.second->getType();
1972  Type *VTy = getVecTypeForPair(Ty1, Ty2);
1973  if ((R->second == PairConnectionDirect && FlipOrder) ||
1974  (R->second == PairConnectionSwap && !FlipOrder) ||
1975  R->second == PairConnectionSplat) {
1976  int ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
1977  VTy, VTy);
1978 
1979  if (VTy->getVectorNumElements() == 2) {
1980  if (R->second == PairConnectionSplat)
1981  ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
1983  else
1984  ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
1986  }
1987 
1988  DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
1989  *Q.second.first << " <-> " << *Q.second.second <<
1990  "} -> {" <<
1991  *S->first << " <-> " << *S->second << "} = " <<
1992  ESContrib << "\n");
1993  EffSize -= ESContrib;
1994  }
1995  }
1996  }
1997 
1998  // Compute the cost of outgoing edges. We assume that edges outgoing
1999  // to shuffles, inserts or extracts can be merged, and so contribute
2000  // no additional cost.
2001  if (!S->first->getType()->isVoidTy()) {
2002  Type *Ty1 = S->first->getType(),
2003  *Ty2 = S->second->getType();
2004  Type *VTy = getVecTypeForPair(Ty1, Ty2);
2005 
2006  bool NeedsExtraction = false;
2007  for (User *U : S->first->users()) {
2008  if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) {
2009  // Shuffle can be folded if it has no other input
2010  if (isa<UndefValue>(SI->getOperand(1)))
2011  continue;
2012  }
2013  if (isa<ExtractElementInst>(U))
2014  continue;
2015  if (PrunedDAGInstrs.count(U))
2016  continue;
2017  NeedsExtraction = true;
2018  break;
2019  }
2020 
2021  if (NeedsExtraction) {
2022  int ESContrib;
2023  if (Ty1->isVectorTy()) {
2024  ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
2025  Ty1, VTy);
2026  ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
2028  } else
2029  ESContrib = (int) TTI->getVectorInstrCost(
2030  Instruction::ExtractElement, VTy, 0);
2031 
2032  DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
2033  *S->first << "} = " << ESContrib << "\n");
2034  EffSize -= ESContrib;
2035  }
2036 
2037  NeedsExtraction = false;
2038  for (User *U : S->second->users()) {
2039  if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) {
2040  // Shuffle can be folded if it has no other input
2041  if (isa<UndefValue>(SI->getOperand(1)))
2042  continue;
2043  }
2044  if (isa<ExtractElementInst>(U))
2045  continue;
2046  if (PrunedDAGInstrs.count(U))
2047  continue;
2048  NeedsExtraction = true;
2049  break;
2050  }
2051 
2052  if (NeedsExtraction) {
2053  int ESContrib;
2054  if (Ty2->isVectorTy()) {
2055  ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
2056  Ty2, VTy);
2057  ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
2059  Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2));
2060  } else
2061  ESContrib = (int) TTI->getVectorInstrCost(
2062  Instruction::ExtractElement, VTy, 1);
2063  DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
2064  *S->second << "} = " << ESContrib << "\n");
2065  EffSize -= ESContrib;
2066  }
2067  }
2068 
2069  // Compute the cost of incoming edges.
2070  if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) {
2071  Instruction *S1 = cast<Instruction>(S->first),
2072  *S2 = cast<Instruction>(S->second);
2073  for (unsigned o = 0; o < S1->getNumOperands(); ++o) {
2074  Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o);
2075 
2076  // Combining constants into vector constants (or small vector
2077  // constants into larger ones are assumed free).
2078  if (isa<Constant>(O1) && isa<Constant>(O2))
2079  continue;
2080 
2081  if (FlipOrder)
2082  std::swap(O1, O2);
2083 
2084  ValuePair VP = ValuePair(O1, O2);
2085  ValuePair VPR = ValuePair(O2, O1);
2086 
2087  // Internal edges are not handled here.
2088  if (PrunedDAG.count(VP) || PrunedDAG.count(VPR))
2089  continue;
2090 
2091  Type *Ty1 = O1->getType(),
2092  *Ty2 = O2->getType();
2093  Type *VTy = getVecTypeForPair(Ty1, Ty2);
2094 
2095  // Combining vector operations of the same type is also assumed
2096  // folded with other operations.
2097  if (Ty1 == Ty2) {
2098  // If both are insert elements, then both can be widened.
2100  *IEO2 = dyn_cast<InsertElementInst>(O2);
2101  if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2))
2102  continue;
2103  // If both are extract elements, and both have the same input
2104  // type, then they can be replaced with a shuffle
2106  *EIO2 = dyn_cast<ExtractElementInst>(O2);
2107  if (EIO1 && EIO2 &&
2108  EIO1->getOperand(0)->getType() ==
2109  EIO2->getOperand(0)->getType())
2110  continue;
2111  // If both are a shuffle with equal operand types and only two
2112  // unqiue operands, then they can be replaced with a single
2113  // shuffle
2115  *SIO2 = dyn_cast<ShuffleVectorInst>(O2);
2116  if (SIO1 && SIO2 &&
2117  SIO1->getOperand(0)->getType() ==
2118  SIO2->getOperand(0)->getType()) {
2119  SmallSet<Value *, 4> SIOps;
2120  SIOps.insert(SIO1->getOperand(0));
2121  SIOps.insert(SIO1->getOperand(1));
2122  SIOps.insert(SIO2->getOperand(0));
2123  SIOps.insert(SIO2->getOperand(1));
2124  if (SIOps.size() <= 2)
2125  continue;
2126  }
2127  }
2128 
2129  int ESContrib;
2130  // This pair has already been formed.
2131  if (IncomingPairs.count(VP)) {
2132  continue;
2133  } else if (IncomingPairs.count(VPR)) {
2134  ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
2135  VTy, VTy);
2136 
2137  if (VTy->getVectorNumElements() == 2)
2138  ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
2140  } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) {
2141  ESContrib = (int) TTI->getVectorInstrCost(
2142  Instruction::InsertElement, VTy, 0);
2143  ESContrib += (int) TTI->getVectorInstrCost(
2144  Instruction::InsertElement, VTy, 1);
2145  } else if (!Ty1->isVectorTy()) {
2146  // O1 needs to be inserted into a vector of size O2, and then
2147  // both need to be shuffled together.
2148  ESContrib = (int) TTI->getVectorInstrCost(
2149  Instruction::InsertElement, Ty2, 0);
2150  ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
2151  VTy, Ty2);
2152  } else if (!Ty2->isVectorTy()) {
2153  // O2 needs to be inserted into a vector of size O1, and then
2154  // both need to be shuffled together.
2155  ESContrib = (int) TTI->getVectorInstrCost(
2156  Instruction::InsertElement, Ty1, 0);
2157  ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
2158  VTy, Ty1);
2159  } else {
2160  Type *TyBig = Ty1, *TySmall = Ty2;
2161  if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements())
2162  std::swap(TyBig, TySmall);
2163 
2164  ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
2165  VTy, TyBig);
2166  if (TyBig != TySmall)
2167  ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
2168  TyBig, TySmall);
2169  }
2170 
2171  DEBUG(if (DebugPairSelection) dbgs() << "\tcost {"
2172  << *O1 << " <-> " << *O2 << "} = " <<
2173  ESContrib << "\n");
2174  EffSize -= ESContrib;
2175  IncomingPairs.insert(VP);
2176  }
2177  }
2178  }
2179 
2180  if (!HasNontrivialInsts) {
2181  DEBUG(if (DebugPairSelection) dbgs() <<
2182  "\tNo non-trivial instructions in DAG;"
2183  " override to zero effective size\n");
2184  EffSize = 0;
2185  }
2186  } else {
2187  for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
2188  E = PrunedDAG.end(); S != E; ++S)
2189  EffSize += (int) getDepthFactor(S->first);
2190  }
2191 
2193  dbgs() << "BBV: found pruned DAG for pair {"
2194  << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
2195  MaxDepth << " and size " << PrunedDAG.size() <<
2196  " (effective size: " << EffSize << ")\n");
2197  if (((TTI && !UseChainDepthWithTI) ||
2198  MaxDepth >= Config.ReqChainDepth) &&
2199  EffSize > 0 && EffSize > BestEffSize) {
2200  BestMaxDepth = MaxDepth;
2201  BestEffSize = EffSize;
2202  BestDAG = PrunedDAG;
2203  }
2204  }
2205  }
2206 
2207  // Given the list of candidate pairs, this function selects those
2208  // that will be fused into vector instructions.
2209  void BBVectorize::choosePairs(
2210  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
2211  DenseSet<ValuePair> &CandidatePairsSet,
2212  DenseMap<ValuePair, int> &CandidatePairCostSavings,
2213  std::vector<Value *> &PairableInsts,
2214  DenseSet<ValuePair> &FixedOrderPairs,
2215  DenseMap<VPPair, unsigned> &PairConnectionTypes,
2216  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
2217  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
2218  DenseSet<ValuePair> &PairableInstUsers,
2219  DenseMap<Value *, Value *>& ChosenPairs) {
2220  bool UseCycleCheck =
2221  CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck;
2222 
2223  DenseMap<Value *, std::vector<Value *> > CandidatePairs2;
2224  for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(),
2225  E = CandidatePairsSet.end(); I != E; ++I) {
2226  std::vector<Value *> &JJ = CandidatePairs2[I->second];
2227  if (JJ.empty()) JJ.reserve(32);
2228  JJ.push_back(I->first);
2229  }
2230 
2231  DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap;
2232  DenseSet<VPPair> PairableInstUserPairSet;
2233  for (std::vector<Value *>::iterator I = PairableInsts.begin(),
2234  E = PairableInsts.end(); I != E; ++I) {
2235  // The number of possible pairings for this variable:
2236  size_t NumChoices = CandidatePairs.lookup(*I).size();
2237  if (!NumChoices) continue;
2238 
2239  std::vector<Value *> &JJ = CandidatePairs[*I];
2240 
2241  // The best pair to choose and its dag:
2242  size_t BestMaxDepth = 0;
2243  int BestEffSize = 0;
2244  DenseSet<ValuePair> BestDAG;
2245  findBestDAGFor(CandidatePairs, CandidatePairsSet,
2246  CandidatePairCostSavings,
2247  PairableInsts, FixedOrderPairs, PairConnectionTypes,
2248  ConnectedPairs, ConnectedPairDeps,
2249  PairableInstUsers, PairableInstUserMap,
2250  PairableInstUserPairSet, ChosenPairs,
2251  BestDAG, BestMaxDepth, BestEffSize, *I, JJ,
2252  UseCycleCheck);
2253 
2254  if (BestDAG.empty())
2255  continue;
2256 
2257  // A dag has been chosen (or not) at this point. If no dag was
2258  // chosen, then this instruction, I, cannot be paired (and is no longer
2259  // considered).
2260 
2261  DEBUG(dbgs() << "BBV: selected pairs in the best DAG for: "
2262  << *cast<Instruction>(*I) << "\n");
2263 
2264  for (DenseSet<ValuePair>::iterator S = BestDAG.begin(),
2265  SE2 = BestDAG.end(); S != SE2; ++S) {
2266  // Insert the members of this dag into the list of chosen pairs.
2267  ChosenPairs.insert(ValuePair(S->first, S->second));
2268  DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " <<
2269  *S->second << "\n");
2270 
2271  // Remove all candidate pairs that have values in the chosen dag.
2272  std::vector<Value *> &KK = CandidatePairs[S->first];
2273  for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end();
2274  K != KE; ++K) {
2275  if (*K == S->second)
2276  continue;
2277 
2278  CandidatePairsSet.erase(ValuePair(S->first, *K));
2279  }
2280 
2281  std::vector<Value *> &LL = CandidatePairs2[S->second];
2282  for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end();
2283  L != LE; ++L) {
2284  if (*L == S->first)
2285  continue;
2286 
2287  CandidatePairsSet.erase(ValuePair(*L, S->second));
2288  }
2289 
2290  std::vector<Value *> &MM = CandidatePairs[S->second];
2291  for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end();
2292  M != ME; ++M) {
2293  assert(*M != S->first && "Flipped pair in candidate list?");
2294  CandidatePairsSet.erase(ValuePair(S->second, *M));
2295  }
2296 
2297  std::vector<Value *> &NN = CandidatePairs2[S->first];
2298  for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end();
2299  N != NE; ++N) {
2300  assert(*N != S->second && "Flipped pair in candidate list?");
2301  CandidatePairsSet.erase(ValuePair(*N, S->first));
2302  }
2303  }
2304  }
2305 
2306  DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n");
2307  }
2308 
2309  std::string getReplacementName(Instruction *I, bool IsInput, unsigned o,
2310  unsigned n = 0) {
2311  if (!I->hasName())
2312  return "";
2313 
2314  return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) +
2315  (n > 0 ? "." + utostr(n) : "")).str();
2316  }
2317 
2318  // Returns the value that is to be used as the pointer input to the vector
2319  // instruction that fuses I with J.
2320  Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context,
2321  Instruction *I, Instruction *J, unsigned o) {
2322  Value *IPtr, *JPtr;
2323  unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
2324  int64_t OffsetInElmts;
2325 
2326  // Note: the analysis might fail here, that is why the pair order has
2327  // been precomputed (OffsetInElmts must be unused here).
2328  (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
2329  IAddressSpace, JAddressSpace,
2330  OffsetInElmts, false);
2331 
2332  // The pointer value is taken to be the one with the lowest offset.
2333  Value *VPtr = IPtr;
2334 
2335  Type *ArgTypeI = IPtr->getType()->getPointerElementType();
2336  Type *ArgTypeJ = JPtr->getType()->getPointerElementType();
2337  Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2338  Type *VArgPtrType
2339  = PointerType::get(VArgType,
2340  IPtr->getType()->getPointerAddressSpace());
2341  return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
2342  /* insert before */ I);
2343  }
2344 
2345  void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
2346  unsigned MaskOffset, unsigned NumInElem,
2347  unsigned NumInElem1, unsigned IdxOffset,
2348  std::vector<Constant*> &Mask) {
2349  unsigned NumElem1 = J->getType()->getVectorNumElements();
2350  for (unsigned v = 0; v < NumElem1; ++v) {
2351  int m = cast<ShuffleVectorInst>(J)->getMaskValue(v);
2352  if (m < 0) {
2353  Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context));
2354  } else {
2355  unsigned mm = m + (int) IdxOffset;
2356  if (m >= (int) NumInElem1)
2357  mm += (int) NumInElem;
2358 
2359  Mask[v+MaskOffset] =
2360  ConstantInt::get(Type::getInt32Ty(Context), mm);
2361  }
2362  }
2363  }
2364 
2365  // Returns the value that is to be used as the vector-shuffle mask to the
2366  // vector instruction that fuses I with J.
2367  Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context,
2368  Instruction *I, Instruction *J) {
2369  // This is the shuffle mask. We need to append the second
2370  // mask to the first, and the numbers need to be adjusted.
2371 
2372  Type *ArgTypeI = I->getType();
2373  Type *ArgTypeJ = J->getType();
2374  Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2375 
2376  unsigned NumElemI = ArgTypeI->getVectorNumElements();
2377 
2378  // Get the total number of elements in the fused vector type.
2379  // By definition, this must equal the number of elements in
2380  // the final mask.
2381  unsigned NumElem = VArgType->getVectorNumElements();
2382  std::vector<Constant*> Mask(NumElem);
2383 
2384  Type *OpTypeI = I->getOperand(0)->getType();
2385  unsigned NumInElemI = OpTypeI->getVectorNumElements();
2386  Type *OpTypeJ = J->getOperand(0)->getType();
2387  unsigned NumInElemJ = OpTypeJ->getVectorNumElements();
2388 
2389  // The fused vector will be:
2390  // -----------------------------------------------------
2391  // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ |
2392  // -----------------------------------------------------
2393  // from which we'll extract NumElem total elements (where the first NumElemI
2394  // of them come from the mask in I and the remainder come from the mask
2395  // in J.
2396 
2397  // For the mask from the first pair...
2398  fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI,
2399  0, Mask);
2400 
2401  // For the mask from the second pair...
2402  fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ,
2403  NumInElemI, Mask);
2404 
2405  return ConstantVector::get(Mask);
2406  }
2407 
2408  bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I,
2409  Instruction *J, unsigned o, Value *&LOp,
2410  unsigned numElemL,
2411  Type *ArgTypeL, Type *ArgTypeH,
2412  bool IBeforeJ, unsigned IdxOff) {
2413  bool ExpandedIEChain = false;
2414  if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) {
2415  // If we have a pure insertelement chain, then this can be rewritten
2416  // into a chain that directly builds the larger type.
2417  if (isPureIEChain(LIE)) {
2418  SmallVector<Value *, 8> VectElemts(numElemL,
2419  UndefValue::get(ArgTypeL->getScalarType()));
2420  InsertElementInst *LIENext = LIE;
2421  do {
2422  unsigned Idx =
2423  cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue();
2424  VectElemts[Idx] = LIENext->getOperand(1);
2425  } while ((LIENext =
2426  dyn_cast<InsertElementInst>(LIENext->getOperand(0))));
2427 
2428  LIENext = nullptr;
2429  Value *LIEPrev = UndefValue::get(ArgTypeH);
2430  for (unsigned i = 0; i < numElemL; ++i) {
2431  if (isa<UndefValue>(VectElemts[i])) continue;
2432  LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i],
2434  i + IdxOff),
2435  getReplacementName(IBeforeJ ? I : J,
2436  true, o, i+1));
2437  LIENext->insertBefore(IBeforeJ ? J : I);
2438  LIEPrev = LIENext;
2439  }
2440 
2441  LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH);
2442  ExpandedIEChain = true;
2443  }
2444  }
2445 
2446  return ExpandedIEChain;
2447  }
2448 
2449  static unsigned getNumScalarElements(Type *Ty) {
2450  if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
2451  return VecTy->getNumElements();
2452  return 1;
2453  }
2454 
2455  // Returns the value to be used as the specified operand of the vector
2456  // instruction that fuses I with J.
2457  Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
2458  Instruction *J, unsigned o, bool IBeforeJ) {
2459  Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
2460  Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
2461 
2462  // Compute the fused vector type for this operand
2463  Type *ArgTypeI = I->getOperand(o)->getType();
2464  Type *ArgTypeJ = J->getOperand(o)->getType();
2465  VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2466 
2467  Instruction *L = I, *H = J;
2468  Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ;
2469 
2470  unsigned numElemL = getNumScalarElements(ArgTypeL);
2471  unsigned numElemH = getNumScalarElements(ArgTypeH);
2472 
2473  Value *LOp = L->getOperand(o);
2474  Value *HOp = H->getOperand(o);
2475  unsigned numElem = VArgType->getNumElements();
2476 
2477  // First, we check if we can reuse the "original" vector outputs (if these
2478  // exist). We might need a shuffle.
2483 
2484  // FIXME: If we're fusing shuffle instructions, then we can't apply this
2485  // optimization. The input vectors to the shuffle might be a different
2486  // length from the shuffle outputs. Unfortunately, the replacement
2487  // shuffle mask has already been formed, and the mask entries are sensitive
2488  // to the sizes of the inputs.
2489  bool IsSizeChangeShuffle =
2490  isa<ShuffleVectorInst>(L) &&
2491  (LOp->getType() != L->getType() || HOp->getType() != H->getType());
2492 
2493  if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) {
2494  // We can have at most two unique vector inputs.
2495  bool CanUseInputs = true;
2496  Value *I1, *I2 = nullptr;
2497  if (LEE) {
2498  I1 = LEE->getOperand(0);
2499  } else {
2500  I1 = LSV->getOperand(0);
2501  I2 = LSV->getOperand(1);
2502  if (I2 == I1 || isa<UndefValue>(I2))
2503  I2 = nullptr;
2504  }
2505 
2506  if (HEE) {
2507  Value *I3 = HEE->getOperand(0);
2508  if (!I2 && I3 != I1)
2509  I2 = I3;
2510  else if (I3 != I1 && I3 != I2)
2511  CanUseInputs = false;
2512  } else {
2513  Value *I3 = HSV->getOperand(0);
2514  if (!I2 && I3 != I1)
2515  I2 = I3;
2516  else if (I3 != I1 && I3 != I2)
2517  CanUseInputs = false;
2518 
2519  if (CanUseInputs) {
2520  Value *I4 = HSV->getOperand(1);
2521  if (!isa<UndefValue>(I4)) {
2522  if (!I2 && I4 != I1)
2523  I2 = I4;
2524  else if (I4 != I1 && I4 != I2)
2525  CanUseInputs = false;
2526  }
2527  }
2528  }
2529 
2530  if (CanUseInputs) {
2531  unsigned LOpElem =
2532  cast<Instruction>(LOp)->getOperand(0)->getType()
2533  ->getVectorNumElements();
2534 
2535  unsigned HOpElem =
2536  cast<Instruction>(HOp)->getOperand(0)->getType()
2537  ->getVectorNumElements();
2538 
2539  // We have one or two input vectors. We need to map each index of the
2540  // operands to the index of the original vector.
2541  SmallVector<std::pair<int, int>, 8> II(numElem);
2542  for (unsigned i = 0; i < numElemL; ++i) {
2543  int Idx, INum;
2544  if (LEE) {
2545  Idx =
2546  cast<ConstantInt>(LEE->getOperand(1))->getSExtValue();
2547  INum = LEE->getOperand(0) == I1 ? 0 : 1;
2548  } else {
2549  Idx = LSV->getMaskValue(i);
2550  if (Idx < (int) LOpElem) {
2551  INum = LSV->getOperand(0) == I1 ? 0 : 1;
2552  } else {
2553  Idx -= LOpElem;
2554  INum = LSV->getOperand(1) == I1 ? 0 : 1;
2555  }
2556  }
2557 
2558  II[i] = std::pair<int, int>(Idx, INum);
2559  }
2560  for (unsigned i = 0; i < numElemH; ++i) {
2561  int Idx, INum;
2562  if (HEE) {
2563  Idx =
2564  cast<ConstantInt>(HEE->getOperand(1))->getSExtValue();
2565  INum = HEE->getOperand(0) == I1 ? 0 : 1;
2566  } else {
2567  Idx = HSV->getMaskValue(i);
2568  if (Idx < (int) HOpElem) {
2569  INum = HSV->getOperand(0) == I1 ? 0 : 1;
2570  } else {
2571  Idx -= HOpElem;
2572  INum = HSV->getOperand(1) == I1 ? 0 : 1;
2573  }
2574  }
2575 
2576  II[i + numElemL] = std::pair<int, int>(Idx, INum);
2577  }
2578 
2579  // We now have an array which tells us from which index of which
2580  // input vector each element of the operand comes.
2581  VectorType *I1T = cast<VectorType>(I1->getType());
2582  unsigned I1Elem = I1T->getNumElements();
2583 
2584  if (!I2) {
2585  // In this case there is only one underlying vector input. Check for
2586  // the trivial case where we can use the input directly.
2587  if (I1Elem == numElem) {
2588  bool ElemInOrder = true;
2589  for (unsigned i = 0; i < numElem; ++i) {
2590  if (II[i].first != (int) i && II[i].first != -1) {
2591  ElemInOrder = false;
2592  break;
2593  }
2594  }
2595 
2596  if (ElemInOrder)
2597  return I1;
2598  }
2599 
2600  // A shuffle is needed.
2601  std::vector<Constant *> Mask(numElem);
2602  for (unsigned i = 0; i < numElem; ++i) {
2603  int Idx = II[i].first;
2604  if (Idx == -1)
2605  Mask[i] = UndefValue::get(Type::getInt32Ty(Context));
2606  else
2607  Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
2608  }
2609 
2610  Instruction *S =
2611  new ShuffleVectorInst(I1, UndefValue::get(I1T),
2612  ConstantVector::get(Mask),
2613  getReplacementName(IBeforeJ ? I : J,
2614  true, o));
2615  S->insertBefore(IBeforeJ ? J : I);
2616  return S;
2617  }
2618 
2619  VectorType *I2T = cast<VectorType>(I2->getType());
2620  unsigned I2Elem = I2T->getNumElements();
2621 
2622  // This input comes from two distinct vectors. The first step is to
2623  // make sure that both vectors are the same length. If not, the
2624  // smaller one will need to grow before they can be shuffled together.
2625  if (I1Elem < I2Elem) {
2626  std::vector<Constant *> Mask(I2Elem);
2627  unsigned v = 0;
2628  for (; v < I1Elem; ++v)
2629  Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2630  for (; v < I2Elem; ++v)
2631  Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2632 
2633  Instruction *NewI1 =
2634  new ShuffleVectorInst(I1, UndefValue::get(I1T),
2635  ConstantVector::get(Mask),
2636  getReplacementName(IBeforeJ ? I : J,
2637  true, o, 1));
2638  NewI1->insertBefore(IBeforeJ ? J : I);
2639  I1 = NewI1;
2640  I1Elem = I2Elem;
2641  } else if (I1Elem > I2Elem) {
2642  std::vector<Constant *> Mask(I1Elem);
2643  unsigned v = 0;
2644  for (; v < I2Elem; ++v)
2645  Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2646  for (; v < I1Elem; ++v)
2647  Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2648 
2649  Instruction *NewI2 =
2650  new ShuffleVectorInst(I2, UndefValue::get(I2T),
2651  ConstantVector::get(Mask),
2652  getReplacementName(IBeforeJ ? I : J,
2653  true, o, 1));
2654  NewI2->insertBefore(IBeforeJ ? J : I);
2655  I2 = NewI2;
2656  }
2657 
2658  // Now that both I1 and I2 are the same length we can shuffle them
2659  // together (and use the result).
2660  std::vector<Constant *> Mask(numElem);
2661  for (unsigned v = 0; v < numElem; ++v) {
2662  if (II[v].first == -1) {
2663  Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2664  } else {
2665  int Idx = II[v].first + II[v].second * I1Elem;
2666  Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
2667  }
2668  }
2669 
2670  Instruction *NewOp =
2671  new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask),
2672  getReplacementName(IBeforeJ ? I : J, true, o));
2673  NewOp->insertBefore(IBeforeJ ? J : I);
2674  return NewOp;
2675  }
2676  }
2677 
2678  Type *ArgType = ArgTypeL;
2679  if (numElemL < numElemH) {
2680  if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH,
2681  ArgTypeL, VArgType, IBeforeJ, 1)) {
2682  // This is another short-circuit case: we're combining a scalar into
2683  // a vector that is formed by an IE chain. We've just expanded the IE
2684  // chain, now insert the scalar and we're done.
2685 
2686  Instruction *S = InsertElementInst::Create(HOp, LOp, CV0,
2687  getReplacementName(IBeforeJ ? I : J, true, o));
2688  S->insertBefore(IBeforeJ ? J : I);
2689  return S;
2690  } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL,
2691  ArgTypeH, IBeforeJ)) {
2692  // The two vector inputs to the shuffle must be the same length,
2693  // so extend the smaller vector to be the same length as the larger one.
2694  Instruction *NLOp;
2695  if (numElemL > 1) {
2696 
2697  std::vector<Constant *> Mask(numElemH);
2698  unsigned v = 0;
2699  for (; v < numElemL; ++v)
2700  Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2701  for (; v < numElemH; ++v)
2702  Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2703 
2704  NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
2705  ConstantVector::get(Mask),
2706  getReplacementName(IBeforeJ ? I : J,
2707  true, o, 1));
2708  } else {
2709  NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0,
2710  getReplacementName(IBeforeJ ? I : J,
2711  true, o, 1));
2712  }
2713 
2714  NLOp->insertBefore(IBeforeJ ? J : I);
2715  LOp = NLOp;
2716  }
2717 
2718  ArgType = ArgTypeH;
2719  } else if (numElemL > numElemH) {
2720  if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL,
2721  ArgTypeH, VArgType, IBeforeJ)) {
2722  Instruction *S =
2723  InsertElementInst::Create(LOp, HOp,
2725  numElemL),
2726  getReplacementName(IBeforeJ ? I : J,
2727  true, o));
2728  S->insertBefore(IBeforeJ ? J : I);
2729  return S;
2730  } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH,
2731  ArgTypeL, IBeforeJ)) {
2732  Instruction *NHOp;
2733  if (numElemH > 1) {
2734  std::vector<Constant *> Mask(numElemL);
2735  unsigned v = 0;
2736  for (; v < numElemH; ++v)
2737  Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2738  for (; v < numElemL; ++v)
2739  Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2740 
2741  NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
2742  ConstantVector::get(Mask),
2743  getReplacementName(IBeforeJ ? I : J,
2744  true, o, 1));
2745  } else {
2746  NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0,
2747  getReplacementName(IBeforeJ ? I : J,
2748  true, o, 1));
2749  }
2750 
2751  NHOp->insertBefore(IBeforeJ ? J : I);
2752  HOp = NHOp;
2753  }
2754  }
2755 
2756  if (ArgType->isVectorTy()) {
2757  unsigned numElem = VArgType->getVectorNumElements();
2758  std::vector<Constant*> Mask(numElem);
2759  for (unsigned v = 0; v < numElem; ++v) {
2760  unsigned Idx = v;
2761  // If the low vector was expanded, we need to skip the extra
2762  // undefined entries.
2763  if (v >= numElemL && numElemH > numElemL)
2764  Idx += (numElemH - numElemL);
2765  Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
2766  }
2767 
2768  Instruction *BV = new ShuffleVectorInst(LOp, HOp,
2769  ConstantVector::get(Mask),
2770  getReplacementName(IBeforeJ ? I : J, true, o));
2771  BV->insertBefore(IBeforeJ ? J : I);
2772  return BV;
2773  }
2774 
2776  UndefValue::get(VArgType), LOp, CV0,
2777  getReplacementName(IBeforeJ ? I : J,
2778  true, o, 1));
2779  BV1->insertBefore(IBeforeJ ? J : I);
2780  Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1,
2781  getReplacementName(IBeforeJ ? I : J,
2782  true, o, 2));
2783  BV2->insertBefore(IBeforeJ ? J : I);
2784  return BV2;
2785  }
2786 
2787  // This function creates an array of values that will be used as the inputs
2788  // to the vector instruction that fuses I with J.
2789  void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
2790  Instruction *I, Instruction *J,
2791  SmallVectorImpl<Value *> &ReplacedOperands,
2792  bool IBeforeJ) {
2793  unsigned NumOperands = I->getNumOperands();
2794 
2795  for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) {
2796  // Iterate backward so that we look at the store pointer
2797  // first and know whether or not we need to flip the inputs.
2798 
2799  if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) {
2800  // This is the pointer for a load/store instruction.
2801  ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o);
2802  continue;
2803  } else if (isa<CallInst>(I)) {
2804  Function *F = cast<CallInst>(I)->getCalledFunction();
2805  Intrinsic::ID IID = F->getIntrinsicID();
2806  if (o == NumOperands-1) {
2807  BasicBlock &BB = *I->getParent();
2808 
2809  Module *M = BB.getParent()->getParent();
2810  Type *ArgTypeI = I->getType();
2811  Type *ArgTypeJ = J->getType();
2812  Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2813 
2814  ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType);
2815  continue;
2816  } else if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
2817  IID == Intrinsic::cttz) && o == 1) {
2818  // The second argument of powi/ctlz/cttz is a single integer/constant
2819  // and we've already checked that both arguments are equal.
2820  // As a result, we just keep I's second argument.
2821  ReplacedOperands[o] = I->getOperand(o);
2822  continue;
2823  }
2824  } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) {
2825  ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J);
2826  continue;
2827  }
2828 
2829  ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ);
2830  }
2831  }
2832 
2833  // This function creates two values that represent the outputs of the
2834  // original I and J instructions. These are generally vector shuffles
2835  // or extracts. In many cases, these will end up being unused and, thus,
2836  // eliminated by later passes.
2837  void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
2838  Instruction *J, Instruction *K,
2839  Instruction *&InsertionPt,
2840  Instruction *&K1, Instruction *&K2) {
2841  if (isa<StoreInst>(I))
2842  return;
2843 
2844  Type *IType = I->getType();
2845  Type *JType = J->getType();
2846 
2847  VectorType *VType = getVecTypeForPair(IType, JType);
2848  unsigned numElem = VType->getNumElements();
2849 
2850  unsigned numElemI = getNumScalarElements(IType);
2851  unsigned numElemJ = getNumScalarElements(JType);
2852 
2853  if (IType->isVectorTy()) {
2854  std::vector<Constant *> Mask1(numElemI), Mask2(numElemI);
2855  for (unsigned v = 0; v < numElemI; ++v) {
2856  Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2857  Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ + v);
2858  }
2859 
2860  K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
2861  ConstantVector::get(Mask1),
2862  getReplacementName(K, false, 1));
2863  } else {
2864  Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
2865  K1 = ExtractElementInst::Create(K, CV0, getReplacementName(K, false, 1));
2866  }
2867 
2868  if (JType->isVectorTy()) {
2869  std::vector<Constant *> Mask1(numElemJ), Mask2(numElemJ);
2870  for (unsigned v = 0; v < numElemJ; ++v) {
2871  Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2872  Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI + v);
2873  }
2874 
2875  K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
2876  ConstantVector::get(Mask2),
2877  getReplacementName(K, false, 2));
2878  } else {
2879  Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem - 1);
2880  K2 = ExtractElementInst::Create(K, CV1, getReplacementName(K, false, 2));
2881  }
2882 
2883  K1->insertAfter(K);
2884  K2->insertAfter(K1);
2885  InsertionPt = K2;
2886  }
2887 
2888  // Move all uses of the function I (including pairing-induced uses) after J.
2889  bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB,
2890  DenseSet<ValuePair> &LoadMoveSetPairs,
2891  Instruction *I, Instruction *J) {
2892  // Skip to the first instruction past I.
2893  BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
2894 
2896  AliasSetTracker WriteSet(*AA);
2897  if (I->mayWriteToMemory()) WriteSet.add(I);
2898 
2899  for (; cast<Instruction>(L) != J; ++L)
2900  (void)trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs);
2901 
2902  assert(cast<Instruction>(L) == J &&
2903  "Tracking has not proceeded far enough to check for dependencies");
2904  // If J is now in the use set of I, then trackUsesOfI will return true
2905  // and we have a dependency cycle (and the fusing operation must abort).
2906  return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs);
2907  }
2908 
2909  // Move all uses of the function I (including pairing-induced uses) after J.
2910  void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB,
2911  DenseSet<ValuePair> &LoadMoveSetPairs,
2912  Instruction *&InsertionPt,
2913  Instruction *I, Instruction *J) {
2914  // Skip to the first instruction past I.
2915  BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
2916 
2918  AliasSetTracker WriteSet(*AA);
2919  if (I->mayWriteToMemory()) WriteSet.add(I);
2920 
2921  for (; cast<Instruction>(L) != J;) {
2922  if (trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs)) {
2923  // Move this instruction
2924  Instruction *InstToMove = &*L++;
2925 
2926  DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<
2927  " to after " << *InsertionPt << "\n");
2928  InstToMove->removeFromParent();
2929  InstToMove->insertAfter(InsertionPt);
2930  InsertionPt = InstToMove;
2931  } else {
2932  ++L;
2933  }
2934  }
2935  }
2936 
2937  // Collect all load instruction that are in the move set of a given first
2938  // pair member. These loads depend on the first instruction, I, and so need
2939  // to be moved after J (the second instruction) when the pair is fused.
2940  void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB,
2941  DenseMap<Value *, Value *> &ChosenPairs,
2942  DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
2943  DenseSet<ValuePair> &LoadMoveSetPairs,
2944  Instruction *I) {
2945  // Skip to the first instruction past I.
2946  BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
2947 
2949  AliasSetTracker WriteSet(*AA);
2950  if (I->mayWriteToMemory()) WriteSet.add(I);
2951 
2952  // Note: We cannot end the loop when we reach J because J could be moved
2953  // farther down the use chain by another instruction pairing. Also, J
2954  // could be before I if this is an inverted input.
2955  for (BasicBlock::iterator E = BB.end(); L != E; ++L) {
2956  if (trackUsesOfI(Users, WriteSet, I, &*L)) {
2957  if (L->mayReadFromMemory()) {
2958  LoadMoveSet[&*L].push_back(I);
2959  LoadMoveSetPairs.insert(ValuePair(&*L, I));
2960  }
2961  }
2962  }
2963  }
2964 
2965  // In cases where both load/stores and the computation of their pointers
2966  // are chosen for vectorization, we can end up in a situation where the
2967  // aliasing analysis starts returning different query results as the
2968  // process of fusing instruction pairs continues. Because the algorithm
2969  // relies on finding the same use dags here as were found earlier, we'll
2970  // need to precompute the necessary aliasing information here and then
2971  // manually update it during the fusion process.
2972  void BBVectorize::collectLoadMoveSet(BasicBlock &BB,
2973  std::vector<Value *> &PairableInsts,
2974  DenseMap<Value *, Value *> &ChosenPairs,
2975  DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
2976  DenseSet<ValuePair> &LoadMoveSetPairs) {
2977  for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
2978  PIE = PairableInsts.end(); PI != PIE; ++PI) {
2979  DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI);
2980  if (P == ChosenPairs.end()) continue;
2981 
2982  Instruction *I = cast<Instruction>(P->first);
2983  collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet,
2984  LoadMoveSetPairs, I);
2985  }
2986  }
2987 
2988  // This function fuses the chosen instruction pairs into vector instructions,
2989  // taking care preserve any needed scalar outputs and, then, it reorders the
2990  // remaining instructions as needed (users of the first member of the pair
2991  // need to be moved to after the location of the second member of the pair
2992  // because the vector instruction is inserted in the location of the pair's
2993  // second member).
2994  void BBVectorize::fuseChosenPairs(BasicBlock &BB,
2995  std::vector<Value *> &PairableInsts,
2996  DenseMap<Value *, Value *> &ChosenPairs,
2997  DenseSet<ValuePair> &FixedOrderPairs,
2998  DenseMap<VPPair, unsigned> &PairConnectionTypes,
2999  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
3000  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) {
3001  LLVMContext& Context = BB.getContext();
3002 
3003  // During the vectorization process, the order of the pairs to be fused
3004  // could be flipped. So we'll add each pair, flipped, into the ChosenPairs
3005  // list. After a pair is fused, the flipped pair is removed from the list.
3006  DenseSet<ValuePair> FlippedPairs;
3007  for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(),
3008  E = ChosenPairs.end(); P != E; ++P)
3009  FlippedPairs.insert(ValuePair(P->second, P->first));
3010  for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(),
3011  E = FlippedPairs.end(); P != E; ++P)
3012  ChosenPairs.insert(*P);
3013 
3015  DenseSet<ValuePair> LoadMoveSetPairs;
3016  collectLoadMoveSet(BB, PairableInsts, ChosenPairs,
3017  LoadMoveSet, LoadMoveSetPairs);
3018 
3019  DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
3020 
3021  for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
3022  DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(&*PI);
3023  if (P == ChosenPairs.end()) {
3024  ++PI;
3025  continue;
3026  }
3027 
3028  if (getDepthFactor(P->first) == 0) {
3029  // These instructions are not really fused, but are tracked as though
3030  // they are. Any case in which it would be interesting to fuse them
3031  // will be taken care of by InstCombine.
3032  --NumFusedOps;
3033  ++PI;
3034  continue;
3035  }
3036 
3037  Instruction *I = cast<Instruction>(P->first),
3038  *J = cast<Instruction>(P->second);
3039 
3040  DEBUG(dbgs() << "BBV: fusing: " << *I <<
3041  " <-> " << *J << "\n");
3042 
3043  // Remove the pair and flipped pair from the list.
3044  DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second);
3045  assert(FP != ChosenPairs.end() && "Flipped pair not found in list");
3046  ChosenPairs.erase(FP);
3047  ChosenPairs.erase(P);
3048 
3049  if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) {
3050  DEBUG(dbgs() << "BBV: fusion of: " << *I <<
3051  " <-> " << *J <<
3052  " aborted because of non-trivial dependency cycle\n");
3053  --NumFusedOps;
3054  ++PI;
3055  continue;
3056  }
3057 
3058  // If the pair must have the other order, then flip it.
3059  bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I));
3060  if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) {
3061  // This pair does not have a fixed order, and so we might want to
3062  // flip it if that will yield fewer shuffles. We count the number
3063  // of dependencies connected via swaps, and those directly connected,
3064  // and flip the order if the number of swaps is greater.
3065  bool OrigOrder = true;
3067  ConnectedPairDeps.find(ValuePair(I, J));
3068  if (IJ == ConnectedPairDeps.end()) {
3069  IJ = ConnectedPairDeps.find(ValuePair(J, I));
3070  OrigOrder = false;
3071  }
3072 
3073  if (IJ != ConnectedPairDeps.end()) {
3074  unsigned NumDepsDirect = 0, NumDepsSwap = 0;
3075  for (std::vector<ValuePair>::iterator T = IJ->second.begin(),
3076  TE = IJ->second.end(); T != TE; ++T) {
3077  VPPair Q(IJ->first, *T);
3079  PairConnectionTypes.find(VPPair(Q.second, Q.first));
3080  assert(R != PairConnectionTypes.end() &&
3081  "Cannot find pair connection type");
3082  if (R->second == PairConnectionDirect)
3083  ++NumDepsDirect;
3084  else if (R->second == PairConnectionSwap)
3085  ++NumDepsSwap;
3086  }
3087 
3088  if (!OrigOrder)
3089  std::swap(NumDepsDirect, NumDepsSwap);
3090 
3091  if (NumDepsSwap > NumDepsDirect) {
3092  FlipPairOrder = true;
3093  DEBUG(dbgs() << "BBV: reordering pair: " << *I <<
3094  " <-> " << *J << "\n");
3095  }
3096  }
3097  }
3098 
3099  Instruction *L = I, *H = J;
3100  if (FlipPairOrder)
3101  std::swap(H, L);
3102 
3103  // If the pair being fused uses the opposite order from that in the pair
3104  // connection map, then we need to flip the types.
3106  ConnectedPairs.find(ValuePair(H, L));
3107  if (HL != ConnectedPairs.end())
3108  for (std::vector<ValuePair>::iterator T = HL->second.begin(),
3109  TE = HL->second.end(); T != TE; ++T) {
3110  VPPair Q(HL->first, *T);
3111  DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q);
3112  assert(R != PairConnectionTypes.end() &&
3113  "Cannot find pair connection type");
3114  if (R->second == PairConnectionDirect)
3115  R->second = PairConnectionSwap;
3116  else if (R->second == PairConnectionSwap)
3117  R->second = PairConnectionDirect;
3118  }
3119 
3120  bool LBeforeH = !FlipPairOrder;
3121  unsigned NumOperands = I->getNumOperands();
3122  SmallVector<Value *, 3> ReplacedOperands(NumOperands);
3123  getReplacementInputsForPair(Context, L, H, ReplacedOperands,
3124  LBeforeH);
3125 
3126  // Make a copy of the original operation, change its type to the vector
3127  // type and replace its operands with the vector operands.
3128  Instruction *K = L->clone();
3129  if (L->hasName())
3130  K->takeName(L);
3131  else if (H->hasName())
3132  K->takeName(H);
3133 
3134  if (auto CS = CallSite(K)) {
3136  FunctionType *Old = CS.getFunctionType();
3137  unsigned NumOld = Old->getNumParams();
3138  assert(NumOld <= ReplacedOperands.size());
3139  for (unsigned i = 0; i != NumOld; ++i)
3140  Tys.push_back(ReplacedOperands[i]->getType());
3141  CS.mutateFunctionType(
3142  FunctionType::get(getVecTypeForPair(L->getType(), H->getType()),
3143  Tys, Old->isVarArg()));
3144  } else if (!isa<StoreInst>(K))
3145  K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
3146 
3147  unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
3150  combineMetadata(K, H, KnownIDs);
3151  K->andIRFlags(H);
3152 
3153  for (unsigned o = 0; o < NumOperands; ++o)
3154  K->setOperand(o, ReplacedOperands[o]);
3155 
3156  K->insertAfter(J);
3157 
3158  // Instruction insertion point:
3159  Instruction *InsertionPt = K;
3160  Instruction *K1 = nullptr, *K2 = nullptr;
3161  replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2);
3162 
3163  // The use dag of the first original instruction must be moved to after
3164  // the location of the second instruction. The entire use dag of the
3165  // first instruction is disjoint from the input dag of the second
3166  // (by definition), and so commutes with it.
3167 
3168  moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J);
3169 
3170  if (!isa<StoreInst>(I)) {
3171  L->replaceAllUsesWith(K1);
3172  H->replaceAllUsesWith(K2);
3173  }
3174 
3175  // Instructions that may read from memory may be in the load move set.
3176  // Once an instruction is fused, we no longer need its move set, and so
3177  // the values of the map never need to be updated. However, when a load
3178  // is fused, we need to merge the entries from both instructions in the
3179  // pair in case those instructions were in the move set of some other
3180  // yet-to-be-fused pair. The loads in question are the keys of the map.
3181  if (I->mayReadFromMemory()) {
3182  std::vector<ValuePair> NewSetMembers;
3184  LoadMoveSet.find(I);
3185  if (II != LoadMoveSet.end())
3186  for (std::vector<Value *>::iterator N = II->second.begin(),
3187  NE = II->second.end(); N != NE; ++N)
3188  NewSetMembers.push_back(ValuePair(K, *N));
3190  LoadMoveSet.find(J);
3191  if (JJ != LoadMoveSet.end())
3192  for (std::vector<Value *>::iterator N = JJ->second.begin(),
3193  NE = JJ->second.end(); N != NE; ++N)
3194  NewSetMembers.push_back(ValuePair(K, *N));
3195  for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(),
3196  AE = NewSetMembers.end(); A != AE; ++A) {
3197  LoadMoveSet[A->first].push_back(A->second);
3198  LoadMoveSetPairs.insert(*A);
3199  }
3200  }
3201 
3202  // Before removing I, set the iterator to the next instruction.
3203  PI = std::next(BasicBlock::iterator(I));
3204  if (cast<Instruction>(PI) == J)
3205  ++PI;
3206 
3207  SE->forgetValue(I);
3208  SE->forgetValue(J);
3209  I->eraseFromParent();
3210  J->eraseFromParent();
3211 
3212  DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" <<
3213  BB << "\n");
3214  }
3215 
3216  DEBUG(dbgs() << "BBV: final: \n" << BB << "\n");
3217  }
3218 }
3219 
3220 char BBVectorize::ID = 0;
3221 static const char bb_vectorize_name[] = "Basic-Block Vectorization";
3222 INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
3231 INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
3232 
3234  return new BBVectorize(C);
3235 }
3236 
3237 bool
3239  BBVectorize BBVectorizer(P, *BB.getParent(), C);
3240  return BBVectorizer.vectorizeBB(BB);
3241 }
3242 
3243 //===----------------------------------------------------------------------===//
3253  VectorizeFMA = !::NoFMA;
3255  VectorizeCmp = !::NoCmp;
3256  VectorizeGEP = !::NoGEP;
3263  MaxInsts = ::MaxInsts;
3264  MaxPairs = ::MaxPairs;
3265  MaxIter = ::MaxIter;
3268  FastDep = ::FastDep;
3269 }
Legacy wrapper pass to provide the GlobalsAAResult object.
MachineLoop * L
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:81
bool VectorizeFMA
Vectorize the fused-multiply-add intrinsic.
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:76
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:102
static cl::opt< unsigned > MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden, cl::desc("The maximum number of candidate instruction pairs per group"))
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:870
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
LLVMContext & Context
bool hasName() const
Definition: Value.h:236
STATISTIC(NumFunctions,"Total number of functions")
size_t i
ValueT lookup(const KeyT &Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:162
This is the interface for a simple mod/ref and alias analysis over globals.
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:52
unsigned getNumParams() const
Return the number of fixed parameters this function type requires.
Definition: DerivedTypes.h:137
const Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
static cl::opt< bool > DebugPairSelection("bb-vectorize-debug-pair-selection", cl::init(false), cl::Hidden, cl::desc("When debugging is enabled, output information on the"" pair-selection process"))
iterator find(const ValueT &V)
Definition: DenseSet.h:146
Implements a dense probed hash-table based set.
Definition: DenseSet.h:202
unsigned getNumOperands() const
Definition: User.h:167
unsigned getPrefTypeAlignment(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:699
The main scalar evolution driver.
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:84
This class represents a function call, abstracting a target machine's calling convention.
void initializeBBVectorizePass(PassRegistry &)
This file contains the declarations for metadata subclasses.
static bool runOnBasicBlock(BasicBlock &BB)
static cl::opt< bool > NoMemOpBoost("bb-vectorize-no-mem-op-boost", cl::init(false), cl::Hidden, cl::desc("Don't boost the chain-depth contribution of loads and stores"))
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space...
Definition: Type.cpp:655
static uint64_t round(uint64_t Acc, uint64_t Input)
Definition: xxhash.cpp:57
static cl::opt< bool > IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false), cl::Hidden, cl::desc("Ignore target information"))
This instruction constructs a fixed permutation of two input vectors.
bool VectorizeMath
Vectorize floating-point math intrinsics.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:100
unsigned getNumberOfParts(Type *Tp) const
bool SimplifyInstructionsInBlock(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr)
Scan the specified basic block and try to simplify any instructions in it and recursively delete dead...
Definition: Local.cpp:489
An instruction for reading from memory.
Definition: Instructions.h:164
iv Induction Variable Users
Definition: IVUsers.cpp:51
op_iterator op_begin()
Definition: User.h:205
static cl::opt< unsigned > VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden, cl::desc("The size of the native vector registers"))
Type * getPointerElementType() const
Definition: Type.h:358
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:191
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:239
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:172
AnalysisUsage & addRequired()
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:53
This is the interface for a SCEV-based alias analysis.
static cl::opt< bool > AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden, cl::desc("Only generate aligned loads and stores"))
This class represents the LLVM 'select' instruction.
unsigned MaxCandPairsForCycleCheck
The maximum number of candidate pairs with which to use a full cycle check.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:578
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:143
const_iterator end() const
A Use represents the edge between a Value definition and its users.
Definition: Use.h:56
unsigned getNumArgOperands() const
Return the number of call arguments.
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:994
Windows NT (Windows on ARM)
size_type size() const
Definition: SmallSet.h:59
Check for equivalence ignoring load/store alignment.
Definition: Instruction.h:486
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following: ...
static cl::opt< unsigned > MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, cl::desc("The maximum number of pairable instructions per group"))
static const char bb_vectorize_name[]
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:60
bool Pow2LenOnly
Don't try to form odd-length vectors.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:399
const_iterator begin() const
user_iterator_impl< User > user_iterator
Definition: Value.h:340
Class to represent function types.
Definition: DerivedTypes.h:102
#define F(x, y, z)
Definition: MD5.cpp:51
Check for equivalence treating a type and a vector of that type as equivalent.
Definition: Instruction.h:489
static cl::opt< bool > NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize floating-point math intrinsics"))
bool mayReadFromMemory() const
Return true if this instruction may read memory.
static cl::opt< bool > UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false), cl::Hidden, cl::desc("Use the chain depth requirement with"" target information"))
#define T
static cl::opt< bool > NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"))
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Definition: Type.cpp:646
void andIRFlags(const Value *V)
Logical 'and' of any supported wrapping, exact, and fast-math flags of V and this instruction...
Function Alias Analysis false
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const
This class represents a no-op cast from one type to another.
static FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
Definition: Type.cpp:291
static std::string utostr(uint64_t X, bool isNeg=false)
Definition: StringExtras.h:79
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:949
An instruction for storing to memory.
Definition: Instructions.h:300
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:401
Reverse the order of the vector.
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:263
bool VectorizePointers
Vectorize pointer values.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:96
Type * getScalarType() const LLVM_READONLY
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.cpp:44
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:157
bool VectorizeCmp
Vectorize comparison instructions.
Maximum length of the test input libFuzzer tries to guess a good value based on the corpus and reports it always prefer smaller inputs during the corpus shuffle When libFuzzer itself reports a bug this exit code will be used If indicates the maximal total time in seconds to run the fuzzer minimizes the provided crash input Use with etc Experimental Use value profile to guide fuzzing Number of simultaneous worker processes to run the jobs If min(jobs, NumberOfCpuCores()/2)\" is used.") FUZZER_FLAG_INT(reload
static GCRegistry::Add< CoreCLRGC > E("coreclr","CoreCLR-compatible GC")
unsigned MaxIter
The maximum number of pairing iterations.
bool VectorizeMemOps
Vectorize loads and stores.
static cl::opt< bool > FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden, cl::desc("Use a fast instruction dependency analysis"))
ExtractSubvector Index indicates start offset.
static cl::opt< bool > NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize comparison instructions"))
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef< Type * > Tys, FastMathFlags FMF) const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs ...
Definition: Instructions.h:830
bool isX86_MMXTy() const
Return true if this is X86 MMX.
Definition: Type.h:180
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:201
#define P(N)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:395
This instruction inserts a single (scalar) element into a VectorType value.
iterator begin() const
Definition: LoopInfo.h:132
unsigned getAlignment() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:348
Wrapper pass for TargetTransformInfo.
bool AlignedOnly
Only generate aligned loads and stores.
static cl::opt< bool > NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize floating-point values"))
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction...
Definition: Instruction.cpp:82
LLVM Basic Block Representation.
Definition: BasicBlock.h:51
static cl::opt< unsigned > SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, cl::desc("The maximum search distance for instruction pairs"))
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:45
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:48
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:219
This is an important base class in LLVM.
Definition: Constant.h:42
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator begin()
Definition: SmallVector.h:115
bool SplatBreaksChain
Replicating one element to a pair breaks the chain.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:36
This file contains the declarations for the subclasses of Constant, which represent the different fla...
#define H(x, y, z)
Definition: MD5.cpp:53
static cl::opt< bool > DebugInstructionExamination("bb-vectorize-debug-instruction-examination", cl::init(false), cl::Hidden, cl::desc("When debugging is enabled, output information on the"" instruction-examination process"))
APInt Or(const APInt &LHS, const APInt &RHS)
Bitwise OR function for APInt.
Definition: APInt.h:1947
APInt Xor(const APInt &LHS, const APInt &RHS)
Bitwise XOR function for APInt.
Definition: APInt.h:1952
static cl::opt< bool > NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize BitManipulation intrinsics"))
Represent the analysis usage information of a pass.
op_iterator op_end()
Definition: User.h:207
uint32_t Offset
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE,"Assign register bank of generic virtual registers", false, false) RegBankSelect
uint64_t getNumElements() const
Definition: DerivedTypes.h:335
Value * getOperand(unsigned i) const
Definition: User.h:145
Value * getPointerOperand()
Definition: Instructions.h:270
unsigned SearchLimit
The maximum search distance for instruction pairs.
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again...
Definition: DenseMap.h:87
static cl::opt< bool > NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize casting (conversion) operations"))
std::pair< NoneType, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:80
static cl::opt< unsigned > MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"" a full cycle check"))
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:213
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1337
iterator erase(const_iterator CI)
Definition: SmallVector.h:431
int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index=0, Type *SubTp=nullptr) const
static cl::opt< unsigned > MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, cl::desc("The maximum number of pairing iterations"))
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:183
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE maxNum semantics.
Definition: APFloat.h:1118
bool mayWriteToMemory() const
Return true if this instruction may modify memory.
BasicBlockPass class - This class is used to implement most local optimizations.
Definition: Pass.h:335
unsigned MaxPairs
The maximum number of candidate instruction pairs per group.
const unsigned MaxDepth
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
Iterator for intrusive lists based on ilist_node.
This is the shared class of boolean and integer constants.
Definition: Constants.h:88
Legacy wrapper pass to provide the SCEVAAResult object.
bool VectorizeFloats
Vectorize floating-point values.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
iterator end()
Definition: BasicBlock.h:230
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type...
Definition: Type.cpp:123
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:58
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:843
Module.h This file contains the declarations for the Module class.
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:230
Provides information about what library functions are available for the current target.
static cl::opt< bool > SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden, cl::desc("Replicating one element to a pair breaks the chain"))
bool vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C=VectorizeConfig())
Vectorize the BasicBlock.
const DataFlowGraph & G
Definition: RDFGraph.cpp:206
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:50
static cl::opt< bool > NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize loads and stores"))
LLVM_NODISCARD T pop_back_val()
Definition: SmallVector.h:382
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:558
Function * getCalledFunction() const
Return the function called, or null if this is an indirect function invocation.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:276
static cl::opt< bool > NoPointers("bb-vectorize-no-pointers", cl::init(true), cl::Hidden, cl::desc("Don't try to vectorize pointer values"))
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:275
void add(Value *Ptr, uint64_t Size, const AAMDNodes &AAInfo)
These methods are used to add different types of instructions to the alias sets.
bool FastDep
Use a fast instruction dependency analysis.
Intrinsic::ID getIntrinsicID() const LLVM_READONLY
getIntrinsicID - This method returns the ID number of the specified function, or Intrinsic::not_intri...
Definition: Function.h:146
Vectorize configuration.
static GCRegistry::Add< ShadowStackGC > C("shadow-stack","Very portable GC for uncooperative code generators")
void setOperand(unsigned i, Value *Val)
Definition: User.h:150
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
size_type count(const KeyT &Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:122
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:586
Value * getArgOperand(unsigned i) const
getArgOperand/setArgOperand - Return/set the i-th call argument.
Class to represent vector types.
Definition: DerivedTypes.h:369
static cl::opt< bool > NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize getelementptr instructions"))
iterator_range< user_iterator > users()
Definition: Value.h:370
static cl::opt< bool > NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize boolean (i1) values"))
unsigned getVectorNumElements() const
Definition: DerivedTypes.h:438
bool VectorizeCasts
Vectorize casting (conversion) operations.
#define BBV_NAME
Definition: BBVectorize.cpp:17
BasicBlockPass * createBBVectorizePass(const VectorizeConfig &C=VectorizeConfig())
bool VectorizeBools
Vectorize boolean values.
APInt And(const APInt &LHS, const APInt &RHS)
Bitwise AND function for APInt.
Definition: APInt.h:1942
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:72
static cl::opt< unsigned > ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, cl::desc("The required chain depth for vectorization"))
unsigned ReqChainDepth
The required chain depth for vectorization.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:384
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator end()
Definition: SmallVector.h:119
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:151
unsigned MaxInsts
The maximum number of pairable instructions per group.
size_type count(const ValueT &V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:81
This class represents an analyzed expression in the program.
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:169
static cl::opt< HelpPrinterWrapper, true, parser< bool > > HOp("help", cl::desc("Display available options (-help-hidden for more)"), cl::location(WrappedNormalPrinter), cl::ValueDisallowed, cl::cat(GenericCategory), cl::sub(*AllSubCommands))
static cl::opt< bool > NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize integer values"))
unsigned getAlignment() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:227
bool isBinaryOp() const
Definition: Instruction.h:115
void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction...
Definition: Instruction.cpp:88
bool NoMemOpBoost
Don't boost the chain-depth contribution of loads and stores.
#define I(x, y, z)
Definition: MD5.cpp:54
#define N
LLVM_ATTRIBUTE_ALWAYS_INLINE size_type size() const
Definition: SmallVector.h:135
bool VectorizeGEP
Vectorize getelementptr instructions.
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1099
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:287
This instruction extracts a single (scalar) element from a VectorType value.
static cl::opt< bool > PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair", cl::init(false), cl::Hidden, cl::desc("When debugging is enabled, dump the basic block after"" every pair is fused"))
static cl::opt< bool > DebugCycleCheck("bb-vectorize-debug-cycle-check", cl::init(false), cl::Hidden, cl::desc("When debugging is enabled, output information on the"" cycle-checking process"))
static cl::opt< bool > DebugCandidateSelection("bb-vectorize-debug-candidate-selection", cl::init(false), cl::Hidden, cl::desc("When debugging is enabled, output information on the"" candidate-selection process"))
uint64_t getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type...
Definition: DataLayout.h:391
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Definition: Value.h:558
VectorizeConfig()
Initialize the VectorizeConfig from command line options.
static Function * getCalledFunction(const Value *V, bool LookThroughBitCast, bool &IsNoBuiltin)
bool isVarArg() const
Definition: DerivedTypes.h:122
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
user_iterator user_begin()
Definition: Value.h:346
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:33
aarch64 promote const
unsigned getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:108
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:537
LLVM Value Representation.
Definition: Value.h:71
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:111
static VectorType * get(Type *ElementType, unsigned NumElements)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:631
bool VectorizeInts
Vectorize integer values.
std::underlying_type< E >::type Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:81
Broadcast element 0 to all other elements.
static cl::opt< bool > Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden, cl::desc("Don't try to form non-2^n-length vectors"))
#define DEBUG(X)
Definition: Debug.h:100
bool VectorizeBitManipulations
Vectorize bit intrinsics.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: Operator.h:168
OperandValueKind
Additional information about an operand's possible values.
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index=-1) const
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:217
This pass exposes codegen information to IR-level passes.
bool isSameOperationAs(const Instruction *I, unsigned flags=0) const
This function determines if the specified instruction executes the same operation as the current one...
iterator getFirstInsertionPt()
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:209
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object...
bool VectorizeSelect
Vectorize select instructions.
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:162
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:479
static GCRegistry::Add< ErlangGC > A("erlang","erlang-compatible garbage collector")
Value * getPointerOperand()
Definition: Instructions.h:394
void combineMetadata(Instruction *K, const Instruction *J, ArrayRef< unsigned > KnownIDs)
Combine the metadata of two instructions so that K can replace J.
Definition: Local.cpp:1682
int getAddressComputationCost(Type *Ty, ScalarEvolution *SE=nullptr, const SCEV *Ptr=nullptr) const
const BasicBlock * getParent() const
Definition: Instruction.h:62
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:83
LLVM_READONLY APFloat minnum(const APFloat &A, const APFloat &B)
Implements IEEE minNum semantics.
Definition: APFloat.h:1107
static cl::opt< bool > NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize select instructions"))
#define T1
unsigned VectorBits
The size of the native vector registers.
This class represents a constant integer value.