1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10 // stores that can be put together into vector-stores. Next, it attempts to
11 // construct a vectorizable tree using the use-def chains. If a profitable tree
12 // was found, the SLP vectorizer performs vectorization on the tree.
13 //
14 // The pass is inspired by the work described in the paper:
15 // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16 //
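// As a rough, hypothetical illustration (not IR taken from a test case), the
// pass looks for adjacent scalar stores such as
//
//   store double %a, double* %p0
//   store double %b, double* %p1      ; one element past %p0
//
// and, when the use-def trees feeding %a and %b are isomorphic and the cost
// model agrees, emits a single store of a <2 x double> value instead.
//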
17 //===----------------------------------------------------------------------===//
18 #include "llvm/Transforms/Vectorize.h"
19 #include "llvm/ADT/MapVector.h"
20 #include "llvm/ADT/Optional.h"
21 #include "llvm/ADT/PostOrderIterator.h"
22 #include "llvm/ADT/SetVector.h"
23 #include "llvm/ADT/Statistic.h"
24 #include "llvm/Analysis/AliasAnalysis.h"
25 #include "llvm/Analysis/AssumptionCache.h"
26 #include "llvm/Analysis/CodeMetrics.h"
27 #include "llvm/Analysis/LoopInfo.h"
28 #include "llvm/Analysis/ScalarEvolution.h"
29 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
30 #include "llvm/Analysis/TargetTransformInfo.h"
31 #include "llvm/Analysis/ValueTracking.h"
32 #include "llvm/IR/DataLayout.h"
33 #include "llvm/IR/Dominators.h"
34 #include "llvm/IR/IRBuilder.h"
35 #include "llvm/IR/Instructions.h"
36 #include "llvm/IR/IntrinsicInst.h"
37 #include "llvm/IR/Module.h"
38 #include "llvm/IR/NoFolder.h"
39 #include "llvm/IR/Type.h"
40 #include "llvm/IR/Value.h"
41 #include "llvm/IR/Verifier.h"
42 #include "llvm/Pass.h"
43 #include "llvm/Support/CommandLine.h"
44 #include "llvm/Support/Debug.h"
45 #include "llvm/Support/raw_ostream.h"
46 #include "llvm/Transforms/Utils/VectorUtils.h"
47 #include <algorithm>
48 #include <map>
49 #include <memory>
50 
51 using namespace llvm;
52 
53 #define SV_NAME "slp-vectorizer"
54 #define DEBUG_TYPE "SLP"
55 
56 STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
57 
58 static cl::opt<int>
59  SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
60  cl::desc("Only vectorize if you gain more than this "
61  "number "));
62 
63 static cl::opt<bool>
64 ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
65  cl::desc("Attempt to vectorize horizontal reductions"));
66 
68  "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
69  cl::desc(
70  "Attempt to vectorize horizontal reductions feeding into a store"));
71 
72 static cl::opt<int>
73 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
74  cl::desc("Attempt to vectorize for this register size in bits"));
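// For illustration only, these flags are normally exercised through 'opt',
// e.g. a hypothetical invocation such as:
//
//   opt -S -slp-vectorizer -slp-threshold=-5 -slp-vectorize-hor input.ll
//
// A negative slp-threshold makes the pass accept trees the cost model rates
// as slightly unprofitable, which is mainly useful for testing.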
75 
76 namespace {
77 
78 // FIXME: Set this via cl::opt to allow overriding.
79 static const unsigned MinVecRegSize = 128;
80 
81 static const unsigned RecursionMaxDepth = 12;
82 
83 // Limit the number of alias checks. The limit is chosen so that
84 // it has no negative effect on the llvm benchmarks.
85 static const unsigned AliasedCheckLimit = 10;
86 
87 // Another limit for the alias checks: The maximum distance between load/store
88 // instructions where alias checks are done.
89 // This limit is useful for very large basic blocks.
90 static const unsigned MaxMemDepDistance = 160;
91 
92 /// \brief Predicate for the element types that the SLP vectorizer supports.
93 ///
94 /// The most important thing to filter here are types which are invalid in LLVM
95 /// vectors. We also filter target specific types which have absolutely no
96 /// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
97 /// avoids spending time checking the cost model and realizing that they will
98 /// be inevitably scalarized.
99 static bool isValidElementType(Type *Ty) {
100  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
101  !Ty->isPPC_FP128Ty();
102 }
103 
104 /// \returns the parent basic block if all of the instructions in \p VL
105 /// are in the same block or null otherwise.
106 static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
107  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
108  if (!I0)
109  return nullptr;
110  BasicBlock *BB = I0->getParent();
111  for (int i = 1, e = VL.size(); i < e; i++) {
112  Instruction *I = dyn_cast<Instruction>(VL[i]);
113  if (!I)
114  return nullptr;
115 
116  if (BB != I->getParent())
117  return nullptr;
118  }
119  return BB;
120 }
121 
122 /// \returns True if all of the values in \p VL are constants.
123 static bool allConstant(ArrayRef<Value *> VL) {
124  for (unsigned i = 0, e = VL.size(); i < e; ++i)
125  if (!isa<Constant>(VL[i]))
126  return false;
127  return true;
128 }
129 
130 /// \returns True if all of the values in \p VL are identical.
131 static bool isSplat(ArrayRef<Value *> VL) {
132  for (unsigned i = 1, e = VL.size(); i < e; ++i)
133  if (VL[i] != VL[0])
134  return false;
135  return true;
136 }
137 
138 ///\returns Opcode that can be clubbed with \p Op to create an alternate
139 /// sequence which can later be merged as a ShuffleVector instruction.
140 static unsigned getAltOpcode(unsigned Op) {
141  switch (Op) {
142  case Instruction::FAdd:
143  return Instruction::FSub;
144  case Instruction::FSub:
145  return Instruction::FAdd;
146  case Instruction::Add:
147  return Instruction::Sub;
148  case Instruction::Sub:
149  return Instruction::Add;
150  default:
151  return 0;
152  }
153 }
154 
155 ///\returns bool representing if Opcode \p Op can be part
156 /// of an alternate sequence which can later be merged as
157 /// a ShuffleVector instruction.
158 static bool canCombineAsAltInst(unsigned Op) {
159  if (Op == Instruction::FAdd || Op == Instruction::FSub ||
160  Op == Instruction::Sub || Op == Instruction::Add)
161  return true;
162  return false;
163 }
164 
165 /// \returns the ShuffleVector opcode if the instructions in \p VL form an
166 /// alternating fadd/fsub, fsub/fadd, add/sub or sub/add sequence
167 /// (e.g. opcodes fadd, fsub, fadd, fsub, ...), and 0 otherwise.
168 static unsigned isAltInst(ArrayRef<Value *> VL) {
169  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
170  unsigned Opcode = I0->getOpcode();
171  unsigned AltOpcode = getAltOpcode(Opcode);
172  for (int i = 1, e = VL.size(); i < e; i++) {
173  Instruction *I = dyn_cast<Instruction>(VL[i]);
174  if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
175  return 0;
176  }
177  return Instruction::ShuffleVector;
178 }
179 
180 /// \returns The opcode if all of the Instructions in \p VL have the same
181 /// opcode, or zero.
182 static unsigned getSameOpcode(ArrayRef<Value *> VL) {
183  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
184  if (!I0)
185  return 0;
186  unsigned Opcode = I0->getOpcode();
187  for (int i = 1, e = VL.size(); i < e; i++) {
188  Instruction *I = dyn_cast<Instruction>(VL[i]);
189  if (!I || Opcode != I->getOpcode()) {
190  if (canCombineAsAltInst(Opcode) && i == 1)
191  return isAltInst(VL);
192  return 0;
193  }
194  }
195  return Opcode;
196 }
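// For example (illustrative values): VL = {add, add, add, add} yields
// Instruction::Add; VL = {add, sub, add, sub} alternates between an opcode
// and its alternate, so isAltInst() reports Instruction::ShuffleVector and
// the bundle is later emitted as two vector ops combined by a shufflevector;
// VL = {add, mul, add, mul} neither matches nor alternates, so the result is
// 0 and the bundle will be gathered.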
197 
198 /// Get the intersection (logical and) of all of the potential IR flags
199 /// of each scalar operation (VL) that will be converted into a vector (I).
200 /// Flag set: NSW, NUW, exact, and all of fast-math.
201 static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
202  if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
203  if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
204  // Intersection is initialized to the 0th scalar,
205  // so start counting from index '1'.
206  for (int i = 1, e = VL.size(); i < e; ++i) {
207  if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
208  Intersection->andIRFlags(Scalar);
209  }
210  VecOp->copyIRFlags(Intersection);
211  }
212  }
213 }
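// Sketch of the intended behaviour (hypothetical scalars, not from this
// file): for VL = {add nsw nuw, add nsw} the intersection keeps only 'nsw',
// so the resulting vector add is tagged 'add nsw' and 'nuw' is dropped.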
214 
215 /// \returns \p I after propagating metadata from \p VL.
216 static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
217  Instruction *I0 = cast<Instruction>(VL[0]);
218  SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
219  I0->getAllMetadataOtherThanDebugLoc(Metadata);
220 
221  for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
222  unsigned Kind = Metadata[i].first;
223  MDNode *MD = Metadata[i].second;
224 
225  for (int i = 1, e = VL.size(); MD && i != e; i++) {
226  Instruction *I = cast<Instruction>(VL[i]);
227  MDNode *IMD = I->getMetadata(Kind);
228 
229  switch (Kind) {
230  default:
231  MD = nullptr; // Remove unknown metadata
232  break;
233  case LLVMContext::MD_tbaa:
234  MD = MDNode::getMostGenericTBAA(MD, IMD);
235  break;
236  case LLVMContext::MD_alias_scope:
237  MD = MDNode::getMostGenericAliasScope(MD, IMD);
238  break;
239  case LLVMContext::MD_noalias:
240  MD = MDNode::intersect(MD, IMD);
241  break;
242  case LLVMContext::MD_fpmath:
243  MD = MDNode::getMostGenericFPMath(MD, IMD);
244  break;
245  }
246  }
247  I->setMetadata(Kind, MD);
248  }
249  return I;
250 }
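// Illustrative example (not from this file): if every scalar load in VL
// carries a !tbaa tag, the vector load keeps the most generic common !tbaa
// tag; if any scalar lacks it (IMD is null), getMostGenericTBAA returns null
// and the tag is dropped. Metadata kinds not listed in the switch above are
// always dropped.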
251 
252 /// \returns The type that all of the values in \p VL have or null if there
253 /// are different types.
254 static Type* getSameType(ArrayRef<Value *> VL) {
255  Type *Ty = VL[0]->getType();
256  for (int i = 1, e = VL.size(); i < e; i++)
257  if (VL[i]->getType() != Ty)
258  return nullptr;
259 
260  return Ty;
261 }
262 
263 /// \returns True if the ExtractElement instructions in VL can be vectorized
264 /// to use the original vector.
265 static bool CanReuseExtract(ArrayRef<Value *> VL) {
266  assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
267  // Check if all of the extracts come from the same vector and from the
268  // correct offset.
269  Value *VL0 = VL[0];
270  ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
271  Value *Vec = E0->getOperand(0);
272 
273  // We have to extract from the same vector type.
274  unsigned NElts = Vec->getType()->getVectorNumElements();
275 
276  if (NElts != VL.size())
277  return false;
278 
279  // Check that all of the indices extract from the correct offset.
280  ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
281  if (!CI || CI->getZExtValue())
282  return false;
283 
284  for (unsigned i = 1, e = VL.size(); i < e; ++i) {
285  ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
286  ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
287 
288  if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
289  return false;
290  }
291 
292  return true;
293 }
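// For illustration (hypothetical IR): a bundle such as
//   %e0 = extractelement <4 x float> %v, i32 0
//   %e1 = extractelement <4 x float> %v, i32 1
//   %e2 = extractelement <4 x float> %v, i32 2
//   %e3 = extractelement <4 x float> %v, i32 3
// extracts every lane of %v in order, so the vectorized tree can simply
// reuse %v; mismatched indices, a different source vector, or a width that
// does not match VL.size() make this return false.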
294 
295 /// \returns True if in-tree use also needs extract. This refers to
296 /// possible scalar operand in vectorized instruction.
297 static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
298  TargetLibraryInfo *TLI) {
299 
300  unsigned Opcode = UserInst->getOpcode();
301  switch (Opcode) {
302  case Instruction::Load: {
303  LoadInst *LI = cast<LoadInst>(UserInst);
304  return (LI->getPointerOperand() == Scalar);
305  }
306  case Instruction::Store: {
307  StoreInst *SI = cast<StoreInst>(UserInst);
308  return (SI->getPointerOperand() == Scalar);
309  }
310  case Instruction::Call: {
311  CallInst *CI = cast<CallInst>(UserInst);
312  Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
313  if (hasVectorInstrinsicScalarOpd(ID, 1)) {
314  return (CI->getArgOperand(1) == Scalar);
315  }
316  }
317  default:
318  return false;
319  }
320 }
321 
322 /// \returns the AA location that is being accessed by the instruction.
323 static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
324  if (StoreInst *SI = dyn_cast<StoreInst>(I))
325  return MemoryLocation::get(SI);
326  if (LoadInst *LI = dyn_cast<LoadInst>(I))
327  return MemoryLocation::get(LI);
328  return MemoryLocation();
329 }
330 
331 /// \returns True if the instruction is not a volatile or atomic load/store.
332 static bool isSimple(Instruction *I) {
333  if (LoadInst *LI = dyn_cast<LoadInst>(I))
334  return LI->isSimple();
335  if (StoreInst *SI = dyn_cast<StoreInst>(I))
336  return SI->isSimple();
337  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
338  return !MI->isVolatile();
339  return true;
340 }
341 
342 /// Bottom Up SLP Vectorizer.
343 class BoUpSLP {
344 public:
345  typedef SmallVector<Value *, 8> ValueList;
346  typedef SmallVector<Instruction *, 16> InstrList;
347  typedef SmallPtrSet<Value *, 16> ValueSet;
348  typedef SmallVector<StoreInst *, 8> StoreList;
349 
350  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
351  TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
352  DominatorTree *Dt, AssumptionCache *AC)
353  : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
354  SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
355  Builder(Se->getContext()) {
356  CodeMetrics::collectEphemeralValues(F, AC, EphValues);
357  }
358 
359  /// \brief Vectorize the tree that starts with the elements in \p VL.
360  /// Returns the vectorized root.
361  Value *vectorizeTree();
362 
363  /// \returns the cost incurred by unwanted spills and fills, caused by
364  /// holding live values over call sites.
365  int getSpillCost();
366 
367  /// \returns the vectorization cost of the subtree that starts at \p VL.
368  /// A negative number means that this is profitable.
369  int getTreeCost();
370 
371  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
372  /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
373  void buildTree(ArrayRef<Value *> Roots,
374  ArrayRef<Value *> UserIgnoreLst = None);
375 
376  /// Clear the internal data structures that are created by 'buildTree'.
377  void deleteTree() {
378  VectorizableTree.clear();
379  ScalarToTreeEntry.clear();
380  MustGather.clear();
381  ExternalUses.clear();
382  NumLoadsWantToKeepOrder = 0;
383  NumLoadsWantToChangeOrder = 0;
384  for (auto &Iter : BlocksSchedules) {
385  BlockScheduling *BS = Iter.second.get();
386  BS->clear();
387  }
388  }
389 
390  /// \returns true if the memory operations A and B are consecutive.
391  bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL);
392 
393  /// \brief Perform LICM and CSE on the newly generated gather sequences.
394  void optimizeGatherSequence();
395 
396  /// \returns true if it is beneficial to reverse the vector order.
397  bool shouldReorder() const {
398  return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
399  }
400 
401 private:
402  struct TreeEntry;
403 
404  /// \returns the cost of the vectorizable entry.
405  int getEntryCost(TreeEntry *E);
406 
407  /// This is the recursive part of buildTree.
408  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
409 
410  /// Vectorize a single entry in the tree.
411  Value *vectorizeTree(TreeEntry *E);
412 
413  /// Vectorize a single entry in the tree, starting in \p VL.
414  Value *vectorizeTree(ArrayRef<Value *> VL);
415 
416  /// \returns the pointer to the vectorized value if \p VL is already
417  /// vectorized, or NULL. This may happen in cycles.
418  Value *alreadyVectorized(ArrayRef<Value *> VL) const;
419 
420  /// \brief Take the pointer operand from the Load/Store instruction.
421  /// \returns NULL if this is not a valid Load/Store instruction.
422  static Value *getPointerOperand(Value *I);
423 
424  /// \brief Take the address space operand from the Load/Store instruction.
425  /// \returns -1 if this is not a valid Load/Store instruction.
426  static unsigned getAddressSpaceOperand(Value *I);
427 
428  /// \returns the scalarization cost for this type. Scalarization in this
429  /// context means the creation of vectors from a group of scalars.
430  int getGatherCost(Type *Ty);
431 
432  /// \returns the scalarization cost for this list of values. Assuming that
433  /// this subtree gets vectorized, we may need to extract the values from the
434  /// roots. This method calculates the cost of extracting the values.
435  int getGatherCost(ArrayRef<Value *> VL);
436 
437  /// \brief Set the Builder insert point to one after the last instruction in
438  /// the bundle
439  void setInsertPointAfterBundle(ArrayRef<Value *> VL);
440 
441  /// \returns a vector from a collection of scalars in \p VL.
442  Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
443 
444  /// \returns whether the VectorizableTree is fully vectorizable and will
445  /// be beneficial even if the tree height is tiny.
446  bool isFullyVectorizableTinyTree();
447 
448  /// \brief Reorder commutative operands in an alt shuffle if doing so
449  /// results in vectorized code.
450  void reorderAltShuffleOperands(ArrayRef<Value *> VL,
451  SmallVectorImpl<Value *> &Left,
452  SmallVectorImpl<Value *> &Right);
453  /// \brief Reorder commutative operands to get a better probability of
454  /// generating vectorized code.
455  void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
456  SmallVectorImpl<Value *> &Left,
457  SmallVectorImpl<Value *> &Right);
458  struct TreeEntry {
459  TreeEntry() : Scalars(), VectorizedValue(nullptr),
460  NeedToGather(0) {}
461 
462  /// \returns true if the scalars in VL are equal to this entry.
463  bool isSame(ArrayRef<Value *> VL) const {
464  assert(VL.size() == Scalars.size() && "Invalid size");
465  return std::equal(VL.begin(), VL.end(), Scalars.begin());
466  }
467 
468  /// A vector of scalars.
469  ValueList Scalars;
470 
471  /// The Scalars are vectorized into this value. It is initialized to Null.
472  Value *VectorizedValue;
473 
474  /// Do we need to gather this sequence ?
475  bool NeedToGather;
476  };
477 
478  /// Create a new VectorizableTree entry.
479  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
480  VectorizableTree.emplace_back();
481  int idx = VectorizableTree.size() - 1;
482  TreeEntry *Last = &VectorizableTree[idx];
483  Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
484  Last->NeedToGather = !Vectorized;
485  if (Vectorized) {
486  for (int i = 0, e = VL.size(); i != e; ++i) {
487  assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
488  ScalarToTreeEntry[VL[i]] = idx;
489  }
490  } else {
491  MustGather.insert(VL.begin(), VL.end());
492  }
493  return Last;
494  }
495 
496  /// -- Vectorization State --
497  /// Holds all of the tree entries.
498  std::vector<TreeEntry> VectorizableTree;
499 
500  /// Maps a specific scalar to its tree entry.
501  SmallDenseMap<Value*, int> ScalarToTreeEntry;
502 
503  /// A list of scalars that we found that we need to keep as scalars.
504  ValueSet MustGather;
505 
506  /// This POD struct describes one external user in the vectorized tree.
507  struct ExternalUser {
508  ExternalUser (Value *S, llvm::User *U, int L) :
509  Scalar(S), User(U), Lane(L){};
510  // Which scalar in our function.
511  Value *Scalar;
512  // Which user that uses the scalar.
513  llvm::User *User;
514  // Which lane does the scalar belong to.
515  int Lane;
516  };
517  typedef SmallVector<ExternalUser, 16> UserList;
518 
519  /// Checks if two instructions may access the same memory.
520  ///
521  /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
522  /// is invariant in the calling loop.
523  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
524  Instruction *Inst2) {
525 
526  // First check if the result is already in the cache.
527  AliasCacheKey key = std::make_pair(Inst1, Inst2);
528  Optional<bool> &result = AliasCache[key];
529  if (result.hasValue()) {
530  return result.getValue();
531  }
532  MemoryLocation Loc2 = getLocation(Inst2, AA);
533  bool aliased = true;
534  if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
535  // Do the alias check.
536  aliased = AA->alias(Loc1, Loc2);
537  }
538  // Store the result in the cache.
539  result = aliased;
540  return aliased;
541  }
542 
543  typedef std::pair<Instruction *, Instruction *> AliasCacheKey;
544 
545  /// Cache for alias results.
546  /// TODO: consider moving this to the AliasAnalysis itself.
547  DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
548 
549  /// Removes an instruction from its block and eventually deletes it.
550  /// It's like Instruction::eraseFromParent() except that the actual deletion
551  /// is delayed until BoUpSLP is destructed.
552  /// This is required to ensure that there are no incorrect collisions in the
553  /// AliasCache, which can happen if a new instruction is allocated at the
554  /// same address as a previously deleted instruction.
555  void eraseInstruction(Instruction *I) {
556  I->removeFromParent();
557  I->dropAllReferences();
558  DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
559  }
560 
561  /// Temporary store for deleted instructions. Instructions will be deleted
562  /// eventually when the BoUpSLP is destructed.
563  SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
564 
565  /// A list of values that need to extracted out of the tree.
566  /// This list holds pairs of (Internal Scalar : External User).
567  UserList ExternalUses;
568 
569  /// Values used only by @llvm.assume calls.
570  SmallPtrSet<const Value *, 32> EphValues;
571 
572  /// Holds all of the instructions that we gathered.
573  SetVector<Instruction *> GatherSeq;
574  /// A list of blocks that we are going to CSE.
575  SetVector<BasicBlock *> CSEBlocks;
576 
577  /// Contains all scheduling relevant data for an instruction.
578  /// A ScheduleData either represents a single instruction or a member of an
579  /// instruction bundle (= a group of instructions which is combined into a
580  /// vector instruction).
581  struct ScheduleData {
582 
583  // The initial value for the dependency counters. It means that the
584  // dependencies are not calculated yet.
585  enum { InvalidDeps = -1 };
586 
587  ScheduleData()
588  : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
589  NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
590  Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
591  UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
592 
593  void init(int BlockSchedulingRegionID) {
594  FirstInBundle = this;
595  NextInBundle = nullptr;
596  NextLoadStore = nullptr;
597  IsScheduled = false;
598  SchedulingRegionID = BlockSchedulingRegionID;
599  UnscheduledDepsInBundle = UnscheduledDeps;
600  clearDependencies();
601  }
602 
603  /// Returns true if the dependency information has been calculated.
604  bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
605 
606  /// Returns true for single instructions and for bundle representatives
607  /// (= the head of a bundle).
608  bool isSchedulingEntity() const { return FirstInBundle == this; }
609 
610  /// Returns true if it represents an instruction bundle and not only a
611  /// single instruction.
612  bool isPartOfBundle() const {
613  return NextInBundle != nullptr || FirstInBundle != this;
614  }
615 
616  /// Returns true if it is ready for scheduling, i.e. it has no more
617  /// unscheduled depending instructions/bundles.
618  bool isReady() const {
619  assert(isSchedulingEntity() &&
620  "can't consider non-scheduling entity for ready list");
621  return UnscheduledDepsInBundle == 0 && !IsScheduled;
622  }
623 
624  /// Modifies the number of unscheduled dependencies, also updating it for
625  /// the whole bundle.
626  int incrementUnscheduledDeps(int Incr) {
627  UnscheduledDeps += Incr;
628  return FirstInBundle->UnscheduledDepsInBundle += Incr;
629  }
630 
631  /// Sets the number of unscheduled dependencies to the number of
632  /// dependencies.
633  void resetUnscheduledDeps() {
634  incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
635  }
636 
637  /// Clears all dependency information.
638  void clearDependencies() {
639  Dependencies = InvalidDeps;
640  resetUnscheduledDeps();
641  MemoryDependencies.clear();
642  }
643 
644  void dump(raw_ostream &os) const {
645  if (!isSchedulingEntity()) {
646  os << "/ " << *Inst;
647  } else if (NextInBundle) {
648  os << '[' << *Inst;
649  ScheduleData *SD = NextInBundle;
650  while (SD) {
651  os << ';' << *SD->Inst;
652  SD = SD->NextInBundle;
653  }
654  os << ']';
655  } else {
656  os << *Inst;
657  }
658  }
659 
660  Instruction *Inst;
661 
662  /// Points to the head in an instruction bundle (and always to this for
663  /// single instructions).
664  ScheduleData *FirstInBundle;
665 
666  /// Single linked list of all instructions in a bundle. Null if it is a
667  /// single instruction.
668  ScheduleData *NextInBundle;
669 
670  /// Single linked list of all memory instructions (e.g. load, store, call)
671  /// in the block - until the end of the scheduling region.
672  ScheduleData *NextLoadStore;
673 
674  /// The dependent memory instructions.
675  /// This list is derived on demand in calculateDependencies().
676  SmallVector<ScheduleData *, 4> MemoryDependencies;
677 
678  /// This ScheduleData is in the current scheduling region if this matches
679  /// the current SchedulingRegionID of BlockScheduling.
680  int SchedulingRegionID;
681 
682  /// Used for getting a "good" final ordering of instructions.
683  int SchedulingPriority;
684 
685  /// The number of dependencies. This consists of the number of users of the
686  /// instruction plus the number of dependent memory instructions (if any).
687  /// This value is calculated on demand.
688  /// If InvalidDeps, the number of dependencies is not calculated yet.
689  ///
690  int Dependencies;
691 
692  /// The number of dependencies minus the number of dependencies of scheduled
693  /// instructions. As soon as this is zero, the instruction/bundle gets ready
694  /// for scheduling.
695  /// Note that this is negative as long as Dependencies is not calculated.
696  int UnscheduledDeps;
697 
698  /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
699  /// single instructions.
700  int UnscheduledDepsInBundle;
701 
702  /// True if this instruction is scheduled (or considered as scheduled in the
703  /// dry-run).
704  bool IsScheduled;
705  };
706 
707 #ifndef NDEBUG
708  friend raw_ostream &operator<<(raw_ostream &os,
709  const BoUpSLP::ScheduleData &SD);
710 #endif
711 
712  /// Contains all scheduling data for a basic block.
713  ///
714  struct BlockScheduling {
715 
716  BlockScheduling(BasicBlock *BB)
717  : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
718  ScheduleStart(nullptr), ScheduleEnd(nullptr),
719  FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
720  // Make sure that the initial SchedulingRegionID is greater than the
721  // initial SchedulingRegionID in ScheduleData (which is 0).
722  SchedulingRegionID(1) {}
723 
724  void clear() {
725  ReadyInsts.clear();
726  ScheduleStart = nullptr;
727  ScheduleEnd = nullptr;
728  FirstLoadStoreInRegion = nullptr;
729  LastLoadStoreInRegion = nullptr;
730 
731  // Make a new scheduling region, i.e. all existing ScheduleData is not
732  // in the new region yet.
733  ++SchedulingRegionID;
734  }
735 
736  ScheduleData *getScheduleData(Value *V) {
737  ScheduleData *SD = ScheduleDataMap[V];
738  if (SD && SD->SchedulingRegionID == SchedulingRegionID)
739  return SD;
740  return nullptr;
741  }
742 
743  bool isInSchedulingRegion(ScheduleData *SD) {
744  return SD->SchedulingRegionID == SchedulingRegionID;
745  }
746 
747  /// Marks an instruction as scheduled and puts all dependent ready
748  /// instructions into the ready-list.
749  template <typename ReadyListType>
750  void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
751  SD->IsScheduled = true;
752  DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
753 
754  ScheduleData *BundleMember = SD;
755  while (BundleMember) {
756  // Handle the def-use chain dependencies.
757  for (Use &U : BundleMember->Inst->operands()) {
758  ScheduleData *OpDef = getScheduleData(U.get());
759  if (OpDef && OpDef->hasValidDependencies() &&
760  OpDef->incrementUnscheduledDeps(-1) == 0) {
761  // There are no more unscheduled dependencies after decrementing,
762  // so we can put the dependent instruction into the ready list.
763  ScheduleData *DepBundle = OpDef->FirstInBundle;
764  assert(!DepBundle->IsScheduled &&
765  "already scheduled bundle gets ready");
766  ReadyList.insert(DepBundle);
767  DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
768  }
769  }
770  // Handle the memory dependencies.
771  for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
772  if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
773  // There are no more unscheduled dependencies after decrementing,
774  // so we can put the dependent instruction into the ready list.
775  ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
776  assert(!DepBundle->IsScheduled &&
777  "already scheduled bundle gets ready");
778  ReadyList.insert(DepBundle);
779  DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
780  }
781  }
782  BundleMember = BundleMember->NextInBundle;
783  }
784  }
785 
786  /// Put all instructions into the ReadyList which are ready for scheduling.
787  template <typename ReadyListType>
788  void initialFillReadyList(ReadyListType &ReadyList) {
789  for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
790  ScheduleData *SD = getScheduleData(I);
791  if (SD->isSchedulingEntity() && SD->isReady()) {
792  ReadyList.insert(SD);
793  DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
794  }
795  }
796  }
797 
798  /// Checks if a bundle of instructions can be scheduled, i.e. has no
799  /// cyclic dependencies. This is only a dry-run, no instructions are
800  /// actually moved at this stage.
801  bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);
802 
803  /// Un-bundles a group of instructions.
804  void cancelScheduling(ArrayRef<Value *> VL);
805 
806  /// Extends the scheduling region so that V is inside the region.
807  void extendSchedulingRegion(Value *V);
808 
809  /// Initialize the ScheduleData structures for new instructions in the
810  /// scheduling region.
811  void initScheduleData(Instruction *FromI, Instruction *ToI,
812  ScheduleData *PrevLoadStore,
813  ScheduleData *NextLoadStore);
814 
815  /// Updates the dependency information of a bundle and of all instructions/
816  /// bundles which depend on the original bundle.
817  void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
818  BoUpSLP *SLP);
819 
820  /// Sets all instructions in the scheduling region to un-scheduled.
821  void resetSchedule();
822 
823  BasicBlock *BB;
824 
825  /// Simple memory allocation for ScheduleData.
826  std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
827 
828  /// The size of a ScheduleData array in ScheduleDataChunks.
829  int ChunkSize;
830 
831  /// The allocator position in the current chunk, which is the last entry
832  /// of ScheduleDataChunks.
833  int ChunkPos;
834 
835  /// Attaches ScheduleData to Instruction.
836  /// Note that the mapping survives during all vectorization iterations, i.e.
837  /// ScheduleData structures are recycled.
838  DenseMap<Value *, ScheduleData *> ScheduleDataMap;
839 
840  struct ReadyList : SmallVector<ScheduleData *, 8> {
841  void insert(ScheduleData *SD) { push_back(SD); }
842  };
843 
844  /// The ready-list for scheduling (only used for the dry-run).
845  ReadyList ReadyInsts;
846 
847  /// The first instruction of the scheduling region.
848  Instruction *ScheduleStart;
849 
850  /// The first instruction _after_ the scheduling region.
851  Instruction *ScheduleEnd;
852 
853  /// The first memory accessing instruction in the scheduling region
854  /// (can be null).
855  ScheduleData *FirstLoadStoreInRegion;
856 
857  /// The last memory accessing instruction in the scheduling region
858  /// (can be null).
859  ScheduleData *LastLoadStoreInRegion;
860 
861  /// The ID of the scheduling region. For a new vectorization iteration this
862  /// is incremented which "removes" all ScheduleData from the region.
863  int SchedulingRegionID;
864  };
865 
866  /// Attaches the BlockScheduling structures to basic blocks.
867  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
868 
869  /// Performs the "real" scheduling. Done before vectorization is actually
870  /// performed in a basic block.
871  void scheduleBlock(BlockScheduling *BS);
872 
873  /// List of users to ignore during scheduling and that don't need extracting.
874  ArrayRef<Value *> UserIgnoreList;
875 
876  // Number of load-bundles, which contain consecutive loads.
877  int NumLoadsWantToKeepOrder;
878 
879  // Number of load-bundles of size 2, which are consecutive loads if reversed.
880  int NumLoadsWantToChangeOrder;
881 
882  // Analysis and block reference.
883  Function *F;
884  ScalarEvolution *SE;
885  TargetTransformInfo *TTI;
886  TargetLibraryInfo *TLI;
887  AliasAnalysis *AA;
888  LoopInfo *LI;
889  DominatorTree *DT;
890  /// Instruction builder to construct the vectorized tree.
891  IRBuilder<> Builder;
892 };
893 
894 #ifndef NDEBUG
895 raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
896  SD.dump(os);
897  return os;
898 }
899 #endif
900 
901 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
902  ArrayRef<Value *> UserIgnoreLst) {
903  deleteTree();
904  UserIgnoreList = UserIgnoreLst;
905  if (!getSameType(Roots))
906  return;
907  buildTree_rec(Roots, 0);
908 
909  // Collect the values that we need to extract from the tree.
910  for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
911  TreeEntry *Entry = &VectorizableTree[EIdx];
912 
913  // For each lane:
914  for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
915  Value *Scalar = Entry->Scalars[Lane];
916 
917  // No need to handle users of gathered values.
918  if (Entry->NeedToGather)
919  continue;
920 
921  for (User *U : Scalar->users()) {
922  DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
923 
924  Instruction *UserInst = dyn_cast<Instruction>(U);
925  if (!UserInst)
926  continue;
927 
928  // Skip in-tree scalars that become vectors
929  if (ScalarToTreeEntry.count(U)) {
930  int Idx = ScalarToTreeEntry[U];
931  TreeEntry *UseEntry = &VectorizableTree[Idx];
932  Value *UseScalar = UseEntry->Scalars[0];
933  // Some in-tree scalars will remain as scalar in vectorized
934  // instructions. If that is the case, the one in Lane 0 will
935  // be used.
936  if (UseScalar != U ||
937  !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
938  DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
939  << ".\n");
940  assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
941  continue;
942  }
943  }
944 
945  // Ignore users in the user ignore list.
946  if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
947  UserIgnoreList.end())
948  continue;
949 
950  DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
951  Lane << " from " << *Scalar << ".\n");
952  ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
953  }
954  }
955  }
956 }
957 
958 
959 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
960  bool SameTy = getSameType(VL); (void)SameTy;
961  bool isAltShuffle = false;
962  assert(SameTy && "Invalid types!");
963 
964  if (Depth == RecursionMaxDepth) {
965  DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
966  newTreeEntry(VL, false);
967  return;
968  }
969 
970  // Don't handle vectors.
971  if (VL[0]->getType()->isVectorTy()) {
972  DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
973  newTreeEntry(VL, false);
974  return;
975  }
976 
977  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
978  if (SI->getValueOperand()->getType()->isVectorTy()) {
979  DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
980  newTreeEntry(VL, false);
981  return;
982  }
983  unsigned Opcode = getSameOpcode(VL);
984 
985  // Check that this shuffle vector refers to the alternate
986  // sequence of opcodes.
987  if (Opcode == Instruction::ShuffleVector) {
988  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
989  unsigned Op = I0->getOpcode();
990  if (Op != Instruction::ShuffleVector)
991  isAltShuffle = true;
992  }
993 
994  // If all of the operands are identical or constant we have a simple solution.
995  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
996  DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
997  newTreeEntry(VL, false);
998  return;
999  }
1000 
1001  // We now know that this is a vector of instructions of the same type from
1002  // the same block.
1003 
1004  // Don't vectorize ephemeral values.
1005  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1006  if (EphValues.count(VL[i])) {
1007  DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1008  ") is ephemeral.\n");
1009  newTreeEntry(VL, false);
1010  return;
1011  }
1012  }
1013 
1014  // Check if this is a duplicate of another entry.
1015  if (ScalarToTreeEntry.count(VL[0])) {
1016  int Idx = ScalarToTreeEntry[VL[0]];
1017  TreeEntry *E = &VectorizableTree[Idx];
1018  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1019  DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
1020  if (E->Scalars[i] != VL[i]) {
1021  DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
1022  newTreeEntry(VL, false);
1023  return;
1024  }
1025  }
1026  DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
1027  return;
1028  }
1029 
1030  // Check that none of the instructions in the bundle are already in the tree.
1031  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1032  if (ScalarToTreeEntry.count(VL[i])) {
1033  DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1034  ") is already in tree.\n");
1035  newTreeEntry(VL, false);
1036  return;
1037  }
1038  }
1039 
1040  // If any of the scalars is marked as a value that needs to stay scalar then
1041  // we need to gather the scalars.
1042  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1043  if (MustGather.count(VL[i])) {
1044  DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
1045  newTreeEntry(VL, false);
1046  return;
1047  }
1048  }
1049 
1050  // Check that all of the users of the scalars that we want to vectorize are
1051  // schedulable.
1052  Instruction *VL0 = cast<Instruction>(VL[0]);
1053  BasicBlock *BB = cast<Instruction>(VL0)->getParent();
1054 
1055  if (!DT->isReachableFromEntry(BB)) {
1056  // Don't go into unreachable blocks. They may contain instructions with
1057  // dependency cycles which confuse the final scheduling.
1058  DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
1059  newTreeEntry(VL, false);
1060  return;
1061  }
1062 
1063  // Check that every instruction appears once in this bundle.
1064  for (unsigned i = 0, e = VL.size(); i < e; ++i)
1065  for (unsigned j = i+1; j < e; ++j)
1066  if (VL[i] == VL[j]) {
1067  DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
1068  newTreeEntry(VL, false);
1069  return;
1070  }
1071 
1072  auto &BSRef = BlocksSchedules[BB];
1073  if (!BSRef) {
1074  BSRef = llvm::make_unique<BlockScheduling>(BB);
1075  }
1076  BlockScheduling &BS = *BSRef.get();
1077 
1078  if (!BS.tryScheduleBundle(VL, this)) {
1079  DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
1080  BS.cancelScheduling(VL);
1081  newTreeEntry(VL, false);
1082  return;
1083  }
1084  DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
1085 
1086  switch (Opcode) {
1087  case Instruction::PHI: {
1088  PHINode *PH = dyn_cast<PHINode>(VL0);
1089 
1090  // Check for terminator values (e.g. invoke).
1091  for (unsigned j = 0; j < VL.size(); ++j)
1092  for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1093  TerminatorInst *Term = dyn_cast<TerminatorInst>(
1094  cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
1095  if (Term) {
1096  DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
1097  BS.cancelScheduling(VL);
1098  newTreeEntry(VL, false);
1099  return;
1100  }
1101  }
1102 
1103  newTreeEntry(VL, true);
1104  DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
1105 
1106  for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1107  ValueList Operands;
1108  // Prepare the operand vector.
1109  for (unsigned j = 0; j < VL.size(); ++j)
1110  Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
1111  PH->getIncomingBlock(i)));
1112 
1113  buildTree_rec(Operands, Depth + 1);
1114  }
1115  return;
1116  }
1117  case Instruction::ExtractElement: {
1118  bool Reuse = CanReuseExtract(VL);
1119  if (Reuse) {
1120  DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
1121  } else {
1122  BS.cancelScheduling(VL);
1123  }
1124  newTreeEntry(VL, Reuse);
1125  return;
1126  }
1127  case Instruction::Load: {
1128  // Check if the loads are consecutive or if we need to swizzle them.
1129  for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
1130  LoadInst *L = cast<LoadInst>(VL[i]);
1131  if (!L->isSimple()) {
1132  BS.cancelScheduling(VL);
1133  newTreeEntry(VL, false);
1134  DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
1135  return;
1136  }
1137  const DataLayout &DL = F->getParent()->getDataLayout();
1138  if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
1139  if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {
1140  ++NumLoadsWantToChangeOrder;
1141  }
1142  BS.cancelScheduling(VL);
1143  newTreeEntry(VL, false);
1144  DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
1145  return;
1146  }
1147  }
1148  ++NumLoadsWantToKeepOrder;
1149  newTreeEntry(VL, true);
1150  DEBUG(dbgs() << "SLP: added a vector of loads.\n");
1151  return;
1152  }
1153  case Instruction::ZExt:
1154  case Instruction::SExt:
1155  case Instruction::FPToUI:
1156  case Instruction::FPToSI:
1157  case Instruction::FPExt:
1158  case Instruction::PtrToInt:
1159  case Instruction::IntToPtr:
1160  case Instruction::SIToFP:
1161  case Instruction::UIToFP:
1162  case Instruction::Trunc:
1163  case Instruction::FPTrunc:
1164  case Instruction::BitCast: {
1165  Type *SrcTy = VL0->getOperand(0)->getType();
1166  for (unsigned i = 0; i < VL.size(); ++i) {
1167  Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
1168  if (Ty != SrcTy || !isValidElementType(Ty)) {
1169  BS.cancelScheduling(VL);
1170  newTreeEntry(VL, false);
1171  DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
1172  return;
1173  }
1174  }
1175  newTreeEntry(VL, true);
1176  DEBUG(dbgs() << "SLP: added a vector of casts.\n");
1177 
1178  for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1179  ValueList Operands;
1180  // Prepare the operand vector.
1181  for (unsigned j = 0; j < VL.size(); ++j)
1182  Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1183 
1184  buildTree_rec(Operands, Depth+1);
1185  }
1186  return;
1187  }
1188  case Instruction::ICmp:
1189  case Instruction::FCmp: {
1190  // Check that all of the compares have the same predicate.
1191  CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
1192  Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
1193  for (unsigned i = 1, e = VL.size(); i < e; ++i) {
1194  CmpInst *Cmp = cast<CmpInst>(VL[i]);
1195  if (Cmp->getPredicate() != P0 ||
1196  Cmp->getOperand(0)->getType() != ComparedTy) {
1197  BS.cancelScheduling(VL);
1198  newTreeEntry(VL, false);
1199  DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
1200  return;
1201  }
1202  }
1203 
1204  newTreeEntry(VL, true);
1205  DEBUG(dbgs() << "SLP: added a vector of compares.\n");
1206 
1207  for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1208  ValueList Operands;
1209  // Prepare the operand vector.
1210  for (unsigned j = 0; j < VL.size(); ++j)
1211  Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1212 
1213  buildTree_rec(Operands, Depth+1);
1214  }
1215  return;
1216  }
1217  case Instruction::Select:
1218  case Instruction::Add:
1219  case Instruction::FAdd:
1220  case Instruction::Sub:
1221  case Instruction::FSub:
1222  case Instruction::Mul:
1223  case Instruction::FMul:
1224  case Instruction::UDiv:
1225  case Instruction::SDiv:
1226  case Instruction::FDiv:
1227  case Instruction::URem:
1228  case Instruction::SRem:
1229  case Instruction::FRem:
1230  case Instruction::Shl:
1231  case Instruction::LShr:
1232  case Instruction::AShr:
1233  case Instruction::And:
1234  case Instruction::Or:
1235  case Instruction::Xor: {
1236  newTreeEntry(VL, true);
1237  DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
1238 
1239  // Sort operands of the instructions so that each side is more likely to
1240  // have the same opcode.
1241  if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
1242  ValueList Left, Right;
1243  reorderInputsAccordingToOpcode(VL, Left, Right);
1244  buildTree_rec(Left, Depth + 1);
1245  buildTree_rec(Right, Depth + 1);
1246  return;
1247  }
1248 
1249  for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1250  ValueList Operands;
1251  // Prepare the operand vector.
1252  for (unsigned j = 0; j < VL.size(); ++j)
1253  Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1254 
1255  buildTree_rec(Operands, Depth+1);
1256  }
1257  return;
1258  }
1259  case Instruction::GetElementPtr: {
1260  // We don't combine GEPs with complicated (nested) indexing.
1261  for (unsigned j = 0; j < VL.size(); ++j) {
1262  if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
1263  DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
1264  BS.cancelScheduling(VL);
1265  newTreeEntry(VL, false);
1266  return;
1267  }
1268  }
1269 
1270  // We can't combine several GEPs into one vector if they operate on
1271  // different types.
1272  Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
1273  for (unsigned j = 0; j < VL.size(); ++j) {
1274  Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
1275  if (Ty0 != CurTy) {
1276  DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
1277  BS.cancelScheduling(VL);
1278  newTreeEntry(VL, false);
1279  return;
1280  }
1281  }
1282 
1283  // We don't combine GEPs with non-constant indexes.
1284  for (unsigned j = 0; j < VL.size(); ++j) {
1285  auto Op = cast<Instruction>(VL[j])->getOperand(1);
1286  if (!isa<ConstantInt>(Op)) {
1287  DEBUG(
1288  dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
1289  BS.cancelScheduling(VL);
1290  newTreeEntry(VL, false);
1291  return;
1292  }
1293  }
1294 
1295  newTreeEntry(VL, true);
1296  DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
1297  for (unsigned i = 0, e = 2; i < e; ++i) {
1298  ValueList Operands;
1299  // Prepare the operand vector.
1300  for (unsigned j = 0; j < VL.size(); ++j)
1301  Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1302 
1303  buildTree_rec(Operands, Depth + 1);
1304  }
1305  return;
1306  }
1307  case Instruction::Store: {
1308  const DataLayout &DL = F->getParent()->getDataLayout();
1309  // Check if the stores are consecutive or if we need to swizzle them.
1310  for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
1311  if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
1312  BS.cancelScheduling(VL);
1313  newTreeEntry(VL, false);
1314  DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
1315  return;
1316  }
1317 
1318  newTreeEntry(VL, true);
1319  DEBUG(dbgs() << "SLP: added a vector of stores.\n");
1320 
1321  ValueList Operands;
1322  for (unsigned j = 0; j < VL.size(); ++j)
1323  Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
1324 
1325  buildTree_rec(Operands, Depth + 1);
1326  return;
1327  }
1328  case Instruction::Call: {
1329  // Check if the calls are all to the same vectorizable intrinsic.
1330  CallInst *CI = cast<CallInst>(VL[0]);
1331  // Check if this is an Intrinsic call or something that can be
1332  // represented by an intrinsic call
1333  Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
1334  if (!isTriviallyVectorizable(ID)) {
1335  BS.cancelScheduling(VL);
1336  newTreeEntry(VL, false);
1337  DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
1338  return;
1339  }
1340  Function *Int = CI->getCalledFunction();
1341  Value *A1I = nullptr;
1342  if (hasVectorInstrinsicScalarOpd(ID, 1))
1343  A1I = CI->getArgOperand(1);
1344  for (unsigned i = 1, e = VL.size(); i != e; ++i) {
1345  CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
1346  if (!CI2 || CI2->getCalledFunction() != Int ||
1347  getIntrinsicIDForCall(CI2, TLI) != ID) {
1348  BS.cancelScheduling(VL);
1349  newTreeEntry(VL, false);
1350  DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
1351  << "\n");
1352  return;
1353  }
1354  // ctlz,cttz and powi are special intrinsics whose second argument
1355  // should be the same in order for them to be vectorized.
1356  if (hasVectorInstrinsicScalarOpd(ID, 1)) {
1357  Value *A1J = CI2->getArgOperand(1);
1358  if (A1I != A1J) {
1359  BS.cancelScheduling(VL);
1360  newTreeEntry(VL, false);
1361  DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
1362  << " argument "<< A1I<<"!=" << A1J
1363  << "\n");
1364  return;
1365  }
1366  }
1367  }
1368 
1369  newTreeEntry(VL, true);
1370  for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
1371  ValueList Operands;
1372  // Prepare the operand vector.
1373  for (unsigned j = 0; j < VL.size(); ++j) {
1374  CallInst *CI2 = dyn_cast<CallInst>(VL[j]);
1375  Operands.push_back(CI2->getArgOperand(i));
1376  }
1377  buildTree_rec(Operands, Depth + 1);
1378  }
1379  return;
1380  }
1381  case Instruction::ShuffleVector: {
1382  // If this is not an alternate sequence of opcode like add-sub
1383  // then do not vectorize this instruction.
1384  if (!isAltShuffle) {
1385  BS.cancelScheduling(VL);
1386  newTreeEntry(VL, false);
1387  DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
1388  return;
1389  }
1390  newTreeEntry(VL, true);
1391  DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
1392 
1393  // Reorder operands if reordering would enable vectorization.
1394  if (isa<BinaryOperator>(VL0)) {
1395  ValueList Left, Right;
1396  reorderAltShuffleOperands(VL, Left, Right);
1397  buildTree_rec(Left, Depth + 1);
1398  buildTree_rec(Right, Depth + 1);
1399  return;
1400  }
1401 
1402  for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1403  ValueList Operands;
1404  // Prepare the operand vector.
1405  for (unsigned j = 0; j < VL.size(); ++j)
1406  Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1407 
1408  buildTree_rec(Operands, Depth + 1);
1409  }
1410  return;
1411  }
1412  default:
1413  BS.cancelScheduling(VL);
1414  newTreeEntry(VL, false);
1415  DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
1416  return;
1417  }
1418 }
1419 
1420 int BoUpSLP::getEntryCost(TreeEntry *E) {
1421  ArrayRef<Value*> VL = E->Scalars;
1422 
1423  Type *ScalarTy = VL[0]->getType();
1424  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1425  ScalarTy = SI->getValueOperand()->getType();
1426  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1427 
1428  if (E->NeedToGather) {
1429  if (allConstant(VL))
1430  return 0;
1431  if (isSplat(VL)) {
1432  return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
1433  }
1434  return getGatherCost(E->Scalars);
1435  }
1436  unsigned Opcode = getSameOpcode(VL);
1437  assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
1438  Instruction *VL0 = cast<Instruction>(VL[0]);
1439  switch (Opcode) {
1440  case Instruction::PHI: {
1441  return 0;
1442  }
1443  case Instruction::ExtractElement: {
1444  if (CanReuseExtract(VL)) {
1445  int DeadCost = 0;
1446  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
1447  ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
1448  if (E->hasOneUse())
1449  // Take credit for instruction that will become dead.
1450  DeadCost +=
1451  TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
1452  }
1453  return -DeadCost;
1454  }
1455  return getGatherCost(VecTy);
1456  }
1457  case Instruction::ZExt:
1458  case Instruction::SExt:
1459  case Instruction::FPToUI:
1460  case Instruction::FPToSI:
1461  case Instruction::FPExt:
1462  case Instruction::PtrToInt:
1463  case Instruction::IntToPtr:
1464  case Instruction::SIToFP:
1465  case Instruction::UIToFP:
1466  case Instruction::Trunc:
1467  case Instruction::FPTrunc:
1468  case Instruction::BitCast: {
1469  Type *SrcTy = VL0->getOperand(0)->getType();
1470 
1471  // Calculate the cost of this instruction.
1472  int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
1473  VL0->getType(), SrcTy);
1474 
1475  VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
1476  int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
1477  return VecCost - ScalarCost;
1478  }
1479  case Instruction::FCmp:
1480  case Instruction::ICmp:
1481  case Instruction::Select:
1482  case Instruction::Add:
1483  case Instruction::FAdd:
1484  case Instruction::Sub:
1485  case Instruction::FSub:
1486  case Instruction::Mul:
1487  case Instruction::FMul:
1488  case Instruction::UDiv:
1489  case Instruction::SDiv:
1490  case Instruction::FDiv:
1491  case Instruction::URem:
1492  case Instruction::SRem:
1493  case Instruction::FRem:
1494  case Instruction::Shl:
1495  case Instruction::LShr:
1496  case Instruction::AShr:
1497  case Instruction::And:
1498  case Instruction::Or:
1499  case Instruction::Xor: {
1500  // Calculate the cost of this instruction.
1501  int ScalarCost = 0;
1502  int VecCost = 0;
1503  if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
1504  Opcode == Instruction::Select) {
1505  VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
1506  ScalarCost = VecTy->getNumElements() *
1507  TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
1508  VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
1509  } else {
1510  // Certain instructions can be cheaper to vectorize if they have a
1511  // constant second vector operand.
1512  TargetTransformInfo::OperandValueKind Op1VK =
1513  TargetTransformInfo::OK_AnyValue;
1514  TargetTransformInfo::OperandValueKind Op2VK =
1515  TargetTransformInfo::OK_UniformConstantValue;
1516  TargetTransformInfo::OperandValueProperties Op1VP =
1517  TargetTransformInfo::OP_None;
1518  TargetTransformInfo::OperandValueProperties Op2VP =
1519  TargetTransformInfo::OP_None;
1520 
1521  // If all operands are exactly the same ConstantInt then set the
1522  // operand kind to OK_UniformConstantValue.
1523  // If instead not all operands are constants, then set the operand kind
1524  // to OK_AnyValue. If all operands are constants but not the same,
1525  // then set the operand kind to OK_NonUniformConstantValue.
1526  ConstantInt *CInt = nullptr;
1527  for (unsigned i = 0; i < VL.size(); ++i) {
1528  const Instruction *I = cast<Instruction>(VL[i]);
1529  if (!isa<ConstantInt>(I->getOperand(1))) {
1530  Op2VK = TargetTransformInfo::OK_AnyValue;
1531  break;
1532  }
1533  if (i == 0) {
1534  CInt = cast<ConstantInt>(I->getOperand(1));
1535  continue;
1536  }
1537  if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
1538  CInt != cast<ConstantInt>(I->getOperand(1)))
1539  Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
1540  }
1541  // FIXME: Currently cost of model modification for division by
1542  // power of 2 is handled only for X86. Add support for other targets.
1543  if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
1544  CInt->getValue().isPowerOf2())
1545  Op2VP = TargetTransformInfo::OP_PowerOf2;
1546 
1547  ScalarCost = VecTy->getNumElements() *
1548  TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
1549  Op1VP, Op2VP);
1550  VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
1551  Op1VP, Op2VP);
1552  }
1553  return VecCost - ScalarCost;
1554  }
1555  case Instruction::GetElementPtr: {
1556  TargetTransformInfo::OperandValueKind Op1VK =
1557  TargetTransformInfo::OK_AnyValue;
1558  TargetTransformInfo::OperandValueKind Op2VK =
1559  TargetTransformInfo::OK_UniformConstantValue;
1560 
1561  int ScalarCost =
1562  VecTy->getNumElements() *
1563  TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
1564  int VecCost =
1565  TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
1566 
1567  return VecCost - ScalarCost;
1568  }
1569  case Instruction::Load: {
1570  // Cost of wide load - cost of scalar loads.
1571  int ScalarLdCost = VecTy->getNumElements() *
1572  TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
1573  int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
1574  return VecLdCost - ScalarLdCost;
1575  }
1576  case Instruction::Store: {
1577  // We know that we can merge the stores. Calculate the cost.
1578  int ScalarStCost = VecTy->getNumElements() *
1579  TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
1580  int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
1581  return VecStCost - ScalarStCost;
1582  }
1583  case Instruction::Call: {
1584  CallInst *CI = cast<CallInst>(VL0);
1585  Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
1586 
1587  // Calculate the cost of the scalar and vector calls.
1588  SmallVector<Type*, 4> ScalarTys, VecTys;
1589  for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
1590  ScalarTys.push_back(CI->getArgOperand(op)->getType());
1591  VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
1592  VecTy->getNumElements()));
1593  }
1594 
1595  int ScalarCallCost = VecTy->getNumElements() *
1596  TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);
1597 
1598  int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);
1599 
1600  DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
1601  << " (" << VecCallCost << "-" << ScalarCallCost << ")"
1602  << " for " << *CI << "\n");
1603 
1604  return VecCallCost - ScalarCallCost;
1605  }
1606  case Instruction::ShuffleVector: {
1607  TargetTransformInfo::OperandValueKind Op1VK =
1608  TargetTransformInfo::OK_AnyValue;
1609  TargetTransformInfo::OperandValueKind Op2VK =
1610  TargetTransformInfo::OK_AnyValue;
1611  int ScalarCost = 0;
1612  int VecCost = 0;
1613  for (unsigned i = 0; i < VL.size(); ++i) {
1614  Instruction *I = cast<Instruction>(VL[i]);
1615  if (!I)
1616  break;
1617  ScalarCost +=
1618  TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
1619  }
1620  // VecCost is equal to sum of the cost of creating 2 vectors
1621  // and the cost of creating shuffle.
1622  Instruction *I0 = cast<Instruction>(VL[0]);
1623  VecCost =
1624  TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
1625  Instruction *I1 = cast<Instruction>(VL[1]);
1626  VecCost +=
1627  TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
1628  VecCost +=
1629  TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
1630  return VecCost - ScalarCost;
1631  }
1632  default:
1633  llvm_unreachable("Unknown instruction");
1634  }
1635 }
1636 
1637 bool BoUpSLP::isFullyVectorizableTinyTree() {
1638  DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
1639  VectorizableTree.size() << " is fully vectorizable .\n");
1640 
1641  // We only handle trees of height 2.
1642  if (VectorizableTree.size() != 2)
1643  return false;
1644 
1645  // Handle splat and all-constants stores.
1646  if (!VectorizableTree[0].NeedToGather &&
1647  (allConstant(VectorizableTree[1].Scalars) ||
1648  isSplat(VectorizableTree[1].Scalars)))
1649  return true;
1650 
1651  // Gathering cost would be too much for tiny trees.
1652  if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
1653  return false;
1654 
1655  return true;
1656 }
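// For illustration (hypothetical IR; the pointers %p0..%p3 are assumed to be
// consecutive): a height-2 tree that passes this check is a bundle of
// consecutive stores (entry 0, not gathered) whose stored values form a splat
// (entry 1):
//   store float %x, float* %p0
//   store float %x, float* %p1
//   store float %x, float* %p2
//   store float %x, float* %p3
// The splat can be materialized cheaply (a single broadcast), so even this
// tiny tree is allowed to proceed to the cost calculation.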
1657 
1658 int BoUpSLP::getSpillCost() {
1659  // Walk from the bottom of the tree to the top, tracking which values are
1660  // live. When we see a call instruction that is not part of our tree,
1661  // query TTI to see if there is a cost to keeping values live over it
1662  // (for example, if spills and fills are required).
1663  unsigned BundleWidth = VectorizableTree.front().Scalars.size();
1664  int Cost = 0;
1665 
1666  SmallPtrSet<Instruction*, 4> LiveValues;
1667  Instruction *PrevInst = nullptr;
1668 
1669  for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
1670  Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
1671  if (!Inst)
1672  continue;
1673 
1674  if (!PrevInst) {
1675  PrevInst = Inst;
1676  continue;
1677  }
1678 
1679  DEBUG(
1680  dbgs() << "SLP: #LV: " << LiveValues.size();
1681  for (auto *X : LiveValues)
1682  dbgs() << " " << X->getName();
1683  dbgs() << ", Looking at ";
1684  Inst->dump();
1685  );
1686 
1687  // Update LiveValues.
1688  LiveValues.erase(PrevInst);
1689  for (auto &J : PrevInst->operands()) {
1690  if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
1691  LiveValues.insert(cast<Instruction>(&*J));
1692  }
1693 
1694  // Now find the sequence of instructions between PrevInst and Inst.
1695  BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst);
1696  --PrevInstIt;
1697  while (InstIt != PrevInstIt) {
1698  if (PrevInstIt == PrevInst->getParent()->rend()) {
1699  PrevInstIt = Inst->getParent()->rbegin();
1700  continue;
1701  }
1702 
1703  if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
1704  SmallVector<Type*, 4> V;
1705  for (auto *II : LiveValues)
1706  V.push_back(VectorType::get(II->getType(), BundleWidth));
1707  Cost += TTI->getCostOfKeepingLiveOverCall(V);
1708  }
1709 
1710  ++PrevInstIt;
1711  }
1712 
1713  PrevInst = Inst;
1714  }
1715 
1716  DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n");
1717  return Cost;
1718 }
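// For illustration (hypothetical example): if a bundle of four floats is live
// across an unrelated call such as
//   %s = call double @sin(double %d)
// the loop above asks TTI for the cost of keeping a <4 x float> value live
// over that call; on targets where calls clobber the vector registers this
// adds roughly the cost of a spill plus a reload to the tree cost.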
1719 
1720 int BoUpSLP::getTreeCost() {
1721  int Cost = 0;
1722  DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
1723  VectorizableTree.size() << ".\n");
1724 
1725  // We only vectorize tiny trees if they are fully vectorizable.
1726  if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
1727  if (VectorizableTree.empty()) {
1728  assert(!ExternalUses.size() && "We should not have any external users");
1729  }
1730  return INT_MAX;
1731  }
1732 
1733  unsigned BundleWidth = VectorizableTree[0].Scalars.size();
1734 
1735  for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) {
1736  int C = getEntryCost(&VectorizableTree[i]);
1737  DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
1738  << *VectorizableTree[i].Scalars[0] << " .\n");
1739  Cost += C;
1740  }
1741 
1742  SmallSet<Value *, 16> ExtractCostCalculated;
1743  int ExtractCost = 0;
1744  for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end();
1745  I != E; ++I) {
1746  // We only add extract cost once for the same scalar.
1747  if (!ExtractCostCalculated.insert(I->Scalar).second)
1748  continue;
1749 
1750  // Uses by ephemeral values are free (because the ephemeral value will be
1751  // removed prior to code generation, and so the extraction will be
1752  // removed as well).
1753  if (EphValues.count(I->User))
1754  continue;
1755 
1756  VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
1757  ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1758  I->Lane);
1759  }
1760 
1761  Cost += getSpillCost();
1762 
1763  DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");
1764  return Cost + ExtractCost;
1765 }
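// Worked example (illustrative only): for a tree of four consecutive float
// stores fed by four float additions, the returned value is roughly
//   (vector add cost   - 4 * scalar add cost)
// + (vector store cost - 4 * scalar store cost)
// + extract costs for scalars that still have users outside the tree
// + the spill cost computed by getSpillCost().
// A negative total (with the default -slp-threshold of 0) means vectorization
// is considered profitable.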
1766 
1767 int BoUpSLP::getGatherCost(Type *Ty) {
1768  int Cost = 0;
1769  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
1770  Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
1771  return Cost;
1772 }
1773 
1774 int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
1775  // Find the type of the operands in VL.
1776  Type *ScalarTy = VL[0]->getType();
1777  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1778  ScalarTy = SI->getValueOperand()->getType();
1779  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1780  // Find the cost of inserting/extracting values from the vector.
1781  return getGatherCost(VecTy);
1782 }
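// For example, gathering four i32 scalars is modeled as building a <4 x i32>
// vector, and the reported cost is simply the sum of the four insertelement
// costs that TTI returns for lanes 0 through 3.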
1783 
1784 Value *BoUpSLP::getPointerOperand(Value *I) {
1785  if (LoadInst *LI = dyn_cast<LoadInst>(I))
1786  return LI->getPointerOperand();
1787  if (StoreInst *SI = dyn_cast<StoreInst>(I))
1788  return SI->getPointerOperand();
1789  return nullptr;
1790 }
1791 
1792 unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
1793  if (LoadInst *L = dyn_cast<LoadInst>(I))
1794  return L->getPointerAddressSpace();
1795  if (StoreInst *S = dyn_cast<StoreInst>(I))
1796  return S->getPointerAddressSpace();
1797  return -1;
1798 }
1799 
1800 bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) {
1801  Value *PtrA = getPointerOperand(A);
1802  Value *PtrB = getPointerOperand(B);
1803  unsigned ASA = getAddressSpaceOperand(A);
1804  unsigned ASB = getAddressSpaceOperand(B);
1805 
1806  // Check that the address spaces match and that the pointers are valid.
1807  if (!PtrA || !PtrB || (ASA != ASB))
1808  return false;
1809 
1810  // Make sure that A and B are different pointers of the same type.
1811  if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
1812  return false;
1813 
1814  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
1815  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
1816  APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
1817 
1818  APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
1819  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1820  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1821 
1822  APInt OffsetDelta = OffsetB - OffsetA;
1823 
1824  // Check if they are based on the same pointer. That makes the offsets
1825  // sufficient.
1826  if (PtrA == PtrB)
1827  return OffsetDelta == Size;
1828 
1829  // Compute the base pointer delta that is needed to make the final delta
1830  // equal to the size.
1831  APInt BaseDelta = Size - OffsetDelta;
1832 
1833  // Otherwise compute the distance with SCEV between the base pointers.
1834  const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
1835  const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
1836  const SCEV *C = SE->getConstant(BaseDelta);
1837  const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
1838  return X == PtrSCEVB;
1839 }
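// For illustration (hypothetical IR; %base, %a and %b are made-up names):
//   %p0 = getelementptr inbounds float, float* %base, i64 0
//   %p1 = getelementptr inbounds float, float* %base, i64 1
//   store float %a, float* %p0
//   store float %b, float* %p1
// After stripping the constant in-bounds offsets both pointers reduce to
// %base with offsets 0 and 4, so OffsetDelta equals the 4-byte store size of
// float and the fast path (PtrA == PtrB) returns true without querying SCEV.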
1840 
1841 // Reorder commutative operations in alternate shuffle if the resulting vectors
1842 // are consecutive loads. This would allow us to vectorize the tree.
1843 // If we have something like:
1844 // load a[0] - load b[0]
1845 // load b[1] + load a[1]
1846 // load a[2] - load b[2]
1847 // load a[3] + load b[3]
1848 // then reordering the second pair (load b[1], load a[1]) would allow us to
1849 // vectorize this code.
1850 void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
1851  SmallVectorImpl<Value *> &Left,
1852  SmallVectorImpl<Value *> &Right) {
1853  const DataLayout &DL = F->getParent()->getDataLayout();
1854 
1855  // Push left and right operands of binary operation into Left and Right
1856  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
1857  Left.push_back(cast<Instruction>(VL[i])->getOperand(0));
1858  Right.push_back(cast<Instruction>(VL[i])->getOperand(1));
1859  }
1860 
1861  // Reorder if we have a commutative operation and the consecutive accesses
1862  // are on either side of the alternate instructions.
1863  for (unsigned j = 0; j < VL.size() - 1; ++j) {
1864  if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
1865  if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
1866  Instruction *VL1 = cast<Instruction>(VL[j]);
1867  Instruction *VL2 = cast<Instruction>(VL[j + 1]);
1868  if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
1869  std::swap(Left[j], Right[j]);
1870  continue;
1871  } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
1872  std::swap(Left[j + 1], Right[j + 1]);
1873  continue;
1874  }
1875  // else unchanged
1876  }
1877  }
1878  if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
1879  if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
1880  Instruction *VL1 = cast<Instruction>(VL[j]);
1881  Instruction *VL2 = cast<Instruction>(VL[j + 1]);
1882  if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
1883  std::swap(Left[j], Right[j]);
1884  continue;
1885  } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
1886  std::swap(Left[j + 1], Right[j + 1]);
1887  continue;
1888  }
1889  // else unchanged
1890  }
1891  }
1892  }
1893 }
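// For illustration, applied to the sequence in the comment above: the initial
// operand split is
//   Left  = [a[0], b[1], a[2], a[3]]   Right = [b[0], a[1], b[2], b[3]]
// and swapping lane 1 (whose scalar is the commutative add) gives
//   Left  = [a[0], a[1], a[2], a[3]]   Right = [b[0], b[1], b[2], b[3]]
// so both operand vectors become chains of consecutive loads.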
1894 
1895 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
1896  SmallVectorImpl<Value *> &Left,
1897  SmallVectorImpl<Value *> &Right) {
1898 
1899  SmallVector<Value *, 16> OrigLeft, OrigRight;
1900 
1901  bool AllSameOpcodeLeft = true;
1902  bool AllSameOpcodeRight = true;
1903  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1904  Instruction *I = cast<Instruction>(VL[i]);
1905  Value *VLeft = I->getOperand(0);
1906  Value *VRight = I->getOperand(1);
1907 
1908  OrigLeft.push_back(VLeft);
1909  OrigRight.push_back(VRight);
1910 
1911  Instruction *ILeft = dyn_cast<Instruction>(VLeft);
1912  Instruction *IRight = dyn_cast<Instruction>(VRight);
1913 
1914  // Check whether all operands on one side have the same opcode. In this case
1915  // we want to preserve the original order and not make things worse by
1916  // reordering.
1917  if (i && AllSameOpcodeLeft && ILeft) {
1918  if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) {
1919  if (PLeft->getOpcode() != ILeft->getOpcode())
1920  AllSameOpcodeLeft = false;
1921  } else
1922  AllSameOpcodeLeft = false;
1923  }
1924  if (i && AllSameOpcodeRight && IRight) {
1925  if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) {
1926  if (PRight->getOpcode() != IRight->getOpcode())
1927  AllSameOpcodeRight = false;
1928  } else
1929  AllSameOpcodeRight = false;
1930  }
1931 
1932  // Sort two opcodes. In the code below we try to preserve the ability to use
1933  // broadcast of values instead of individual inserts.
1934  // vl1 = load
1935  // vl2 = phi
1936  // vr1 = load
1937  // vr2 = vr1
1938  // = vl1 x vr1
1939  // = vl2 x vr2
1940  // If we just sorted according to opcode we would leave the first line
1941  // intact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
1942  // = vl1 x vr1
1943  // = vr2 x vl2
1944  // Because vr2 and vr1 are from the same load we lose the opportunity of a
1945  // broadcast for the packed right side in the backend: we have [vr1, vl2]
1946  // instead of [vr1, vr2=vr1].
1947  if (ILeft && IRight) {
1948  if (!i && ILeft->getOpcode() > IRight->getOpcode()) {
1949  Left.push_back(IRight);
1950  Right.push_back(ILeft);
1951  } else if (i && ILeft->getOpcode() > IRight->getOpcode() &&
1952  Right[i - 1] != IRight) {
1953  // Try not to destroy a broadcast for no apparent benefit.
1954  Left.push_back(IRight);
1955  Right.push_back(ILeft);
1956  } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
1957  Right[i - 1] == ILeft) {
1958  // Try to preserve broadcasts.
1959  Left.push_back(IRight);
1960  Right.push_back(ILeft);
1961  } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
1962  Left[i - 1] == IRight) {
1963  // Try to preserve broadcasts.
1964  Left.push_back(IRight);
1965  Right.push_back(ILeft);
1966  } else {
1967  Left.push_back(ILeft);
1968  Right.push_back(IRight);
1969  }
1970  continue;
1971  }
1972  // At most one operand is an instruction; put that instruction on the right.
1973  if (ILeft) {
1974  Left.push_back(VRight);
1975  Right.push_back(ILeft);
1976  continue;
1977  }
1978  Left.push_back(VLeft);
1979  Right.push_back(VRight);
1980  }
1981 
1982  bool LeftBroadcast = isSplat(Left);
1983  bool RightBroadcast = isSplat(Right);
1984 
1985  // If either operand list ends up being a broadcast, return this operand order.
1986  if (LeftBroadcast || RightBroadcast)
1987  return;
1988 
1989  // Don't reorder if the operands were good to begin with.
1990  if (AllSameOpcodeRight || AllSameOpcodeLeft) {
1991  Left = OrigLeft;
1992  Right = OrigRight;
1993  }
1994 
1995  const DataLayout &DL = F->getParent()->getDataLayout();
1996 
1997  // Finally check if we can get a longer vectorizable chain by reordering
1998  // without breaking the good operand order detected above.
1999  // E.g., if we have something like:
2000  // load a[0] load b[0]
2001  // load b[1] load a[1]
2002  // load a[2] load b[2]
2003  // load a[3] load b[3]
2004  // then reordering the second pair (load b[1], load a[1]) would allow us to
2005  // vectorize this code while still retaining the AllSameOpcode property.
2006  // FIXME: This load reordering might break AllSameOpcode in some rare cases
2007  // such as-
2008  // add a[0],c[0] load b[0]
2009  // add a[1],c[2] load b[1]
2010  // b[2] load b[2]
2011  // add a[3],c[3] load b[3]
2012  for (unsigned j = 0; j < VL.size() - 1; ++j) {
2013  if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
2014  if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
2015  if (isConsecutiveAccess(L, L1, DL)) {
2016  std::swap(Left[j + 1], Right[j + 1]);
2017  continue;
2018  }
2019  }
2020  }
2021  if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
2022  if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
2023  if (isConsecutiveAccess(L, L1, DL)) {
2024  std::swap(Left[j + 1], Right[j + 1]);
2025  continue;
2026  }
2027  }
2028  }
2029  // else unchanged
2030  }
2031 }
2032 
2033 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
2034  Instruction *VL0 = cast<Instruction>(VL[0]);
2035  BasicBlock::iterator NextInst = VL0;
2036  ++NextInst;
2037  Builder.SetInsertPoint(VL0->getParent(), NextInst);
2038  Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
2039 }
2040 
2041 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
2042  Value *Vec = UndefValue::get(Ty);
2043  // Generate the 'InsertElement' instruction.
2044  for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
2045  Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
2046  if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
2047  GatherSeq.insert(Insrt);
2048  CSEBlocks.insert(Insrt->getParent());
2049 
2050  // Add to our 'need-to-extract' list.
2051  if (ScalarToTreeEntry.count(VL[i])) {
2052  int Idx = ScalarToTreeEntry[VL[i]];
2053  TreeEntry *E = &VectorizableTree[Idx];
2054  // Find which lane we need to extract.
2055  int FoundLane = -1;
2056  for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
2057  // Is this the lane of the scalar that we are looking for?
2058  if (E->Scalars[Lane] == VL[i]) {
2059  FoundLane = Lane;
2060  break;
2061  }
2062  }
2063  assert(FoundLane >= 0 && "Could not find the correct lane");
2064  ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
2065  }
2066  }
2067  }
2068 
2069  return Vec;
2070 }
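// For illustration (hypothetical scalars %a..%d): gathering four floats into
// a <4 x float> emits an insertelement chain of the form
//   %v0 = insertelement <4 x float> undef, float %a, i32 0
//   %v1 = insertelement <4 x float> %v0, float %b, i32 1
//   %v2 = insertelement <4 x float> %v1, float %c, i32 2
//   %v3 = insertelement <4 x float> %v2, float %d, i32 3
// Each insert is recorded in GatherSeq and its block in CSEBlocks so that
// optimizeGatherSequence() can later hoist or CSE the sequence.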
2071 
2072 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
2073  SmallDenseMap<Value*, int>::const_iterator Entry
2074  = ScalarToTreeEntry.find(VL[0]);
2075  if (Entry != ScalarToTreeEntry.end()) {
2076  int Idx = Entry->second;
2077  const TreeEntry *En = &VectorizableTree[Idx];
2078  if (En->isSame(VL) && En->VectorizedValue)
2079  return En->VectorizedValue;
2080  }
2081  return nullptr;
2082 }
2083 
2084 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
2085  if (ScalarToTreeEntry.count(VL[0])) {
2086  int Idx = ScalarToTreeEntry[VL[0]];
2087  TreeEntry *E = &VectorizableTree[Idx];
2088  if (E->isSame(VL))
2089  return vectorizeTree(E);
2090  }
2091 
2092  Type *ScalarTy = VL[0]->getType();
2093  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
2094  ScalarTy = SI->getValueOperand()->getType();
2095  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
2096 
2097  return Gather(VL, VecTy);
2098 }
2099 
2100 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
2101  IRBuilder<>::InsertPointGuard Guard(Builder);
2102 
2103  if (E->VectorizedValue) {
2104  DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
2105  return E->VectorizedValue;
2106  }
2107 
2108  Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
2109  Type *ScalarTy = VL0->getType();
2110  if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
2111  ScalarTy = SI->getValueOperand()->getType();
2112  VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
2113 
2114  if (E->NeedToGather) {
2115  setInsertPointAfterBundle(E->Scalars);
2116  return Gather(E->Scalars, VecTy);
2117  }
2118 
2119  const DataLayout &DL = F->getParent()->getDataLayout();
2120  unsigned Opcode = getSameOpcode(E->Scalars);
2121 
2122  switch (Opcode) {
2123  case Instruction::PHI: {
2124  PHINode *PH = dyn_cast<PHINode>(VL0);
2125  Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
2126  Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2127  PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
2128  E->VectorizedValue = NewPhi;
2129 
2130  // PHINodes may have multiple entries from the same block. We want to
2131  // visit every block once.
2132  SmallSet<BasicBlock*, 4> VisitedBBs;
2133 
2134  for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
2135  ValueList Operands;
2136  BasicBlock *IBB = PH->getIncomingBlock(i);
2137 
2138  if (!VisitedBBs.insert(IBB).second) {
2139  NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
2140  continue;
2141  }
2142 
2143  // Prepare the operand vector.
2144  for (Value *V : E->Scalars)
2145  Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
2146 
2147  Builder.SetInsertPoint(IBB->getTerminator());
2148  Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2149  Value *Vec = vectorizeTree(Operands);
2150  NewPhi->addIncoming(Vec, IBB);
2151  }
2152 
2153  assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
2154  "Invalid number of incoming values");
2155  return NewPhi;
2156  }
2157 
2158  case Instruction::ExtractElement: {
2159  if (CanReuseExtract(E->Scalars)) {
2160  Value *V = VL0->getOperand(0);
2161  E->VectorizedValue = V;
2162  return V;
2163  }
2164  return Gather(E->Scalars, VecTy);
2165  }
2166  case Instruction::ZExt:
2167  case Instruction::SExt:
2168  case Instruction::FPToUI:
2169  case Instruction::FPToSI:
2170  case Instruction::FPExt:
2171  case Instruction::PtrToInt:
2172  case Instruction::IntToPtr:
2173  case Instruction::SIToFP:
2174  case Instruction::UIToFP:
2175  case Instruction::Trunc:
2176  case Instruction::FPTrunc:
2177  case Instruction::BitCast: {
2178  ValueList INVL;
2179  for (Value *V : E->Scalars)
2180  INVL.push_back(cast<Instruction>(V)->getOperand(0));
2181 
2182  setInsertPointAfterBundle(E->Scalars);
2183 
2184  Value *InVec = vectorizeTree(INVL);
2185 
2186  if (Value *V = alreadyVectorized(E->Scalars))
2187  return V;
2188 
2189  CastInst *CI = dyn_cast<CastInst>(VL0);
2190  Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
2191  E->VectorizedValue = V;
2192  ++NumVectorInstructions;
2193  return V;
2194  }
2195  case Instruction::FCmp:
2196  case Instruction::ICmp: {
2197  ValueList LHSV, RHSV;
2198  for (Value *V : E->Scalars) {
2199  LHSV.push_back(cast<Instruction>(V)->getOperand(0));
2200  RHSV.push_back(cast<Instruction>(V)->getOperand(1));
2201  }
2202 
2203  setInsertPointAfterBundle(E->Scalars);
2204 
2205  Value *L = vectorizeTree(LHSV);
2206  Value *R = vectorizeTree(RHSV);
2207 
2208  if (Value *V = alreadyVectorized(E->Scalars))
2209  return V;
2210 
2211  CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
2212  Value *V;
2213  if (Opcode == Instruction::FCmp)
2214  V = Builder.CreateFCmp(P0, L, R);
2215  else
2216  V = Builder.CreateICmp(P0, L, R);
2217 
2218  E->VectorizedValue = V;
2219  ++NumVectorInstructions;
2220  return V;
2221  }
2222  case Instruction::Select: {
2223  ValueList TrueVec, FalseVec, CondVec;
2224  for (Value *V : E->Scalars) {
2225  CondVec.push_back(cast<Instruction>(V)->getOperand(0));
2226  TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
2227  FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
2228  }
2229 
2230  setInsertPointAfterBundle(E->Scalars);
2231 
2232  Value *Cond = vectorizeTree(CondVec);
2233  Value *True = vectorizeTree(TrueVec);
2234  Value *False = vectorizeTree(FalseVec);
2235 
2236  if (Value *V = alreadyVectorized(E->Scalars))
2237  return V;
2238 
2239  Value *V = Builder.CreateSelect(Cond, True, False);
2240  E->VectorizedValue = V;
2241  ++NumVectorInstructions;
2242  return V;
2243  }
2244  case Instruction::Add:
2245  case Instruction::FAdd:
2246  case Instruction::Sub:
2247  case Instruction::FSub:
2248  case Instruction::Mul:
2249  case Instruction::FMul:
2250  case Instruction::UDiv:
2251  case Instruction::SDiv:
2252  case Instruction::FDiv:
2253  case Instruction::URem:
2254  case Instruction::SRem:
2255  case Instruction::FRem:
2256  case Instruction::Shl:
2257  case Instruction::LShr:
2258  case Instruction::AShr:
2259  case Instruction::And:
2260  case Instruction::Or:
2261  case Instruction::Xor: {
2262  ValueList LHSVL, RHSVL;
2263  if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
2264  reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
2265  else
2266  for (Value *V : E->Scalars) {
2267  LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
2268  RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
2269  }
2270 
2271  setInsertPointAfterBundle(E->Scalars);
2272 
2273  Value *LHS = vectorizeTree(LHSVL);
2274  Value *RHS = vectorizeTree(RHSVL);
2275 
2276  if (LHS == RHS && isa<Instruction>(LHS)) {
2277  assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
2278  }
2279 
2280  if (Value *V = alreadyVectorized(E->Scalars))
2281  return V;
2282 
2283  BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
2284  Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
2285  E->VectorizedValue = V;
2286  propagateIRFlags(E->VectorizedValue, E->Scalars);
2287  ++NumVectorInstructions;
2288 
2289  if (Instruction *I = dyn_cast<Instruction>(V))
2290  return propagateMetadata(I, E->Scalars);
2291 
2292  return V;
2293  }
2294  case Instruction::Load: {
2295  // Loads are inserted at the head of the tree because we don't want to
2296  // sink them all the way down past store instructions.
2297  setInsertPointAfterBundle(E->Scalars);
2298 
2299  LoadInst *LI = cast<LoadInst>(VL0);
2300  Type *ScalarLoadTy = LI->getType();
2301  unsigned AS = LI->getPointerAddressSpace();
2302 
2303  Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
2304  VecTy->getPointerTo(AS));
2305 
2306  // The pointer operand uses an in-tree scalar so we add the new BitCast to
2307  // the ExternalUses list to make sure that an extract will be generated in the
2308  // future.
2309  if (ScalarToTreeEntry.count(LI->getPointerOperand()))
2310  ExternalUses.push_back(
2311  ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
2312 
2313  unsigned Alignment = LI->getAlignment();
2314  LI = Builder.CreateLoad(VecPtr);
2315  if (!Alignment) {
2316  Alignment = DL.getABITypeAlignment(ScalarLoadTy);
2317  }
2318  LI->setAlignment(Alignment);
2319  E->VectorizedValue = LI;
2320  ++NumVectorInstructions;
2321  return propagateMetadata(LI, E->Scalars);
2322  }
2323  case Instruction::Store: {
2324  StoreInst *SI = cast<StoreInst>(VL0);
2325  unsigned Alignment = SI->getAlignment();
2326  unsigned AS = SI->getPointerAddressSpace();
2327 
2328  ValueList ValueOp;
2329  for (Value *V : E->Scalars)
2330  ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
2331 
2332  setInsertPointAfterBundle(E->Scalars);
2333 
2334  Value *VecValue = vectorizeTree(ValueOp);
2335  Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
2336  VecTy->getPointerTo(AS));
2337  StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
2338 
2339  // The pointer operand uses an in-tree scalar so we add the new BitCast to
2340  // the ExternalUses list to make sure that an extract will be generated in the
2341  // future.
2342  if (ScalarToTreeEntry.count(SI->getPointerOperand()))
2343  ExternalUses.push_back(
2344  ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
2345 
2346  if (!Alignment) {
2347  Alignment = DL.getABITypeAlignment(SI->getValueOperand()->getType());
2348  }
2349  S->setAlignment(Alignment);
2350  E->VectorizedValue = S;
2351  ++NumVectorInstructions;
2352  return propagateMetadata(S, E->Scalars);
2353  }
2354  case Instruction::GetElementPtr: {
2355  setInsertPointAfterBundle(E->Scalars);
2356 
2357  ValueList Op0VL;
2358  for (Value *V : E->Scalars)
2359  Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
2360 
2361  Value *Op0 = vectorizeTree(Op0VL);
2362 
2363  std::vector<Value *> OpVecs;
2364  for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
2365  ++j) {
2366  ValueList OpVL;
2367  for (Value *V : E->Scalars)
2368  OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
2369 
2370  Value *OpVec = vectorizeTree(OpVL);
2371  OpVecs.push_back(OpVec);
2372  }
2373 
2374  Value *V = Builder.CreateGEP(
2375  cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
2376  E->VectorizedValue = V;
2377  ++NumVectorInstructions;
2378 
2379  if (Instruction *I = dyn_cast<Instruction>(V))
2380  return propagateMetadata(I, E->Scalars);
2381 
2382  return V;
2383  }
2384  case Instruction::Call: {
2385  CallInst *CI = cast<CallInst>(VL0);
2386  setInsertPointAfterBundle(E->Scalars);
2387  Function *FI;
2388  Intrinsic::ID IID = Intrinsic::not_intrinsic;
2389  Value *ScalarArg = nullptr;
2390  if (CI && (FI = CI->getCalledFunction())) {
2391  IID = FI->getIntrinsicID();
2392  }
2393  std::vector<Value *> OpVecs;
2394  for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
2395  ValueList OpVL;
2396  // ctlz,cttz and powi are special intrinsics whose second argument is
2397  // a scalar. This argument should not be vectorized.
2398  if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
2399  CallInst *CEI = cast<CallInst>(E->Scalars[0]);
2400  ScalarArg = CEI->getArgOperand(j);
2401  OpVecs.push_back(CEI->getArgOperand(j));
2402  continue;
2403  }
2404  for (Value *V : E->Scalars) {
2405  CallInst *CEI = cast<CallInst>(V);
2406  OpVL.push_back(CEI->getArgOperand(j));
2407  }
2408 
2409  Value *OpVec = vectorizeTree(OpVL);
2410  DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
2411  OpVecs.push_back(OpVec);
2412  }
2413 
2414  Module *M = F->getParent();
2415  Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
2416  Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
2417  Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
2418  Value *V = Builder.CreateCall(CF, OpVecs);
2419 
2420  // The scalar argument uses an in-tree scalar so we add the new vectorized
2421  // call to the ExternalUses list to make sure that an extract will be
2422  // generated in the future.
2423  if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
2424  ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
2425 
2426  E->VectorizedValue = V;
2427  ++NumVectorInstructions;
2428  return V;
2429  }
2430  case Instruction::ShuffleVector: {
2431  ValueList LHSVL, RHSVL;
2432  assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
2433  reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
2434  setInsertPointAfterBundle(E->Scalars);
2435 
2436  Value *LHS = vectorizeTree(LHSVL);
2437  Value *RHS = vectorizeTree(RHSVL);
2438 
2439  if (Value *V = alreadyVectorized(E->Scalars))
2440  return V;
2441 
2442  // Create a vector of LHS op1 RHS
2443  BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
2444  Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
2445 
2446  // Create a vector of LHS op2 RHS
2447  Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
2448  BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
2449  Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
2450 
2451  // Create shuffle to take alternate operations from the vector.
2452  // Also, gather up odd and even scalar ops to propagate IR flags to
2453  // each vector operation.
2454  ValueList OddScalars, EvenScalars;
2455  unsigned e = E->Scalars.size();
2456  SmallVector<Constant *, 8> Mask(e);
2457  for (unsigned i = 0; i < e; ++i) {
2458  if (i & 1) {
2459  Mask[i] = Builder.getInt32(e + i);
2460  OddScalars.push_back(E->Scalars[i]);
2461  } else {
2462  Mask[i] = Builder.getInt32(i);
2463  EvenScalars.push_back(E->Scalars[i]);
2464  }
2465  }
2466 
2467  Value *ShuffleMask = ConstantVector::get(Mask);
2468  propagateIRFlags(V0, EvenScalars);
2469  propagateIRFlags(V1, OddScalars);
2470 
2471  Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2472  E->VectorizedValue = V;
2473  ++NumVectorInstructions;
2474  if (Instruction *I = dyn_cast<Instruction>(V))
2475  return propagateMetadata(I, E->Scalars);
2476 
2477  return V;
2478  }
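// For illustration: for a 4-wide alternating sub/add bundle the mask built
// above is <i32 0, i32 5, i32 2, i32 7>, i.e. even lanes are taken from V0
// (the vector built with the first opcode) and odd lanes from V1 (the vector
// built with the second opcode), recreating the original sub/add pattern.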
2479  default:
2480  llvm_unreachable("unknown inst");
2481  }
2482  return nullptr;
2483 }
2484 
2485 Value *BoUpSLP::vectorizeTree() {
2486 
2487  // All blocks must be scheduled before any instructions are inserted.
2488  for (auto &BSIter : BlocksSchedules) {
2489  scheduleBlock(BSIter.second.get());
2490  }
2491 
2492  Builder.SetInsertPoint(F->getEntryBlock().begin());
2493  vectorizeTree(&VectorizableTree[0]);
2494 
2495  DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
2496 
2497  // Extract all of the elements with the external uses.
2498  for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
2499  it != e; ++it) {
2500  Value *Scalar = it->Scalar;
2501  llvm::User *User = it->User;
2502 
2503  // Skip users that we have already RAUWed. This happens when one instruction
2504  // has multiple uses of the same value.
2505  if (std::find(Scalar->user_begin(), Scalar->user_end(), User) ==
2506  Scalar->user_end())
2507  continue;
2508  assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
2509 
2510  int Idx = ScalarToTreeEntry[Scalar];
2511  TreeEntry *E = &VectorizableTree[Idx];
2512  assert(!E->NeedToGather && "Extracting from a gather list");
2513 
2514  Value *Vec = E->VectorizedValue;
2515  assert(Vec && "Can't find vectorizable value");
2516 
2517  Value *Lane = Builder.getInt32(it->Lane);
2518  // Generate extracts for out-of-tree users.
2519  // Find the insertion point for the extractelement lane.
2520  if (isa<Instruction>(Vec)){
2521  if (PHINode *PH = dyn_cast<PHINode>(User)) {
2522  for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
2523  if (PH->getIncomingValue(i) == Scalar) {
2524  Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
2525  Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2526  CSEBlocks.insert(PH->getIncomingBlock(i));
2527  PH->setOperand(i, Ex);
2528  }
2529  }
2530  } else {
2531  Builder.SetInsertPoint(cast<Instruction>(User));
2532  Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2533  CSEBlocks.insert(cast<Instruction>(User)->getParent());
2534  User->replaceUsesOfWith(Scalar, Ex);
2535  }
2536  } else {
2537  Builder.SetInsertPoint(F->getEntryBlock().begin());
2538  Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2539  CSEBlocks.insert(&F->getEntryBlock());
2540  User->replaceUsesOfWith(Scalar, Ex);
2541  }
2542 
2543  DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
2544  }
2545 
2546  // For each vectorized value:
2547  for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
2548  TreeEntry *Entry = &VectorizableTree[EIdx];
2549 
2550  // For each lane:
2551  for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
2552  Value *Scalar = Entry->Scalars[Lane];
2553  // No need to handle users of gathered values.
2554  if (Entry->NeedToGather)
2555  continue;
2556 
2557  assert(Entry->VectorizedValue && "Can't find vectorizable value");
2558 
2559  Type *Ty = Scalar->getType();
2560  if (!Ty->isVoidTy()) {
2561 #ifndef NDEBUG
2562  for (User *U : Scalar->users()) {
2563  DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
2564 
2565  assert((ScalarToTreeEntry.count(U) ||
2566  // It is legal to replace users in the ignorelist by undef.
2567  (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) !=
2568  UserIgnoreList.end())) &&
2569  "Replacing out-of-tree value with undef");
2570  }
2571 #endif
2572  Value *Undef = UndefValue::get(Ty);
2573  Scalar->replaceAllUsesWith(Undef);
2574  }
2575  DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
2576  eraseInstruction(cast<Instruction>(Scalar));
2577  }
2578  }
2579 
2580  Builder.ClearInsertionPoint();
2581 
2582  return VectorizableTree[0].VectorizedValue;
2583 }
2584 
2585 void BoUpSLP::optimizeGatherSequence() {
2586  DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
2587  << " gather sequence instructions.\n");
2588  // LICM InsertElementInst sequences.
2589  for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
2590  e = GatherSeq.end(); it != e; ++it) {
2591  InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
2592 
2593  if (!Insert)
2594  continue;
2595 
2596  // Check if this block is inside a loop.
2597  Loop *L = LI->getLoopFor(Insert->getParent());
2598  if (!L)
2599  continue;
2600 
2601  // Check if it has a preheader.
2602  BasicBlock *PreHeader = L->getLoopPreheader();
2603  if (!PreHeader)
2604  continue;
2605 
2606  // If the vector or the element that we insert into it is an
2607  // instruction that is defined inside the loop, then we can't
2608  // hoist this instruction.
2609  Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
2610  Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
2611  if (CurrVec && L->contains(CurrVec))
2612  continue;
2613  if (NewElem && L->contains(NewElem))
2614  continue;
2615 
2616  // We can hoist this instruction. Move it to the pre-header.
2617  Insert->moveBefore(PreHeader->getTerminator());
2618  }
2619 
2620  // Make a list of all reachable blocks in our CSE queue.
2621  SmallVector<const DomTreeNode *, 8> CSEWorkList;
2622  CSEWorkList.reserve(CSEBlocks.size());
2623  for (BasicBlock *BB : CSEBlocks)
2624  if (DomTreeNode *N = DT->getNode(BB)) {
2625  assert(DT->isReachableFromEntry(N));
2626  CSEWorkList.push_back(N);
2627  }
2628 
2629  // Sort blocks by domination. This ensures we visit a block after all blocks
2630  // dominating it are visited.
2631  std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
2632  [this](const DomTreeNode *A, const DomTreeNode *B) {
2633  return DT->properlyDominates(A, B);
2634  });
2635 
2636  // Perform O(N^2) search over the gather sequences and merge identical
2637  // instructions. TODO: We can further optimize this scan if we split the
2638  // instructions into different buckets based on the insert lane.
2639  SmallVector<Instruction *, 16> Visited;
2640  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
2641  assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
2642  "Worklist not sorted properly!");
2643  BasicBlock *BB = (*I)->getBlock();
2644  // For all instructions in blocks containing gather sequences:
2645  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
2646  Instruction *In = it++;
2647  if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
2648  continue;
2649 
2650  // Check if we can replace this instruction with any of the
2651  // visited instructions.
2652  for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
2653  ve = Visited.end();
2654  v != ve; ++v) {
2655  if (In->isIdenticalTo(*v) &&
2656  DT->dominates((*v)->getParent(), In->getParent())) {
2657  In->replaceAllUsesWith(*v);
2658  eraseInstruction(In);
2659  In = nullptr;
2660  break;
2661  }
2662  }
2663  if (In) {
2664  assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
2665  Visited.push_back(In);
2666  }
2667  }
2668  }
2669  CSEBlocks.clear();
2670  GatherSeq.clear();
2671 }
2672 
2673 // Groups the instructions into a bundle (which is then a single scheduling entity)
2674 // and schedules instructions until the bundle gets ready.
2675 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
2676  BoUpSLP *SLP) {
2677  if (isa<PHINode>(VL[0]))
2678  return true;
2679 
2680  // Initialize the instruction bundle.
2681  Instruction *OldScheduleEnd = ScheduleEnd;
2682  ScheduleData *PrevInBundle = nullptr;
2683  ScheduleData *Bundle = nullptr;
2684  bool ReSchedule = false;
2685  DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");
2686  for (Value *V : VL) {
2687  extendSchedulingRegion(V);
2688  ScheduleData *BundleMember = getScheduleData(V);
2689  assert(BundleMember &&
2690  "no ScheduleData for bundle member (maybe not in same basic block)");
2691  if (BundleMember->IsScheduled) {
2692  // A bundle member was scheduled as single instruction before and now
2693  // needs to be scheduled as part of the bundle. We just get rid of the
2694  // existing schedule.
2695  DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
2696  << " was already scheduled\n");
2697  ReSchedule = true;
2698  }
2699  assert(BundleMember->isSchedulingEntity() &&
2700  "bundle member already part of other bundle");
2701  if (PrevInBundle) {
2702  PrevInBundle->NextInBundle = BundleMember;
2703  } else {
2704  Bundle = BundleMember;
2705  }
2706  BundleMember->UnscheduledDepsInBundle = 0;
2707  Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
2708 
2709  // Group the instructions into a bundle.
2710  BundleMember->FirstInBundle = Bundle;
2711  PrevInBundle = BundleMember;
2712  }
2713  if (ScheduleEnd != OldScheduleEnd) {
2714  // The scheduling region got new instructions at the lower end (or it is a
2715  // new region for the first bundle). This makes it necessary to
2716  // recalculate all dependencies.
2717  // It is seldom that this needs to be done a second time after adding the
2718  // initial bundle to the region.
2719  for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2720  ScheduleData *SD = getScheduleData(I);
2721  SD->clearDependencies();
2722  }
2723  ReSchedule = true;
2724  }
2725  if (ReSchedule) {
2726  resetSchedule();
2727  initialFillReadyList(ReadyInsts);
2728  }
2729 
2730  DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
2731  << BB->getName() << "\n");
2732 
2733  calculateDependencies(Bundle, true, SLP);
2734 
2735  // Now try to schedule the new bundle. As soon as the bundle is "ready" it
2736  // means that there are no cyclic dependencies and we can schedule it.
2737  // Note that it's important that we don't "schedule" the bundle yet (see
2738  // cancelScheduling).
2739  while (!Bundle->isReady() && !ReadyInsts.empty()) {
2740 
2741  ScheduleData *pickedSD = ReadyInsts.back();
2742  ReadyInsts.pop_back();
2743 
2744  if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
2745  schedule(pickedSD, ReadyInsts);
2746  }
2747  }
2748  return Bundle->isReady();
2749 }
2750 
2751 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
2752  if (isa<PHINode>(VL[0]))
2753  return;
2754 
2755  ScheduleData *Bundle = getScheduleData(VL[0]);
2756  DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
2757  assert(!Bundle->IsScheduled &&
2758  "Can't cancel bundle which is already scheduled");
2759  assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
2760  "tried to unbundle something which is not a bundle");
2761 
2762  // Un-bundle: make single instructions out of the bundle.
2763  ScheduleData *BundleMember = Bundle;
2764  while (BundleMember) {
2765  assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
2766  BundleMember->FirstInBundle = BundleMember;
2767  ScheduleData *Next = BundleMember->NextInBundle;
2768  BundleMember->NextInBundle = nullptr;
2769  BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
2770  if (BundleMember->UnscheduledDepsInBundle == 0) {
2771  ReadyInsts.insert(BundleMember);
2772  }
2773  BundleMember = Next;
2774  }
2775 }
2776 
2777 void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
2778  if (getScheduleData(V))
2779  return;
2780  Instruction *I = dyn_cast<Instruction>(V);
2781  assert(I && "bundle member must be an instruction");
2782  assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
2783  if (!ScheduleStart) {
2784  // It's the first instruction in the new region.
2785  initScheduleData(I, I->getNextNode(), nullptr, nullptr);
2786  ScheduleStart = I;
2787  ScheduleEnd = I->getNextNode();
2788  assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
2789  DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
2790  return;
2791  }
2792  // Search up and down at the same time, because we don't know if the new
2793  // instruction is above or below the existing scheduling region.
2794  BasicBlock::reverse_iterator UpIter(ScheduleStart);
2795  BasicBlock::reverse_iterator UpperEnd = BB->rend();
2796  BasicBlock::iterator DownIter(ScheduleEnd);
2797  BasicBlock::iterator LowerEnd = BB->end();
2798  for (;;) {
2799  if (UpIter != UpperEnd) {
2800  if (&*UpIter == I) {
2801  initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
2802  ScheduleStart = I;
2803  DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
2804  return;
2805  }
2806  UpIter++;
2807  }
2808  if (DownIter != LowerEnd) {
2809  if (&*DownIter == I) {
2810  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
2811  nullptr);
2812  ScheduleEnd = I->getNextNode();
2813  assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
2814  DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
2815  return;
2816  }
2817  DownIter++;
2818  }
2819  assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
2820  "instruction not found in block");
2821  }
2822 }
2823 
2824 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
2825  Instruction *ToI,
2826  ScheduleData *PrevLoadStore,
2827  ScheduleData *NextLoadStore) {
2828  ScheduleData *CurrentLoadStore = PrevLoadStore;
2829  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
2830  ScheduleData *SD = ScheduleDataMap[I];
2831  if (!SD) {
2832  // Allocate a new ScheduleData for the instruction.
2833  if (ChunkPos >= ChunkSize) {
2834  ScheduleDataChunks.push_back(
2835  llvm::make_unique<ScheduleData[]>(ChunkSize));
2836  ChunkPos = 0;
2837  }
2838  SD = &(ScheduleDataChunks.back()[ChunkPos++]);
2839  ScheduleDataMap[I] = SD;
2840  SD->Inst = I;
2841  }
2842  assert(!isInSchedulingRegion(SD) &&
2843  "new ScheduleData already in scheduling region");
2844  SD->init(SchedulingRegionID);
2845 
2846  if (I->mayReadOrWriteMemory()) {
2847  // Update the linked list of memory accessing instructions.
2848  if (CurrentLoadStore) {
2849  CurrentLoadStore->NextLoadStore = SD;
2850  } else {
2851  FirstLoadStoreInRegion = SD;
2852  }
2853  CurrentLoadStore = SD;
2854  }
2855  }
2856  if (NextLoadStore) {
2857  if (CurrentLoadStore)
2858  CurrentLoadStore->NextLoadStore = NextLoadStore;
2859  } else {
2860  LastLoadStoreInRegion = CurrentLoadStore;
2861  }
2862 }
2863 
2864 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
2865  bool InsertInReadyList,
2866  BoUpSLP *SLP) {
2867  assert(SD->isSchedulingEntity());
2868 
2870  WorkList.push_back(SD);
2871 
2872  while (!WorkList.empty()) {
2873  ScheduleData *SD = WorkList.back();
2874  WorkList.pop_back();
2875 
2876  ScheduleData *BundleMember = SD;
2877  while (BundleMember) {
2878  assert(isInSchedulingRegion(BundleMember));
2879  if (!BundleMember->hasValidDependencies()) {
2880 
2881  DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
2882  BundleMember->Dependencies = 0;
2883  BundleMember->resetUnscheduledDeps();
2884 
2885  // Handle def-use chain dependencies.
2886  for (User *U : BundleMember->Inst->users()) {
2887  if (isa<Instruction>(U)) {
2888  ScheduleData *UseSD = getScheduleData(U);
2889  if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
2890  BundleMember->Dependencies++;
2891  ScheduleData *DestBundle = UseSD->FirstInBundle;
2892  if (!DestBundle->IsScheduled) {
2893  BundleMember->incrementUnscheduledDeps(1);
2894  }
2895  if (!DestBundle->hasValidDependencies()) {
2896  WorkList.push_back(DestBundle);
2897  }
2898  }
2899  } else {
2900  // I'm not sure if this can ever happen. But we need to be safe.
2901  // This lets the instruction/bundle never be scheduled and eventually
2902  // disables vectorization.
2903  BundleMember->Dependencies++;
2904  BundleMember->incrementUnscheduledDeps(1);
2905  }
2906  }
2907 
2908  // Handle the memory dependencies.
2909  ScheduleData *DepDest = BundleMember->NextLoadStore;
2910  if (DepDest) {
2911  Instruction *SrcInst = BundleMember->Inst;
2912  MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
2913  bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
2914  unsigned numAliased = 0;
2915  unsigned DistToSrc = 1;
2916 
2917  while (DepDest) {
2918  assert(isInSchedulingRegion(DepDest));
2919 
2920  // We have two limits to reduce the complexity:
2921  // 1) AliasedCheckLimit: It's a small limit to reduce calls to
2922  // SLP->isAliased (which is the expensive part in this loop).
2923  // 2) MaxMemDepDistance: It's for very large blocks and it aborts
2924  // the whole loop (even if the loop is fast, it's quadratic).
2925  // It's important for the loop break condition (see below) to
2926  // check this limit even between two read-only instructions.
2927  if (DistToSrc >= MaxMemDepDistance ||
2928  ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
2929  (numAliased >= AliasedCheckLimit ||
2930  SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
2931 
2932  // We increment the counter only if the locations are aliased
2933  // (instead of counting all alias checks). This gives a better
2934  // balance between reduced runtime and accurate dependencies.
2935  numAliased++;
2936 
2937  DepDest->MemoryDependencies.push_back(BundleMember);
2938  BundleMember->Dependencies++;
2939  ScheduleData *DestBundle = DepDest->FirstInBundle;
2940  if (!DestBundle->IsScheduled) {
2941  BundleMember->incrementUnscheduledDeps(1);
2942  }
2943  if (!DestBundle->hasValidDependencies()) {
2944  WorkList.push_back(DestBundle);
2945  }
2946  }
2947  DepDest = DepDest->NextLoadStore;
2948 
2949  // Example, explaining the loop break condition: Let's assume our
2950  // starting instruction is i0 and MaxMemDepDistance = 3.
2951  //
2952  // +--------v--v--v
2953  // i0,i1,i2,i3,i4,i5,i6,i7,i8
2954  // +--------^--^--^
2955  //
2956  // MaxMemDepDistance lets us stop alias-checking at i3 and we add
2957  // dependencies from i0 to i3,i4,.. (even if they are not aliased).
2958  // Previously we already added dependencies from i3 to i6,i7,i8
2959  // (because of MaxMemDepDistance). As we added a dependency from
2960  // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
2961  // and we can abort this loop at i6.
2962  if (DistToSrc >= 2 * MaxMemDepDistance)
2963  break;
2964  DistToSrc++;
2965  }
2966  }
2967  }
2968  BundleMember = BundleMember->NextInBundle;
2969  }
2970  if (InsertInReadyList && SD->isReady()) {
2971  ReadyInsts.push_back(SD);
2972  DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
2973  }
2974  }
2975 }
2976 
2977 void BoUpSLP::BlockScheduling::resetSchedule() {
2978  assert(ScheduleStart &&
2979  "tried to reset schedule on block which has not been scheduled");
2980  for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2981  ScheduleData *SD = getScheduleData(I);
2982  assert(isInSchedulingRegion(SD));
2983  SD->IsScheduled = false;
2984  SD->resetUnscheduledDeps();
2985  }
2986  ReadyInsts.clear();
2987 }
2988 
2989 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
2990 
2991  if (!BS->ScheduleStart)
2992  return;
2993 
2994  DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
2995 
2996  BS->resetSchedule();
2997 
2998  // For the real scheduling we use a more sophisticated ready-list: it is
2999  // sorted by the original instruction location. This lets the final schedule
3000  // be as close as possible to the original instruction order.
3001  struct ScheduleDataCompare {
3002  bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
3003  return SD2->SchedulingPriority < SD1->SchedulingPriority;
3004  }
3005  };
3006  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
3007 
3008  // Ensure that all dependency data is updated and fill the ready-list with
3009  // initial instructions.
3010  int Idx = 0;
3011  int NumToSchedule = 0;
3012  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
3013  I = I->getNextNode()) {
3014  ScheduleData *SD = BS->getScheduleData(I);
3015  assert(
3016  SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
3017  "scheduler and vectorizer have different opinion on what is a bundle");
3018  SD->FirstInBundle->SchedulingPriority = Idx++;
3019  if (SD->isSchedulingEntity()) {
3020  BS->calculateDependencies(SD, false, this);
3021  NumToSchedule++;
3022  }
3023  }
3024  BS->initialFillReadyList(ReadyInsts);
3025 
3026  Instruction *LastScheduledInst = BS->ScheduleEnd;
3027 
3028  // Do the "real" scheduling.
3029  while (!ReadyInsts.empty()) {
3030  ScheduleData *picked = *ReadyInsts.begin();
3031  ReadyInsts.erase(ReadyInsts.begin());
3032 
3033  // Move the scheduled instruction(s) to their dedicated places, if not
3034  // there yet.
3035  ScheduleData *BundleMember = picked;
3036  while (BundleMember) {
3037  Instruction *pickedInst = BundleMember->Inst;
3038  if (LastScheduledInst->getNextNode() != pickedInst) {
3039  BS->BB->getInstList().remove(pickedInst);
3040  BS->BB->getInstList().insert(LastScheduledInst, pickedInst);
3041  }
3042  LastScheduledInst = pickedInst;
3043  BundleMember = BundleMember->NextInBundle;
3044  }
3045 
3046  BS->schedule(picked, ReadyInsts);
3047  NumToSchedule--;
3048  }
3049  assert(NumToSchedule == 0 && "could not schedule all instructions");
3050 
3051  // Avoid duplicate scheduling of the block.
3052  BS->ScheduleStart = nullptr;
3053 }
3054 
3055 /// The SLPVectorizer Pass.
3056 struct SLPVectorizer : public FunctionPass {
3057  typedef SmallVector<StoreInst *, 8> StoreList;
3058  typedef MapVector<Value *, StoreList> StoreListMap;
3059 
3060  /// Pass identification, replacement for typeid
3061  static char ID;
3062 
3063  explicit SLPVectorizer() : FunctionPass(ID) {
3064  initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
3065  }
3066 
3067  ScalarEvolution *SE;
3068  TargetTransformInfo *TTI;
3069  TargetLibraryInfo *TLI;
3070  AliasAnalysis *AA;
3071  LoopInfo *LI;
3072  DominatorTree *DT;
3073  AssumptionCache *AC;
3074 
3075  bool runOnFunction(Function &F) override {
3076  if (skipOptnoneFunction(F))
3077  return false;
3078 
3079  SE = &getAnalysis<ScalarEvolution>();
3080  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
3081  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
3082  TLI = TLIP ? &TLIP->getTLI() : nullptr;
3083  AA = &getAnalysis<AliasAnalysis>();
3084  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
3085  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
3086  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
3087 
3088  StoreRefs.clear();
3089  bool Changed = false;
3090 
3091  // If the target claims to have no vector registers don't attempt
3092  // vectorization.
3093  if (!TTI->getNumberOfRegisters(true))
3094  return false;
3095 
3096  // Use the vector register size specified by the target unless overridden
3097  // by a command-line option.
3098  // TODO: It would be better to limit the vectorization factor based on
3099  // data type rather than just register size. For example, x86 AVX has
3100  // 256-bit registers, but it does not support integer operations
3101  // at that width (that requires AVX2).
3102  if (MaxVectorRegSizeOption.getNumOccurrences())
3103  MaxVecRegSize = MaxVectorRegSizeOption;
3104  else
3105  MaxVecRegSize = TTI->getRegisterBitWidth(true);
3106 
3107  // Don't vectorize when the attribute NoImplicitFloat is used.
3108  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
3109  return false;
3110 
3111  DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
3112 
3113  // Use the bottom up slp vectorizer to construct chains that start with
3114  // store instructions.
3115  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);
3116 
3117  // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
3118  // delete instructions.
3119 
3120  // Scan the blocks in the function in post order.
3121  for (auto BB : post_order(&F.getEntryBlock())) {
3122  // Vectorize trees that end at stores.
3123  if (unsigned count = collectStores(BB, R)) {
3124  (void)count;
3125  DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
3126  Changed |= vectorizeStoreChains(R);
3127  }
3128 
3129  // Vectorize trees that end at reductions.
3130  Changed |= vectorizeChainsInBlock(BB, R);
3131  }
3132 
3133  if (Changed) {
3134  R.optimizeGatherSequence();
3135  DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
3136  DEBUG(verifyFunction(F));
3137  }
3138  return Changed;
3139  }
3140 
3141  void getAnalysisUsage(AnalysisUsage &AU) const override {
3142  FunctionPass::getAnalysisUsage(AU);
3143  AU.addRequired<AssumptionCacheTracker>();
3144  AU.addRequired<ScalarEvolution>();
3145  AU.addRequired<AliasAnalysis>();
3146  AU.addRequired<TargetTransformInfoWrapperPass>();
3147  AU.addRequired<LoopInfoWrapperPass>();
3148  AU.addRequired<DominatorTreeWrapperPass>();
3149  AU.addPreserved<LoopInfoWrapperPass>();
3150  AU.addPreserved<DominatorTreeWrapperPass>();
3151  AU.setPreservesCFG();
3152  }
3153 
3154 private:
3155 
3156  /// \brief Collect memory references and sort them according to their base
3157  /// object. We group the stores by their base objects to reduce the cost of the
3158  /// quadratic search on the stores. TODO: We can further reduce this cost
3159  /// if we flush the chain creation every time we run into a memory barrier.
3160  unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
3161 
3162  /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
3163  bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
3164 
3165  /// \brief Try to vectorize a list of operands.
3166  /// \param BuildVector A list of users to ignore for the purpose of
3167  /// scheduling and that don't need extracting.
3168  /// \returns true if a value was vectorized.
3169  bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
3170  ArrayRef<Value *> BuildVector = None,
3171  bool allowReorder = false);
3172 
3173  /// \brief Try to vectorize a chain that may start at the operands of \p V.
3174  bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
3175 
3176  /// \brief Vectorize the stores that were collected in StoreRefs.
3177  bool vectorizeStoreChains(BoUpSLP &R);
3178 
3179  /// \brief Scan the basic block and look for patterns that are likely to start
3180  /// a vectorization chain.
3181  bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
3182 
3183  bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
3184  BoUpSLP &R, unsigned VecRegSize);
3185 
3186  bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
3187  BoUpSLP &R);
3188 private:
3189  StoreListMap StoreRefs;
3190  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3191 };
3192 
3193 /// \brief Check that the Values in the slice in VL array are still existent in
3194 /// the WeakVH array.
3195 /// Vectorization of part of the VL array may cause later values in the VL array
3196 /// to become invalid. We track when this has happened in the WeakVH array.
3197 static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
3198  unsigned SliceBegin, unsigned SliceSize) {
3199  VL = VL.slice(SliceBegin, SliceSize);
3200  VH = VH.slice(SliceBegin, SliceSize);
3201  return !std::equal(VL.begin(), VL.end(), VH.begin());
3202 }
3203 
3204 bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
3205  int CostThreshold, BoUpSLP &R,
3206  unsigned VecRegSize) {
3207  unsigned ChainLen = Chain.size();
3208  DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
3209  << "\n");
3210  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
3211  auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
3212  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
3213  unsigned VF = VecRegSize / Sz;
3214 
3215  if (!isPowerOf2_32(Sz) || VF < 2)
3216  return false;
3217 
3218  // Keep track of values that were deleted by vectorizing in the loop below.
3219  SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
3220 
3221  bool Changed = false;
3222  // Look for profitable vectorizable trees at all offsets, starting at zero.
3223  for (unsigned i = 0, e = ChainLen; i < e; ++i) {
3224  if (i + VF > e)
3225  break;
3226 
3227  // Check that a previous iteration of this loop did not delete the Value.
3228  if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
3229  continue;
3230 
3231  DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
3232  << "\n");
3233  ArrayRef<Value *> Operands = Chain.slice(i, VF);
3234 
3235  R.buildTree(Operands);
3236 
3237  int Cost = R.getTreeCost();
3238 
3239  DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
3240  if (Cost < CostThreshold) {
3241  DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
3242  R.vectorizeTree();
3243 
3244  // Move to the next bundle.
3245  i += VF - 1;
3246  Changed = true;
3247  }
3248  }
3249 
3250  return Changed;
3251 }
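To make the windowed scan concrete: with a 128-bit register and 32-bit stores the vectorization factor is 4, and after a profitable window the index jumps past the bundle that was just emitted. A self-contained sketch that replays that control flow (hypothetical numbers; the cost model is assumed to always accept):

#include <cstdio>

int main() {
  const unsigned RegBits = 128, EltBits = 32, ChainLen = 8;
  const unsigned VF = RegBits / EltBits; // 4 stores fit in one register
  for (unsigned i = 0; i < ChainLen; ++i) {
    if (i + VF > ChainLen)
      break;
    // Assume the cost model accepted this window (the real pass compares
    // the tree cost against the threshold first).
    std::printf("vectorize stores [%u, %u)\n", i, i + VF);
    i += VF - 1; // skip the bundle that was just vectorized
  }
  return 0; // prints "vectorize stores [0, 4)" and "vectorize stores [4, 8)"
}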
3252 
3253 bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
3254  int costThreshold, BoUpSLP &R) {
3255  SetVector<StoreInst *> Heads, Tails;
3256  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
3257 
3258  // We may run into multiple chains that merge into a single chain. We mark the
3259  // stores that we vectorized so that we don't visit the same store twice.
3260  BoUpSLP::ValueSet VectorizedStores;
3261  bool Changed = false;
3262 
3263  // Do a quadratic search on all of the given stores and find
3264  // all of the pairs of stores that follow each other.
3265  for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
3266  for (unsigned j = 0; j < e; ++j) {
3267  if (i == j)
3268  continue;
3269  const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
3270  if (R.isConsecutiveAccess(Stores[i], Stores[j], DL)) {
3271  Tails.insert(Stores[j]);
3272  Heads.insert(Stores[i]);
3273  ConsecutiveChain[Stores[i]] = Stores[j];
3274  }
3275  }
3276  }
3277 
3278  // For stores that start but don't end a link in the chain:
3279  for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
3280  it != e; ++it) {
3281  if (Tails.count(*it))
3282  continue;
3283 
3284  // We found a store instr that starts a chain. Now follow the chain and try
3285  // to vectorize it.
3286  BoUpSLP::ValueList Operands;
3287  StoreInst *I = *it;
3288  // Collect the chain into a list.
3289  while (Tails.count(I) || Heads.count(I)) {
3290  if (VectorizedStores.count(I))
3291  break;
3292  Operands.push_back(I);
3293  // Move to the next value in the chain.
3294  I = ConsecutiveChain[I];
3295  }
3296 
3297  // FIXME: Is division-by-2 the correct step? Should we assert that the
3298  // register size is a power-of-2?
3299  for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
3300  if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
3301  // Mark the vectorized stores so that we don't vectorize them again.
3302  VectorizedStores.insert(Operands.begin(), Operands.end());
3303  Changed = true;
3304  break;
3305  }
3306  }
3307  }
3308 
3309  return Changed;
3310 }
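The pairing loop above only records "store B immediately follows store A" edges; the chains then fall out by walking from every head that is not also a tail. A standalone toy version of that discovery, using byte offsets in place of real StoreInsts (all names and data here are hypothetical):

#include <cstdio>
#include <map>
#include <set>
#include <vector>

int main() {
  std::vector<int> Offsets = {0, 8, 4, 12, 100}; // i32 stores, program order
  std::map<int, int> Next;                       // A -> store right after A
  std::set<int> Heads, Tails;
  for (int A : Offsets)
    for (int B : Offsets)
      if (B == A + 4) { // stand-in for isConsecutiveAccess
        Next[A] = B;
        Heads.insert(A);
        Tails.insert(B);
      }
  for (int H : Heads) {
    if (Tails.count(H))
      continue; // H is in the middle of a chain, not its start
    std::printf("chain:");
    for (int I = H;; I = Next[I]) {
      std::printf(" %d", I);
      if (!Next.count(I))
        break;
    }
    std::printf("\n"); // prints "chain: 0 4 8 12"; offset 100 stays scalar
  }
  return 0;
}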
3311 
3312 
3313 unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
3314  unsigned count = 0;
3315  StoreRefs.clear();
3316  const DataLayout &DL = BB->getModule()->getDataLayout();
3317  for (Instruction &I : *BB) {
3318  StoreInst *SI = dyn_cast<StoreInst>(&I);
3319  if (!SI)
3320  continue;
3321 
3322  // Don't touch volatile stores.
3323  if (!SI->isSimple())
3324  continue;
3325 
3326  // Check that the pointer points to scalars.
3327  Type *Ty = SI->getValueOperand()->getType();
3328  if (!isValidElementType(Ty))
3329  continue;
3330 
3331  // Find the base pointer.
3332  Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
3333 
3334  // Save the store locations.
3335  StoreRefs[Ptr].push_back(SI);
3336  count++;
3337  }
3338  return count;
3339 }
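Bucketing stores by their underlying object is what keeps the later quadratic pairing affordable: only stores into the same base object are ever compared against each other. A tiny standalone illustration of the grouping (hypothetical data, with std::string standing in for the base pointer):

#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  // (base object, element index) pairs standing in for store pointer operands.
  std::vector<std::pair<std::string, int>> Stores = {
      {"A", 0}, {"B", 0}, {"A", 1}, {"A", 2}, {"B", 1}};
  std::map<std::string, std::vector<int>> StoreRefs; // base -> its stores
  for (const auto &S : Stores)
    StoreRefs[S.first].push_back(S.second);
  for (const auto &Group : StoreRefs)
    std::printf("base %s: %zu store(s)\n", Group.first.c_str(),
                Group.second.size()); // base A: 3 store(s), base B: 2 store(s)
  return 0;
}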
3340 
3341 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
3342  if (!A || !B)
3343  return false;
3344  Value *VL[] = { A, B };
3345  return tryToVectorizeList(VL, R, None, true);
3346 }
3347 
3348 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
3349  ArrayRef<Value *> BuildVector,
3350  bool allowReorder) {
3351  if (VL.size() < 2)
3352  return false;
3353 
3354  DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");
3355 
3356  // Check that all of the parts are scalar instructions of the same type.
3357  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
3358  if (!I0)
3359  return false;
3360 
3361  unsigned Opcode0 = I0->getOpcode();
3362  const DataLayout &DL = I0->getModule()->getDataLayout();
3363 
3364  Type *Ty0 = I0->getType();
3365  unsigned Sz = DL.getTypeSizeInBits(Ty0);
3366  // FIXME: Register size should be a parameter to this function, so we can
3367  // try different vectorization factors.
3368  unsigned VF = MinVecRegSize / Sz;
3369 
3370  for (Value *V : VL) {
3371  Type *Ty = V->getType();
3372  if (!isValidElementType(Ty))
3373  return false;
3374  Instruction *Inst = dyn_cast<Instruction>(V);
3375  if (!Inst || Inst->getOpcode() != Opcode0)
3376  return false;
3377  }
3378 
3379  bool Changed = false;
3380 
3381  // Keep track of values that were deleted by vectorizing in the loop below.
3382  SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
3383 
3384  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
3385  unsigned OpsWidth = 0;
3386 
3387  if (i + VF > e)
3388  OpsWidth = e - i;
3389  else
3390  OpsWidth = VF;
3391 
3392  if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
3393  break;
3394 
3395  // Check that a previous iteration of this loop did not delete the Value.
3396  if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
3397  continue;
3398 
3399  DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
3400  << "\n");
3401  ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
3402 
3403  ArrayRef<Value *> BuildVectorSlice;
3404  if (!BuildVector.empty())
3405  BuildVectorSlice = BuildVector.slice(i, OpsWidth);
3406 
3407  R.buildTree(Ops, BuildVectorSlice);
3408  // TODO: check if we can also allow reordering for cases other than
3409  // tryToVectorizePair().
3410  if (allowReorder && R.shouldReorder()) {
3411  assert(Ops.size() == 2);
3412  assert(BuildVectorSlice.empty());
3413  Value *ReorderedOps[] = { Ops[1], Ops[0] };
3414  R.buildTree(ReorderedOps, None);
3415  }
3416  int Cost = R.getTreeCost();
3417 
3418  if (Cost < -SLPCostThreshold) {
3419  DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
3420  Value *VectorizedRoot = R.vectorizeTree();
3421 
3422  // Reconstruct the build vector by extracting the vectorized root. This
3423  // way we handle the case where some elements of the vector are undefined.
3424  // (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2))
3425  if (!BuildVectorSlice.empty()) {
3426  // The insert point is the last build vector instruction. The vectorized
3427  // root will precede it. This guarantees that we get an instruction. The
3428  // vectorized tree could have been constant folded.
3429  Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
3430  unsigned VecIdx = 0;
3431  for (auto &V : BuildVectorSlice) {
3432  IRBuilder<true, NoFolder> Builder(
3433  ++BasicBlock::iterator(InsertAfter));
3434  InsertElementInst *IE = cast<InsertElementInst>(V);
3435  Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
3436  VectorizedRoot, Builder.getInt32(VecIdx++)));
3437  IE->setOperand(1, Extract);
3438  IE->removeFromParent();
3439  IE->insertAfter(Extract);
3440  InsertAfter = IE;
3441  }
3442  }
3443  // Move to the next bundle.
3444  i += VF - 1;
3445  Changed = true;
3446  }
3447  }
3448 
3449  return Changed;
3450 }
3451 
3452 bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
3453  if (!V)
3454  return false;
3455 
3456  // Try to vectorize V.
3457  if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
3458  return true;
3459 
3460  BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
3461  BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
3462  // Try to skip B.
3463  if (B && B->hasOneUse()) {
3464  BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
3465  BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
3466  if (tryToVectorizePair(A, B0, R)) {
3467  return true;
3468  }
3469  if (tryToVectorizePair(A, B1, R)) {
3470  return true;
3471  }
3472  }
3473 
3474  // Try to skip A.
3475  if (A && A->hasOneUse()) {
3476  BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
3477  BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
3478  if (tryToVectorizePair(A0, B, R)) {
3479  return true;
3480  }
3481  if (tryToVectorizePair(A1, B, R)) {
3482  return true;
3483  }
3484  }
3485  return false;
3486 }
3487 
3488 /// \brief Generate a shuffle mask to be used in a reduction tree.
3489 ///
3490 /// \param VecLen The length of the vector to be reduced.
3491 /// \param NumEltsToRdx The number of elements that should be reduced in the
3492 /// vector.
3493 /// \param IsPairwise Whether the reduction is a pairwise or splitting
3494 /// reduction. A pairwise reduction will generate a mask of
3495 /// <0,2,...> or <1,3,..> while a splitting reduction will generate
3496 /// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
3497 /// \param IsLeft True will generate a mask of even elements, odd otherwise.
3498 static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
3499  bool IsPairwise, bool IsLeft,
3500  IRBuilder<> &Builder) {
3501  assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
3502 
3503  SmallVector<Constant *, 32> ShuffleMask(
3504  VecLen, UndefValue::get(Builder.getInt32Ty()));
3505 
3506  if (IsPairwise)
3507  // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
3508  for (unsigned i = 0; i != NumEltsToRdx; ++i)
3509  ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
3510  else
3511  // Move the upper half of the vector to the lower half.
3512  for (unsigned i = 0; i != NumEltsToRdx; ++i)
3513  ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
3514 
3515  return ConstantVector::get(ShuffleMask);
3516 }
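For a concrete picture of the two mask shapes, here is a plain-C++ stand-in evaluated for VecLen = 4 and NumEltsToRdx = 2 (the real function returns a ConstantVector of i32 values; -1 marks an undef lane here, and the names are hypothetical):

#include <cstdio>
#include <vector>

static std::vector<int> rdxMask(unsigned VecLen, unsigned NumEltsToRdx,
                                bool IsPairwise, bool IsLeft) {
  std::vector<int> Mask(VecLen, -1); // -1 stands in for an undef lane
  for (unsigned i = 0; i != NumEltsToRdx; ++i)
    Mask[i] = IsPairwise ? int(2 * i + !IsLeft) : int(NumEltsToRdx + i);
  return Mask;
}

int main() {
  auto PL = rdxMask(4, 2, true, true);   // pairwise, even lanes:  0  2 -1 -1
  auto PR = rdxMask(4, 2, true, false);  // pairwise, odd lanes:   1  3 -1 -1
  auto SP = rdxMask(4, 2, false, false); // splitting, upper half: 2  3 -1 -1
  for (const auto *M : {&PL, &PR, &SP}) {
    for (int V : *M)
      std::printf("%2d ", V);
    std::printf("\n");
  }
  return 0;
}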
3517 
3518 
3519 /// Model horizontal reductions.
3520 ///
3521 /// A horizontal reduction is a tree of reduction operations (currently add and
3522 /// fadd) that has operations that can be put into a vector as its leaves.
3523 /// For example, this tree:
3524 ///
3525 /// mul mul mul mul
3526 ///  \  /    \  /
3527 ///   +       +
3528 ///    \     /
3529 ///       +
3530 /// This tree has "mul" as its reduced values and "+" as its reduction
3531 /// operations. A reduction might be feeding into a store or a binary operation
3532 /// feeding a phi.
3533 ///    ...
3534 ///    \  /
3535 ///     +
3536 ///     |
3537 ///  phi +=
3538 ///
3539 ///  Or:
3540 ///    ...
3541 ///    \  /
3542 ///     +
3543 ///     |
3544 ///   *p =
3545 ///
3546 class HorizontalReduction {
3547  SmallVector<Value *, 16> ReductionOps;
3548  SmallVector<Value *, 32> ReducedVals;
3549 
3550  BinaryOperator *ReductionRoot;
3551  PHINode *ReductionPHI;
3552 
3553  /// The opcode of the reduction.
3554  unsigned ReductionOpcode;
3555  /// The opcode of the values we perform a reduction on.
3556  unsigned ReducedValueOpcode;
3557  /// The width of one full horizontal reduction operation.
3558  unsigned ReduxWidth;
3559  /// Should we model this reduction as a pairwise reduction tree or a tree that
3560  /// splits the vector in halves and adds those halves.
3561  bool IsPairwiseReduction;
3562 
3563 public:
3564  HorizontalReduction()
3565  : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
3566  ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
3567 
3568  /// \brief Try to find a reduction tree.
3569  bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
3570  assert((!Phi ||
3571  std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
3572  "The phi needs to use the binary operator");
3573 
3574  // We could have an initial reduction that is not an add.
3575  // r *= v1 + v2 + v3 + v4
3576  // In such a case start looking for a tree rooted in the first '+'.
3577  if (Phi) {
3578  if (B->getOperand(0) == Phi) {
3579  Phi = nullptr;
3580  B = dyn_cast<BinaryOperator>(B->getOperand(1));
3581  } else if (B->getOperand(1) == Phi) {
3582  Phi = nullptr;
3583  B = dyn_cast<BinaryOperator>(B->getOperand(0));
3584  }
3585  }
3586 
3587  if (!B)
3588  return false;
3589 
3590  Type *Ty = B->getType();
3591  if (!isValidElementType(Ty))
3592  return false;
3593 
3594  const DataLayout &DL = B->getModule()->getDataLayout();
3595  ReductionOpcode = B->getOpcode();
3596  ReducedValueOpcode = 0;
3597  // FIXME: Register size should be a parameter to this function, so we can
3598  // try different vectorization factors.
3599  ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
3600  ReductionRoot = B;
3601  ReductionPHI = Phi;
3602 
3603  if (ReduxWidth < 4)
3604  return false;
3605 
3606  // We currently only support adds.
3607  if (ReductionOpcode != Instruction::Add &&
3608  ReductionOpcode != Instruction::FAdd)
3609  return false;
3610 
3611  // Post order traverse the reduction tree starting at B. We only handle true
3612  // trees containing only binary operators.
3613  SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
3614  Stack.push_back(std::make_pair(B, 0));
3615  while (!Stack.empty()) {
3616  BinaryOperator *TreeN = Stack.back().first;
3617  unsigned EdgeToVist = Stack.back().second++;
3618  bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
3619 
3620  // Only handle trees in the current basic block.
3621  if (TreeN->getParent() != B->getParent())
3622  return false;
3623 
3624  // Each tree node needs to have one user except for the ultimate
3625  // reduction.
3626  if (!TreeN->hasOneUse() && TreeN != B)
3627  return false;
3628 
3629  // Postorder visit.
3630  if (EdgeToVist == 2 || IsReducedValue) {
3631  if (IsReducedValue) {
3632  // Make sure that the opcodes of the operations that we are going to
3633  // reduce match.
3634  if (!ReducedValueOpcode)
3635  ReducedValueOpcode = TreeN->getOpcode();
3636  else if (ReducedValueOpcode != TreeN->getOpcode())
3637  return false;
3638  ReducedVals.push_back(TreeN);
3639  } else {
3640  // We need to be able to reassociate the adds.
3641  if (!TreeN->isAssociative())
3642  return false;
3643  ReductionOps.push_back(TreeN);
3644  }
3645  // Retract.
3646  Stack.pop_back();
3647  continue;
3648  }
3649 
3650  // Visit left or right.
3651  Value *NextV = TreeN->getOperand(EdgeToVist);
3652  BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
3653  if (Next)
3654  Stack.push_back(std::make_pair(Next, 0));
3655  else if (NextV != Phi)
3656  return false;
3657  }
3658  return true;
3659  }
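The traversal above visits each binary operator twice, once per operand edge, by keeping an explicit stack of (node, next-edge-index) pairs; a node is retired in post order once its counter reaches 2 or it turns out to be a leaf. A self-contained sketch of the same stack discipline on a toy expression tree (hypothetical Node type, no LLVM involved):

#include <cstdio>
#include <utility>
#include <vector>

struct Node {
  char Op;     // '+' for an interior node, a letter for a leaf value
  Node *L, *R; // null for leaves
};

int main() {
  Node a{'a', nullptr, nullptr}, b{'b', nullptr, nullptr},
       c{'c', nullptr, nullptr}, d{'d', nullptr, nullptr};
  Node p{'+', &a, &b}, q{'+', &c, &d}, root{'+', &p, &q};

  std::vector<std::pair<Node *, unsigned>> Stack{{&root, 0}};
  while (!Stack.empty()) {
    Node *N = Stack.back().first;
    unsigned Edge = Stack.back().second++;
    bool IsLeaf = N->L == nullptr;
    if (Edge == 2 || IsLeaf) {   // post-order point: both edges were visited
      std::printf("%c ", N->Op); // leaves play the role of reduced values
      Stack.pop_back();
      continue;
    }
    Stack.push_back({Edge == 0 ? N->L : N->R, 0});
  }
  std::printf("\n"); // prints: a b + c d + +
  return 0;
}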
3660 
3661  /// \brief Attempt to vectorize the tree found by
3662  /// matchAssociativeReduction.
3663  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
3664  if (ReducedVals.empty())
3665  return false;
3666 
3667  unsigned NumReducedVals = ReducedVals.size();
3668  if (NumReducedVals < ReduxWidth)
3669  return false;
3670 
3671  Value *VectorizedTree = nullptr;
3672  IRBuilder<> Builder(ReductionRoot);
3673  FastMathFlags Unsafe;
3674  Unsafe.setUnsafeAlgebra();
3675  Builder.SetFastMathFlags(Unsafe);
3676  unsigned i = 0;
3677 
3678  for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
3679  V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
3680 
3681  // Estimate cost.
3682  int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
3683  if (Cost >= -SLPCostThreshold)
3684  break;
3685 
3686  DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
3687  << ". (HorRdx)\n");
3688 
3689  // Vectorize a tree.
3690  DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
3691  Value *VectorizedRoot = V.vectorizeTree();
3692 
3693  // Emit a reduction.
3694  Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
3695  if (VectorizedTree) {
3696  Builder.SetCurrentDebugLocation(Loc);
3697  VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
3698  ReducedSubTree, "bin.rdx");
3699  } else
3700  VectorizedTree = ReducedSubTree;
3701  }
3702 
3703  if (VectorizedTree) {
3704  // Finish the reduction.
3705  for (; i < NumReducedVals; ++i) {
3706  Builder.SetCurrentDebugLocation(
3707  cast<Instruction>(ReducedVals[i])->getDebugLoc());
3708  VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
3709  ReducedVals[i]);
3710  }
3711  // Update users.
3712  if (ReductionPHI) {
3713  assert(ReductionRoot && "Need a reduction operation");
3714  ReductionRoot->setOperand(0, VectorizedTree);
3715  ReductionRoot->setOperand(1, ReductionPHI);
3716  } else
3717  ReductionRoot->replaceAllUsesWith(VectorizedTree);
3718  }
3719  return VectorizedTree != nullptr;
3720  }
3721 
3722 private:
3723 
3724  /// \brief Calculate the cost of a reduction.
3725  int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
3726  Type *ScalarTy = FirstReducedVal->getType();
3727  Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
3728 
3729  int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
3730  int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
3731 
3732  IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
3733  int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
3734 
3735  int ScalarReduxCost =
3736  ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
3737 
3738  DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
3739  << " for reduction that starts with " << *FirstReducedVal
3740  << " (It is a "
3741  << (IsPairwiseReduction ? "pairwise" : "splitting")
3742  << " reduction)\n");
3743 
3744  return VecReduxCost - ScalarReduxCost;
3745  }
3746 
3747  static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
3748  Value *R, const Twine &Name = "") {
3749  if (Opcode == Instruction::FAdd)
3750  return Builder.CreateFAdd(L, R, Name);
3751  return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
3752  }
3753 
3754  /// \brief Emit a horizontal reduction of the vectorized value.
3755  Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
3756  assert(VectorizedValue && "Need to have a vectorized tree node");
3757  assert(isPowerOf2_32(ReduxWidth) &&
3758  "We only handle power-of-two reductions for now");
3759 
3760  Value *TmpVec = VectorizedValue;
3761  for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
3762  if (IsPairwiseReduction) {
3763  Value *LeftMask =
3764  createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
3765  Value *RightMask =
3766  createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
3767 
3768  Value *LeftShuf = Builder.CreateShuffleVector(
3769  TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
3770  Value *RightShuf = Builder.CreateShuffleVector(
3771  TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
3772  "rdx.shuf.r");
3773  TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
3774  "bin.rdx");
3775  } else {
3776  Value *UpperHalf =
3777  createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
3778  Value *Shuf = Builder.CreateShuffleVector(
3779  TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
3780  TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
3781  }
3782  }
3783 
3784  // The result is in the first element of the vector.
3785  return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
3786  }
3787 };
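The shuffle/binop loop in emitReduction above halves the active width on every step, so an 8-wide vector is reduced in log2(8) = 3 rounds. The same dataflow on a plain array, as a standalone sketch of the splitting (non-pairwise) variant with made-up values:

#include <cstdio>

int main() {
  float V[8] = {1, 2, 3, 4, 5, 6, 7, 8}; // stand-in for the vectorized lanes
  // Splitting reduction: each round adds the upper half onto the lower half,
  // mirroring the shuffle + add sequence emitted above.
  for (unsigned Width = 8 / 2; Width != 0; Width /= 2)
    for (unsigned i = 0; i != Width; ++i)
      V[i] += V[i + Width];
  std::printf("reduced = %g\n", V[0]); // 36, the sum of all eight lanes
  return 0;
}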
3788 
3789 /// \brief Recognize construction of vectors like
3790 /// %ra = insertelement <4 x float> undef, float %s0, i32 0
3791 /// %rb = insertelement <4 x float> %ra, float %s1, i32 1
3792 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2
3793 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3
3794 ///
3795 /// Returns true if it matches
3796 ///
3797 static bool findBuildVector(InsertElementInst *FirstInsertElem,
3798  SmallVectorImpl<Value *> &BuildVector,
3799  SmallVectorImpl<Value *> &BuildVectorOpds) {
3800  if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
3801  return false;
3802 
3803  InsertElementInst *IE = FirstInsertElem;
3804  while (true) {
3805  BuildVector.push_back(IE);
3806  BuildVectorOpds.push_back(IE->getOperand(1));
3807 
3808  if (IE->use_empty())
3809  return false;
3810 
3811  InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
3812  if (!NextUse)
3813  return true;
3814 
3815  // If this isn't the final use, make sure the next insertelement is the only
3816  // use. It's OK if the final constructed vector is used multiple times
3817  if (!IE->hasOneUse())
3818  return false;
3819 
3820  IE = NextUse;
3821  }
3822 
3823  return false;
3824 }
3825 
3826 static bool PhiTypeSorterFunc(Value *V, Value *V2) {
3827  return V->getType() < V2->getType();
3828 }
3829 
3830 bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
3831  bool Changed = false;
3832  SmallVector<Value *, 4> Incoming;
3833  SmallSet<Value *, 16> VisitedInstrs;
3834 
3835  bool HaveVectorizedPhiNodes = true;
3836  while (HaveVectorizedPhiNodes) {
3837  HaveVectorizedPhiNodes = false;
3838 
3839  // Collect the incoming values from the PHIs.
3840  Incoming.clear();
3841  for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
3842  ++instr) {
3843  PHINode *P = dyn_cast<PHINode>(instr);
3844  if (!P)
3845  break;
3846 
3847  if (!VisitedInstrs.count(P))
3848  Incoming.push_back(P);
3849  }
3850 
3851  // Sort by type.
3852  std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
3853 
3854  // Try to vectorize elements based on their type.
3855  for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
3856  E = Incoming.end();
3857  IncIt != E;) {
3858 
3859  // Look for the next elements with the same type.
3860  SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
3861  while (SameTypeIt != E &&
3862  (*SameTypeIt)->getType() == (*IncIt)->getType()) {
3863  VisitedInstrs.insert(*SameTypeIt);
3864  ++SameTypeIt;
3865  }
3866 
3867  // Try to vectorize them.
3868  unsigned NumElts = (SameTypeIt - IncIt);
3869  DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
3870  if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
3871  // Success. Start over because instructions might have been changed.
3872  HaveVectorizedPhiNodes = true;
3873  Changed = true;
3874  break;
3875  }
3876 
3877  // Start over at the next instruction of a different type (or the end).
3878  IncIt = SameTypeIt;
3879  }
3880  }
3881 
3882  VisitedInstrs.clear();
3883 
3884  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
3885  // We may go through BB multiple times so skip the ones we have already checked.
3886  if (!VisitedInstrs.insert(it).second)
3887  continue;
3888 
3889  if (isa<DbgInfoIntrinsic>(it))
3890  continue;
3891 
3892  // Try to vectorize reductions that use PHINodes.
3893  if (PHINode *P = dyn_cast<PHINode>(it)) {
3894  // Check that the PHI is a reduction PHI.
3895  if (P->getNumIncomingValues() != 2)
3896  return Changed;
3897  Value *Rdx =
3898  (P->getIncomingBlock(0) == BB
3899  ? (P->getIncomingValue(0))
3900  : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1)
3901  : nullptr));
3902  // Check if this is a Binary Operator.
3903  BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
3904  if (!BI)
3905  continue;
3906 
3907  // Try to match and vectorize a horizontal reduction.
3908  HorizontalReduction HorRdx;
3909  if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) &&
3910  HorRdx.tryToReduce(R, TTI)) {
3911  Changed = true;
3912  it = BB->begin();
3913  e = BB->end();
3914  continue;
3915  }
3916 
3917  Value *Inst = BI->getOperand(0);
3918  if (Inst == P)
3919  Inst = BI->getOperand(1);
3920 
3921  if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
3922  // We would like to start over since some instructions are deleted
3923  // and the iterator may become invalid.
3924  Changed = true;
3925  it = BB->begin();
3926  e = BB->end();
3927  continue;
3928  }
3929 
3930  continue;
3931  }
3932 
3933  // Try to vectorize horizontal reductions feeding into a store.
3934  if (ShouldStartVectorizeHorAtStore)
3935  if (StoreInst *SI = dyn_cast<StoreInst>(it))
3936  if (BinaryOperator *BinOp =
3937  dyn_cast<BinaryOperator>(SI->getValueOperand())) {
3938  HorizontalReduction HorRdx;
3939  if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) &&
3940  HorRdx.tryToReduce(R, TTI)) ||
3941  tryToVectorize(BinOp, R))) {
3942  Changed = true;
3943  it = BB->begin();
3944  e = BB->end();
3945  continue;
3946  }
3947  }
3948 
3949  // Try to vectorize horizontal reductions feeding into a return.
3950  if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
3951  if (RI->getNumOperands() != 0)
3952  if (BinaryOperator *BinOp =
3953  dyn_cast<BinaryOperator>(RI->getOperand(0))) {
3954  DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
3955  if (tryToVectorizePair(BinOp->getOperand(0),
3956  BinOp->getOperand(1), R)) {
3957  Changed = true;
3958  it = BB->begin();
3959  e = BB->end();
3960  continue;
3961  }
3962  }
3963 
3964  // Try to vectorize trees that start at compare instructions.
3965  if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
3966  if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
3967  Changed = true;
3968  // We would like to start over since some instructions are deleted
3969  // and the iterator may become invalid.
3970  it = BB->begin();
3971  e = BB->end();
3972  continue;
3973  }
3974 
3975  for (int i = 0; i < 2; ++i) {
3976  if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
3977  if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
3978  Changed = true;
3979  // We would like to start over since some instructions are deleted
3980  // and the iterator may become invalid.
3981  it = BB->begin();
3982  e = BB->end();
3983  break;
3984  }
3985  }
3986  }
3987  continue;
3988  }
3989 
3990  // Try to vectorize trees that start at insertelement instructions.
3991  if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
3992  SmallVector<Value *, 16> BuildVector;
3993  SmallVector<Value *, 16> BuildVectorOpds;
3994  if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
3995  continue;
3996 
3997  // Vectorize starting with the build vector operands ignoring the
3998  // BuildVector instructions for the purpose of scheduling and user
3999  // extraction.
4000  if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
4001  Changed = true;
4002  it = BB->begin();
4003  e = BB->end();
4004  }
4005 
4006  continue;
4007  }
4008  }
4009 
4010  return Changed;
4011 }
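The PHI pre-pass at the top of vectorizeChainsInBlock stable-sorts the incoming values by type and then hands each maximal run of equal-typed values to tryToVectorizeList. A standalone sketch of that run-splitting idiom (hypothetical data, with type names as strings):

#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> Incoming = {"i32", "float", "i32", "float", "i32"};
  std::stable_sort(Incoming.begin(), Incoming.end());
  for (auto It = Incoming.begin(), E = Incoming.end(); It != E;) {
    auto SameTypeIt = It;
    while (SameTypeIt != E && *SameTypeIt == *It)
      ++SameTypeIt; // extend the run while the type matches
    std::printf("run of %zu x %s\n", size_t(SameTypeIt - It), It->c_str());
    It = SameTypeIt; // continue with the next type
  }
  return 0; // prints "run of 2 x float" and "run of 3 x i32"
}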
4012 
4013 bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
4014  bool Changed = false;
4015  // Attempt to sort and vectorize each of the store-groups.
4016  for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
4017  it != e; ++it) {
4018  if (it->second.size() < 2)
4019  continue;
4020 
4021  DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
4022  << it->second.size() << ".\n");
4023 
4024  // Process the stores in chunks of 16.
4025  // TODO: The limit of 16 inhibits greater vectorization factors.
4026  // For example, AVX2 supports v32i8. Increasing this limit, however,
4027  // may cause a significant compile-time increase.
4028  for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
4029  unsigned Len = std::min<unsigned>(CE - CI, 16);
4030  Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
4031  -SLPCostThreshold, R);
4032  }
4033  }
4034  return Changed;
4035 }
4036 
4037 } // end anonymous namespace
4038 
4039 char SLPVectorizer::ID = 0;
4040 static const char lv_name[] = "SLP Vectorizer";
4041 INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
4042 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
4043 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
4044 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
4045 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
4046 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
4047 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
4048 
4049 namespace llvm {
4050 Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
4051 }